gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Updated input locations. Moved pairs in builder.
author thadh <thadh@thadh-macbookpro>
Thu, 4 Oct 2012 15:09:10 +0000 (08:09 -0700)
committer thadh <thadh@thadh-macbookpro>
Thu, 4 Oct 2012 15:09:10 +0000 (08:09 -0700)
data/downloadInputs.sh
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
todo.txt

index c14d4604b57591c4675fbffe980285bea90a496f..5e7990fbe39bb68ad4d01023e045b5b78c90c4c9 100755 (executable)
@@ -8,8 +8,8 @@ echo "Note that unzipping is slow."
 
 L=en
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120714-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120714/${WIKI}.bz2
+WIKI=${L}wiktionary-20120930-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120930/${WIKI}.bz2
 bunzip2 ${WIKI}.bz2
 mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
 
@@ -21,29 +21,29 @@ mv ${CHEMNITZ} inputs/
 
 L=fr
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120719-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120719/${WIKI}.bz2
+WIKI=${L}wiktionary-20120926-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120926/${WIKI}.bz2
 bunzip2 ${WIKI}.bz2
 mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
 
 L=it
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120720-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120720/${WIKI}.bz2
+WIKI=${L}wiktionary-20120926-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120926/${WIKI}.bz2
 bunzip2 ${WIKI}.bz2
 mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
 
 L=de
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120714-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120714/${WIKI}.bz2
+WIKI=${L}wiktionary-20120928-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120928/${WIKI}.bz2
 bunzip2 ${WIKI}.bz2
 mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
 
 L=es
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120718-pages-articles.xml
-curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120718/${WIKI}.bz2
+WIKI=${L}wiktionary-20120924-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120924/${WIKI}.bz2
 bunzip2 ${WIKI}.bz2
 mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
 
index 5fc5b3c41fe580dd93ac3d8fb17a4fd4d3b51b70..2677bb422ac531ad02eb6468dc8fa909aca0cd3e 100644 (file)
@@ -34,6 +34,107 @@ public class DictionaryBuilderMain extends TestCase {
   static final String INPUTS = "data/inputs/";
   static final String STOPLISTS = "data/inputs/stoplists/";
   static final String OUTPUTS = "data/outputs/";  
+  
+  // Build the non EN ones.
+  static final String[][] nonEnPairs = new String[][] {
+      {"EN"},
+      {"DE"},
+      {"IT"},
+      {"FR"},
+          
+      // The 3 I use most:
+      {"IT", "EN" },
+      {"DE", "EN" },
+      {"DE", "IT" },
+          
+          
+      {"AR", "DE" },
+      {"AR", "ES" },
+      {"AR", "FR" },
+      {"AR", "HE" },
+      {"AR", "IT" },
+      {"AR", "JA" },
+      {"AR", "RU" },
+      {"AR", "TR" },  // Turkish
+      {"AR", "ZH" },
+      
+      {"DE", "AR" },
+      {"DE", "FR" },
+      {"DE", "CA" },  // Catalan
+      {"DE", "CS" },  // Czech
+      {"DE", "EO" },  // Esperanto
+      {"DE", "ES" },
+      {"DE", "FR" },
+      {"DE", "HE" },
+      {"DE", "HU" },  // Hungarian
+      {"DE", "IT" },
+      {"DE", "JA" },
+      {"DE", "LA" },  // Latin
+      {"DE", "NL" },  // Dutch
+      {"DE", "PL" },  // Polish
+      {"DE", "RU" },
+      {"DE", "SV" },  // Swedish
+      {"DE", "TR" },  // Turkish
+      {"DE", "ZH" },
+      {"DE", "TA" },  // Tamil
+      
+      {"ES", "RU" },  // Spanish-Russian
+      
+      {"FR", "BG" },  // Bulgarian
+      {"FR", "CS" },  // Czech
+      {"FR", "DE" },
+      {"FR", "ES" },
+      {"FR", "IT" },
+      {"FR", "JA" },
+      {"FR", "LA" },
+      {"FR", "NL" },  // Dutch
+      {"FR", "RU" },
+      {"FR", "TR" },  // Turkish
+      {"FR", "ZH" },
+
+      {"IT", "DE" },
+      {"IT", "EL" },  // Greek
+      {"IT", "ES" },
+      {"IT", "FR" },
+      {"IT", "HU" },
+      {"IT", "JA" },
+      {"IT", "LA" },  // Latin
+      {"IT", "LV" },  // Latvian
+      {"IT", "NL" },
+      {"IT", "PL" },
+      {"IT", "RU" },
+      {"IT", "SV" },
+      {"IT", "TR" },  // Turkish
+      {"IT", "ZH" },
+
+      {"JA", "ZH" },
+      {"JA", "AR" },
+      {"JA", "KO" },
+
+      {"ZH", "AR" },
+      {"ZH", "DE" },
+      {"ZH", "ES" },
+      {"ZH", "FR" },
+      {"ZH", "IT" },
+      {"ZH", "KO" },
+
+      
+      {"NO", "SV" },
+      {"NO", "FI" },
+      {"FI", "SV" },
+      
+      {"PL", "FR" },  // Polish
+      {"PL", "RU" },  // Polish
+      {"PL", "HU" },  // Polish
+      {"PL", "ES" },  // Polish
+      
+      {"TR", "EL" },  // Turkish, Greek
+
+      {"FA", "HY" },  // Persian, Armenian, by request.
+      {"FA", "SV" },  // Persian, Swedish, by request.
+
+  };
+
 
   
   static final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
@@ -97,7 +198,7 @@ public class DictionaryBuilderMain extends TestCase {
             result.add(String.format("--input%dWiktionaryLang=%s", i, lang1));
             result.add(String.format("--input%dSkipLang=%s", i, lang1));
             result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, lang1.toLowerCase()));
-            result.add(String.format("--input%dPageLimit=100", i));
+            //result.add(String.format("--input%dPageLimit=100", i));
             ++i;
         } else {
             System.err.println("Can't read file: " + wikiSplitFile);
@@ -111,7 +212,7 @@ public class DictionaryBuilderMain extends TestCase {
             result.add(String.format("--input%dLangPattern=%s", i, "English"));
             result.add(String.format("--input%dLangCodePattern=%s", i, lang1.toLowerCase()));
             result.add(String.format("--input%dEnIndex=%d", i, 1));
-            result.add(String.format("--input%dPageLimit=100", i));
+            //result.add(String.format("--input%dPageLimit=100", i));
             ++i;
         }
         
@@ -210,105 +311,6 @@ public class DictionaryBuilderMain extends TestCase {
     
     final List<String[]> allPairs = new ArrayList<String[]>();
     
-    // Build the non EN ones.
-    final String[][] nonEnPairs = new String[][] {
-        {"EN"},
-        {"DE"},
-        {"IT"},
-        {"FR"},
-            
-        // The 3 I use most:
-        {"IT", "EN" },
-        {"DE", "EN" },
-        {"DE", "IT" },
-            
-            
-        {"AR", "DE" },
-        {"AR", "ES" },
-        {"AR", "FR" },
-        {"AR", "HE" },
-        {"AR", "IT" },
-        {"AR", "JA" },
-        {"AR", "RU" },
-        {"AR", "TR" },  // Turkish
-        {"AR", "ZH" },
-        
-        {"DE", "AR" },
-        {"DE", "FR" },
-        {"DE", "CA" },  // Catalan
-        {"DE", "CS" },  // Czech
-        {"DE", "EO" },  // Esperanto
-        {"DE", "ES" },
-        {"DE", "FR" },
-        {"DE", "HE" },
-        {"DE", "HU" },  // Hungarian
-        {"DE", "IT" },
-        {"DE", "JA" },
-        {"DE", "LA" },  // Latin
-        {"DE", "NL" },  // Dutch
-        {"DE", "PL" },  // Polish
-        {"DE", "RU" },
-        {"DE", "SV" },  // Swedish
-        {"DE", "TR" },  // Turkish
-        {"DE", "ZH" },
-        {"DE", "TA" },  // Tamil
-        
-        {"ES", "RU" },  // Spanish-Russian
-        
-        {"FR", "BG" },  // Bulgarian
-        {"FR", "CS" },  // Czech
-        {"FR", "DE" },
-        {"FR", "ES" },
-        {"FR", "IT" },
-        {"FR", "JA" },
-        {"FR", "LA" },
-        {"FR", "NL" },  // Dutch
-        {"FR", "RU" },
-        {"FR", "TR" },  // Turkish
-        {"FR", "ZH" },
-
-        {"IT", "DE" },
-        {"IT", "EL" },  // Greek
-        {"IT", "ES" },
-        {"IT", "FR" },
-        {"IT", "HU" },
-        {"IT", "JA" },
-        {"IT", "LA" },  // Latin
-        {"IT", "LV" },  // Latvian
-        {"IT", "NL" },
-        {"IT", "PL" },
-        {"IT", "RU" },
-        {"IT", "SV" },
-        {"IT", "TR" },  // Turkish
-        {"IT", "ZH" },
-
-        {"JA", "ZH" },
-        {"JA", "AR" },
-        {"JA", "KO" },
-
-        {"ZH", "AR" },
-        {"ZH", "DE" },
-        {"ZH", "ES" },
-        {"ZH", "FR" },
-        {"ZH", "IT" },
-        {"ZH", "KO" },
-
-        
-        {"NO", "SV" },
-        {"NO", "FI" },
-        {"FI", "SV" },
-        
-        {"PL", "FR" },  // Polish
-        {"PL", "RU" },  // Polish
-        {"PL", "HU" },  // Polish
-        {"PL", "ES" },  // Polish
-        
-        {"TR", "EL" },  // Turkish, Greek
-
-        {"FA", "HY" },  // Persian, Armenian, by request.
-        {"FA", "SV" },  // Persian, Swedish, by request.
-
-    };
     allPairs.addAll(Arrays.asList(nonEnPairs));
     
     // Add all the EN-XX pairs.
index 085a52a2b35881344ebdc4a8252311c077eee1c0..df462df84785f0063fc8378f8902f5837827de7f 100644 (file)
--- a/todo.txt
+++ b/todo.txt
@@ -1,6 +1,8 @@
-EN dictionaries.
+Compression
+{{de-conj
+{{term
+{{etyl
 Spaces in links are done wrong: "perche mai",click "why on earth", see "why%20..."
-De verbs.
 Delete it conjugation of entries.
 
 
@@ -127,4 +129,5 @@ long click on see also works to link
 Links in HTML work: mostly, they don't open with the keyboard open
 when edit text loses focus, all highlighted: impossible if it's not focused.
 Web view search works.
+EN dictionaries.
   
\ No newline at end of file