Added WholeSection entries and parser.

[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderMain.java
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java

index 201ac591df8912643f38baf8da0292207bc24f46..db5c757fe83a391848174eba0047e9d44a97c43c 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
@@ -14,11 +14,16 @@
  
  package com.hughes.android.dictionary.engine;
  
+import java.util.Arrays;
  import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
  import java.util.Map;
+import java.util.Set;
  
  import junit.framework.TestCase;
  
+import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
  import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
  
  public class DictionaryBuilderMain extends TestCase {
@@ -61,11 +66,144 @@ public class DictionaryBuilderMain extends TestCase {
      // HACK: The missing "e" prevents a full match, causing "Cantonese" to be appended to the entries.
      isoToRegex.put("ZH", "Chinese|Mandarin|Cantones");
      
+    
+    // Build the non-EN language-pair dictionaries.
+    
+    final String[][] nonEnPairs = new String[][] {
+        
+        /*
+        {"AR", "DE" },
+        {"AR", "ES" },
+        {"AR", "FR" },
+        {"AR", "HE" },
+        {"AR", "IT" },
+        {"AR", "JA" },
+        {"AR", "RU" },
+        {"AR", "TR" },  // Turkish
+        {"AR", "ZH" },
+        
+        {"DE", "AR" },
+        {"DE", "FR" },
+        {"DE", "CA" },  // Catalan
+        {"DE", "CS" },  // Czech
+        {"DE", "EO" },  // Esperanto
+        {"DE", "ES" },
+        {"DE", "FR" },
+        {"DE", "HE" },
+        {"DE", "HU" },  // Hungarian
+        {"DE", "IT" },
+        {"DE", "JA" },
+        {"DE", "LA" },  // Latin
+        {"DE", "PL" },  // Polish
+        {"DE", "RU" },
+        {"DE", "SV" },  // Swedish
+        {"DE", "TR" },  // Turkish
+        {"DE", "ZH" },
+
+        
+        {"FR", "BG" },  // Bulgarian
+        {"FR", "CS" },  // Czech
+        {"FR", "DE" },
+        {"FR", "ES" },
+        {"FR", "IT" },
+        {"FR", "JA" },
+        {"FR", "LA" },
+        {"FR", "NL" },  // Dutch
+        {"FR", "RU" },
+        {"FR", "TR" },  // Turkish
+        {"FR", "ZH" },
+
+        {"IT", "DE" },
+        {"IT", "EL" },  // Greek
+        {"IT", "ES" },
+        {"IT", "FR" },
+        {"IT", "HU" },
+        {"IT", "JA" },
+        {"IT", "LA" },  // Latin
+        {"IT", "LV" },  // Latvian
+        {"IT", "NL" },
+        {"IT", "PL" },
+        {"IT", "RU" },
+        {"IT", "SV" },
+        {"IT", "TR" },  // Turkish
+        {"IT", "ZH" },
+
+        {"JA", "ZH" },
+        {"JA", "AR" },
+        {"JA", "KO" },
+
+        {"ZH", "AR" },
+        {"ZH", "DE" },
+        {"ZH", "ES" },
+        {"ZH", "FR" },
+        {"ZH", "IT" },
+        {"ZH", "KO" },
+
+        
+        {"NO", "SV" },
+        {"NO", "FI" },
+        {"FI", "SV" },
+        
+        {"PL", "FR" },  // Polish
+        {"PL", "RU" },  // Polish
+        {"PL", "HU" },  // Polish
+        {"PL", "ES" },  // Polish
+
+        */
+        
+
+    };
+    
+    final Set<List<String>> done = new LinkedHashSet<List<String>>();
+    for (final String[] pair : nonEnPairs) {
+      Arrays.sort(pair);
+      final List<String> pairList = Arrays.asList(pair);
+      if (done.contains(pairList)) {
+        continue;
+      }
+      done.add(pairList);
+      
+      final String lang1 = pair[0];
+      final String lang2 = pair[1];
+      
+      final String dictFile = String.format("%s/%s-%s_enwiktionary_BETA.quickdic", 
+          OUTPUTS, lang1, lang2);
+      System.out.println("building dictFile: " + dictFile);
+
+      if (!isoToStoplist.containsKey(lang1)) {
+        isoToStoplist.put(lang1, "empty.txt");
+      }
+      if (!isoToStoplist.containsKey(lang2)) {
+        isoToStoplist.put(lang2, "empty.txt");
+      }
+      
+      DictionaryBuilder.main(new String[] {
+          String.format("--dictOut=%s", dictFile),
+          String.format("--lang1=%s", lang1),
+          String.format("--lang2=%s", lang2),
+          String.format("--lang1Stoplist=%s", STOPLISTS + isoToStoplist.get(lang1)),
+          String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(lang2)),
+          String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary.", lang1, lang2),
+
+          String.format("--input2=%swikiSplit/en/EN.data", INPUTS),
+          String.format("--input2Name=BETA!enwiktionary.%s-%s", lang1, lang2),
+          String.format("--input2Format=%s", EnTranslationToTranslationParser.NAME),
+          String.format("--input2LangPattern1=%s", lang1),
+          String.format("--input2LangPattern2=%s", lang2),
+      });
+    }
+    if (1==1) {
+      //return;
+    }
+
+
+    // Now build the EN-to-foreign dictionaries.
+    
  //    isoToWikiName.keySet().retainAll(Arrays.asList("UK", "HR", "FI"));
      //isoToWikiName.clear();
      boolean go = false;
      for (final String foreignIso : isoToWikiName.keySet()) {
-      if (foreignIso.equals("BO")) {
+      if (foreignIso.equals("SL")) {
          go = true;
        }
        if (!go) {
@@ -93,16 +231,18 @@ public class DictionaryBuilderMain extends TestCase {
              String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)),
              String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.\n\n%s", foreignIso, isoToDedication.get(foreignIso)),
  
-            "--input2=" + INPUTS + "enWikiSplit/" + foreignIso + ".data",
+            "--input2=" + INPUTS + "wikiSplit/en/" + foreignIso + ".data",
              "--input2Name=enwiktionary." + foreignIso,
              "--input2Format=enwiktionary",
+            "--input2WiktionaryType=EnForeign",
              "--input2LangPattern=" + isoToRegex.get(foreignIso),
              "--input2LangCodePattern=" + foreignIso.toLowerCase(),
              "--input2EnIndex=1",
  
-            "--input3=" + INPUTS + "enWikiSplit/EN.data",
+            "--input3=" + INPUTS + "wikiSplit/en/EN.data",
              "--input3Name=enwiktionary.english",
              "--input3Format=enwiktionary",
+            "--input3WiktionaryType=EnToTranslation",
              "--input3LangPattern=" + isoToRegex.get(foreignIso),
              "--input3LangCodePattern=" + foreignIso.toLowerCase(),
              "--input3EnIndex=1",
@@ -111,6 +251,8 @@ public class DictionaryBuilderMain extends TestCase {
          
      }  // foreignIso
  
+    // Now handle the German-English special case.
+
      final String dictFile = String.format("%s/DE-EN_chemnitz_enwiktionary.quickdic", OUTPUTS);
      DictionaryBuilder.main(new String[] {
          "--dictOut=" + dictFile,
@@ -125,16 +267,18 @@ public class DictionaryBuilderMain extends TestCase {
          "--input4Charset=UTF8",
          "--input4Format=chemnitz",
          
-        "--input2=" + INPUTS + "enWikiSplit/DE.data",
+        "--input2=" + INPUTS + "wikiSplit/en/DE.data",
          "--input2Name=enwiktionary.DE",
          "--input2Format=enwiktionary",
+        "--input2WiktionaryType=EnForeign",
          "--input2LangPattern=German",
          "--input2LangCodePattern=de",
          "--input2EnIndex=2",
  
-        "--input3=" + INPUTS + "enWikiSplit/EN.data",
+        "--input3=" + INPUTS + "wikiSplit/en/EN.data",
          "--input3Name=enwiktionary.english",
          "--input3Format=enwiktionary",
+        "--input3WiktionaryType=EnToTranslation",
          "--input3LangPattern=German",
          "--input3LangCodePattern=de",
          "--input3EnIndex=2",