gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Move test data, fix DictFileParser, fix splitter, fix crash during
author Thad Hughes <thad.hughes@gmail.com>
Sun, 18 Dec 2011 19:38:00 +0000 (11:38 -0800)
committer Thad Hughes <thad.hughes@gmail.com>
Sun, 18 Dec 2011 19:38:00 +0000 (11:38 -0800)
weird qualifier.

src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/parser/DictFileParser.java
src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java

index 04b72b4d8265fd976e05f3d36c2bc11544a4b9a2..888d5c87b5ccddf93194bafcbbdfebebf8efcf45 100644 (file)
@@ -131,7 +131,7 @@ public class DictionaryBuilder {
         } else if ("chemnitz".equals(inputFormat)) {
           new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
         } else if ("enwiktionary".equals(inputFormat)) {
-          final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"));
+          final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE);
           final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern"));
           final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1;
           String pageLimit = keyValueArgs.remove(prefix + "PageLimit");
index 833e5e97c4fe0a02e1431a1a24f91824e45824c9..36564eacce3ef918459e6c906849d9833c9af7a6 100644 (file)
@@ -19,7 +19,9 @@ import java.io.PrintWriter;
 import java.io.RandomAccessFile;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Map;
 
 import junit.framework.TestCase;
 
@@ -28,116 +30,83 @@ public class DictionaryBuilderMain extends TestCase {
   static final String INPUTS = "../DictionaryData/inputs/";
   static final String STOPLISTS = "../DictionaryData/inputs/stoplists/";
   static final String OUTPUTS = "../DictionaryData/outputs/";
-  
-  static class Lang {
-    final String nameRegex;
-    final String isoCode;
-    final String wikiSplit;
-    final String stoplistFile;
-    public Lang(String nameRegex, String code, final String wikiSplit, final String stoplistFile) {
-      this.nameRegex = nameRegex;
-      this.isoCode = code;
-      this.wikiSplit = wikiSplit;
-      this.stoplistFile = stoplistFile;
-    }
-  }
-  
-  
+    
   public static void main(final String[] args) throws Exception {
-
-    Lang[] langs1 = new Lang[] { 
-        new Lang("^English$", "EN", null, "en.txt"),
-    };
-    Lang[] langs2 = new Lang[] { 
-//        new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
-//        new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
-//        new Lang("^.*Spanish.*$", "ES", "spanish.data", "es.txt"),
-//        new Lang("^.*Greek.*$", "EL", "greek.data", "el.txt"),
-//        new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"),
-//        new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"),
-        new Lang("^.*Afrikaans.*$", "AF", "afrikaans.data", "empty.txt"),
-        new Lang("^.*Arabic.*$", "AR", "".data, "empty.txt"),
-        new Lang("^.*Hebrew.*$", "HE"),
-        new Lang("^.*Hindi.*$", "HI"),
-        new Lang("^.*Icelandic.*$", "IS"),
-        new Lang("^.*Irish.*$", "GA"),
-        new Lang("^.*Korean.*$", "KO"),
-        new Lang("^.*Maori.*$", "MI"),
-        new Lang("^.*Norwegian.*$", "NO"),
-        new Lang("^.*Persian.*$", "FA"),
-        new Lang("^.*Portuguese.*$", "PT"),
-        new Lang("^.*Romanian.*$", "RO"),
-        new Lang("^.*Russian.*$", "RU"),
-        new Lang("^.*Sanskrit.*$", "SA"),
-        new Lang("^.*Serbian.*$", "SR"),
-        new Lang("^.*Swedish.*$", "SV"),
-        new Lang("^.*Tajik.*$", "TG"),
-        new Lang("^.*Thai.*$", "TH"),
-        new Lang("^.*Tibetan.*$", "BO"),
-        new Lang("^.*Turkish.*$", "TR"),
-        new Lang("^.*Ukranian.*$", "UK"),
-        new Lang("^.*Vietnamese.*$", "VI"),
-        new Lang("^.*Welsh.*$", "CY"),
-        new Lang("^.*Zulu.*$", "ZU"),
-        new Lang("^.*Croation.*$", "HR"),
-        new Lang("^.*Czech.*$", "CS"),
-        new Lang("^.*Dutch.*$", "NL"),
-        new Lang("^.*Finnish.*$", "FI"),
-        /*
-        new Lang("^German$", "DE"),
-        new Lang("^Armenian$", "HY"),
-        new Lang("^English$", "EN"),
-        new Lang("^Kurdish$", "KU"),
-        new Lang("^Lithuanian$", "LT"),
-        new Lang("^Malay$", "MS"),
-        new Lang("^Mongolian$", "MN"),
-        new Lang("^Somali$", "SO"),
-        new Lang("^Sudanese$", "SU"),
-        new Lang("^Yiddish$", "YI"),
-        */
-    };
     
-    for (final Lang lang1 : langs1) {
-      for (final Lang lang2 : langs2) {
-        if (lang1.nameRegex.equals(lang2.nameRegex)) {
-          continue;
-        }
+    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(Language.isoCodeToWikiName);
+    isoToWikiName.remove("EN");
+    isoToWikiName.remove("DE");
+
+    final Map<String,String>  isoToDedication = new LinkedHashMap<String, String>();
+    isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
+    isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja.");
+    isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
+    // German handled in file.
+    isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
+    isoToDedication.put("IT", "Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe!  Ti amo!");
+    isoToDedication.put("JA", "Japanese dictionary dedicated to Akane Watanabe.");
+    isoToDedication.put("KO", "Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!");
+    isoToDedication.put("PT", "Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder.");
+    isoToDedication.put("RO", "Romanian dictionary dedicated to Radu Teodorescu.");
+    isoToDedication.put("RU", "Russian dictionary dedicated to Maxim Aronin--best friend always!.");
+    isoToDedication.put("SR", "Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey.");
+    isoToDedication.put("ES", "Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!");
+    isoToDedication.put("SV", "Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!");
+
+    final Map<String,String>  isoToStoplist = new LinkedHashMap<String, String>();
+    isoToStoplist.put("DE", "de.txt");
+    isoToStoplist.put("EN", "en.txt");
+    isoToStoplist.put("ES", "es.txt");
+    isoToStoplist.put("IT", "it.txt");
+    isoToStoplist.put("FR", "fr.txt");
+
+    final Map<String,String>  isoToRegex = new LinkedHashMap<String, String>();
+    isoToRegex.put("ZH", ".*Chinese.*|.*Mandarin.*|.*Cantonese.*");
+    
+    boolean go = false; 
+    isoToWikiName.clear();
+    for (final String foreignIso : isoToWikiName.keySet()) {
+      if (foreignIso.equals("GA")) {
+        go = true;
+      }
+      if (!go) {
+        continue;
+      }
+
+        final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso);
+        System.out.println("building dictFile: " + dictFile);
         
-        int enIndex = -1;
-        Lang nonEnglish = null;
-        if (lang2.isoCode.equals("EN")) {
-          enIndex = 2;
-          nonEnglish = lang1;
+        if (!isoToStoplist.containsKey(foreignIso)) {
+          isoToStoplist.put(foreignIso, "empty.txt");
         }
-        if (lang1.isoCode.equals("EN")) {
-          enIndex = 1;
-          nonEnglish = lang2;
+        if (!isoToDedication.containsKey(foreignIso)) {
+          isoToDedication.put(foreignIso, "");
         }
-        assert nonEnglish != null;
-
-        final String dictFile = String.format(OUTPUTS + "/%s-%s_enwiktionary.quickdic", lang1.isoCode, lang2.isoCode);
-        System.out.println("building dictFile: " + dictFile);
+        if (!isoToRegex.containsKey(foreignIso)) {
+          isoToRegex.put(foreignIso, ".*" + isoToWikiName.get(foreignIso) + ".*");
+        }
+  
         DictionaryBuilder.main(new String[] {
             String.format("--dictOut=%s", dictFile),
-            String.format("--lang1=%s", lang1.isoCode),
-            String.format("--lang2=%s", lang2.isoCode),
-            String.format("--lang1Stoplist=%s", STOPLISTS + lang1.stoplistFile),
-            String.format("--lang2Stoplist=%s", STOPLISTS + lang2.stoplistFile),
-            String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.isoCode, lang2.isoCode),
-
-            "--input2=" + INPUTS + "enWikiSplit/" + nonEnglish.wikiSplit,
-            "--input2Name=enwiktionary." + nonEnglish.wikiSplit,
+            String.format("--lang1=EN"),
+            String.format("--lang2=%s", foreignIso),
+            String.format("--lang1Stoplist=%s", STOPLISTS + isoToStoplist.get("EN")),
+            String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)),
+            String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.  %s", foreignIso, isoToDedication.get(foreignIso)),
+
+            "--input2=" + INPUTS + "enWikiSplit/" + foreignIso + ".data",
+            "--input2Name=enwiktionary." + foreignIso,
             "--input2Format=enwiktionary",
-            "--input2LangPattern=" + nonEnglish.nameRegex,
-            "--input2LangCodePattern=" + nonEnglish.isoCode.toLowerCase(),
-            "--input2EnIndex=" + enIndex,
+            "--input2LangPattern=" + isoToRegex.get(foreignIso),
+            "--input2LangCodePattern=" + foreignIso.toLowerCase(),
+            "--input2EnIndex=2",
 
-            "--input3=" + INPUTS + "enWikiSplit/english.data",
+            "--input3=" + INPUTS + "enWikiSplit/EN.data",
             "--input3Name=enwiktionary.english",
             "--input3Format=enwiktionary",
-            "--input3LangPattern=" + nonEnglish.nameRegex,
-            "--input3LangCodePattern=" + (enIndex == 1 ? lang2.isoCode : lang1.isoCode).toLowerCase(),
-            "--input3EnIndex=" + enIndex,
+            "--input3LangPattern=" + isoToRegex.get(foreignIso),
+            "--input3LangCodePattern=" + foreignIso.toLowerCase(),
+            "--input3EnIndex=2",
 
         });
         
@@ -153,36 +122,33 @@ public class DictionaryBuilderMain extends TestCase {
         textOut.close();
         raf.close();
 
-      }  // langs2
-    }  // langs1
+    }  // foreignIso
 
     DictionaryBuilder.main(new String[] {
-        "--dictOut=" + OUTPUTS + "DE-EN_all_free.quickdic",
+        "--dictOut=" + OUTPUTS + "DE-EN_chemnitz_enwiktionary",
         "--lang1=DE",
         "--lang2=EN",
-        "--dictInfo=@" + INPUTS + "de-en_all_free.info",
+        "--dictInfo=@" + INPUTS + "de-en_chemnitz_enwiktionary.info",
 
         "--input1=" + INPUTS + "de-en_chemnitz.txt",
         "--input1Name=chemnitz",
         "--input1Charset=UTF8",
         "--input1Format=chemnitz",
-    });
-
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=" + OUTPUTS + "de-en_all.quickdic",
-        "--lang1=DE",
-        "--lang2=EN",
-        "--dictInfo=@" + INPUTS + "de-en_all.info",
-
-        "--input2=" + INPUTS + "de-en_chemnitz.txt",
-        "--input2Name=dictcc",
-        "--input2Charset=UTF8",
-        "--input2Format=chemnitz",
-
-        "--input3=" + INPUTS + "/NONFREE/de-en_dictcc.txt",
-        "--input3Name=dictcc",
-        "--input3Charset=UTF8",
-        "--input3Format=dictcc",
+        
+        "--input2=" + INPUTS + "enWikiSplit/DE.data",
+        "--input2Name=enwiktionary.DE",
+        "--input2Format=enwiktionary",
+        "--input2LangPattern=" + isoToRegex.get("DE"),
+        "--input2LangCodePattern=de",
+        "--input2EnIndex=2",
+
+        "--input3=" + INPUTS + "enWikiSplit/EN.data",
+        "--input3Name=enwiktionary.english",
+        "--input3Format=enwiktionary",
+        "--input3LangPattern=" + isoToRegex.get("DE"),
+        "--input3LangCodePattern=de",
+        "--input3EnIndex=2",
+        
     });
 
   }
index 20a017211f3a7524adc23b3bec969e826822e09d..8059a1ef5e78e3b8c6cba9cb06d33b9119e5c1dd 100644 (file)
@@ -26,12 +26,12 @@ import junit.framework.TestCase;
 
 public class DictionaryBuilderTest extends TestCase {
   
-  public static final String TEST_INPUTS = "../DictionaryData/testdata/inputs/";
+  public static final String TEST_INPUTS = "testdata/inputs/";
   public static final String WIKISPLIT = "../DictionaryData/inputs/enWikiSplit/";
   public static final String STOPLISTS = "../DictionaryData/inputs/stoplists/";
-  public static final String GOLDENS = "../DictionaryData/testdata/goldens/";
+  public static final String GOLDENS = "testdata/goldens/";
 
-  public static final String TEST_OUTPUTS = "../DictionaryData/testdata/outputs/";
+  public static final String TEST_OUTPUTS = "testdata/outputs/";
 
   public void testWiktionaryItalianFromItalian() throws Exception {
     final String name = "wiktionary.it_it.quickdic";
@@ -45,7 +45,7 @@ public class DictionaryBuilderTest extends TestCase {
         "--lang2Stoplist=" + STOPLISTS + "en.txt",
         "--dictInfo=SomeWikiData",
 
-        "--input4=" + WIKISPLIT + "italian.data",
+        "--input4=" + WIKISPLIT + "IT.data",
         "--input4Name=enwiktionary.italian",
         "--input4Format=enwiktionary",
         "--input4LangPattern=Italian",
@@ -71,7 +71,7 @@ public class DictionaryBuilderTest extends TestCase {
         "--lang2Stoplist=" + STOPLISTS + "en.txt",
         "--dictInfo=SomeWikiData",
 
-        "--input3=" + WIKISPLIT + "english.data",
+        "--input3=" + WIKISPLIT + "EN.data",
         "--input3Name=enwiktionary.english",
         "--input3Format=enwiktionary",
         "--input3LangPattern=Italian",
index 6dd043aa9ef69a8e38bd4a04e4984c858dfd9bd2..94f7e262b7421c7af26cf0710ee77cee8736c4b1 100644 (file)
@@ -79,50 +79,52 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     if (selectors.isEmpty()) {
       selectors.addAll(Arrays.asList(
-          new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Mm]andarin|[Cc]hinese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Du]utch.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]inish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ewbrew.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/SU.data", ".*[Ss]udanese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]kranian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*")
+//          new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Cc]hinese.*|.*[Mm]andarin.*|.*Cantonese.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/DA.data", ".*[Dd]anish.*")
+//          new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Dd]utch.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]innish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/haw.data", ".*[Hh]awaiian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ebrew.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/PL.data", ".*[Pp]olish.*")
+//          new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]krainian.*")
+//          new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"),
+//          new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*")
           ));
     }
     
@@ -137,9 +139,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
   static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE);
   
+  int pageCount = 0;
   private void endPage() {
     final String title = titleBuilder.toString();
-    System.out.println("endPage: " + title);
+    if (++pageCount % 1000 == 0) {
+      System.out.println("endPage: " + title + ", count=" + pageCount);
+    }
     
     String text = textBuilder.toString();
     
index 67ca43293cbdbf9bde04bd27de5ff03c7f55ce50..861c693bbbaab35a255d3891c255171ed7d8bda3 100644 (file)
@@ -135,6 +135,16 @@ public class DictFileParser {
     for (int i = 0; i < subfields[0].length; ++i) {
       subfields[0][i] = subfields[0][i].trim();
       subfields[1][i] = subfields[1][i].trim();
+      if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) {
+        logger.warning("Empty pair: " + line);
+        continue;
+      }
+      if (subfields[0][i].length() == 0) {
+        subfields[0][i] = "__";
+      }
+      if (subfields[1][i].length() == 0) {
+        subfields[1][i] = "__";
+      }
       pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i]));
     }
     final IndexedEntry entryData = new IndexedEntry(pairEntry);
index 600c6e7b8056fa92bcf0635857530e5f5b60f9b3..6a6f43856a2bb34aa3e720d9ae839bc80401f6b3 100644 (file)
@@ -306,12 +306,16 @@ public class EnWiktionaryXmlParser {
             }
           //}
         } else if (functionName.equals("qualifier")) {
-          String qualifier = args.get(0);
-          if (!namedArgs.isEmpty() || args.size() > 1) {
-            LOG.warning("weird qualifier: " + line);
+          if (args.size() == 0) {
+           otherText.append(wikiTokenizer.token()); 
+          } else { 
+            String qualifier = args.get(0);
+            if (!namedArgs.isEmpty() || args.size() > 1) {
+              LOG.warning("weird qualifier: " + line);
+            }
+            // Unindexed!
+            otherText.append("(").append(qualifier).append(")");
           }
-          // Unindexed!
-          otherText.append("(").append(qualifier).append(")");
         } else if (encodings.contains(functionName)) {
           otherText.append("").append(args.get(0));
           otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
@@ -578,6 +582,7 @@ public class EnWiktionaryXmlParser {
     
     final String prefix = listSection.firstPrefix;
     if (prefix.length() > 1) {
+      // Could just get looser and say that any prefix longer than first is a sublist.
       LOG.warning("Prefix too long: " + listSection);
       return;
     }