gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Redo splitter language codes.
author Thad Hughes <thad.hughes@gmail.com>
Sat, 17 Dec 2011 06:14:56 +0000 (22:14 -0800)
committer Thad Hughes <thad.hughes@gmail.com>
Sat, 17 Dec 2011 06:14:56 +0000 (22:14 -0800)
bugs
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java

diff --git a/bugs b/bugs
index 141f52c6b0fe5deafbe4b8af81a786d6deece1d0..4050d591a1f0ecc255de650045ec956ee2f5f519 100644 (file)
--- a/bugs
+++ b/bugs
@@ -1,4 +1,4 @@
-handle examples.
+icons
 handle word-info in English.
 
 
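diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
index 3479ec70bcb8d288d1bfba4e4f910ac04597f2fc..833e5e97c4fe0a02e1431a1a24f91824e45824c9 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java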
@@ -49,51 +49,52 @@ public class DictionaryBuilderMain extends TestCase {
         new Lang("^English$", "EN", null, "en.txt"),
     };
     Lang[] langs2 = new Lang[] { 
-        //new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
-        new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
-        new Lang("^.*Spanish.*$", "ES", "spanish.data", "empty.txt"),
-        new Lang("^.*Greek.*$", "EL", "greek.data", "empty.txt"),
-        new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"),
-        new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"),
+//        new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
+//        new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
+//        new Lang("^.*Spanish.*$", "ES", "spanish.data", "es.txt"),
+//        new Lang("^.*Greek.*$", "EL", "greek.data", "el.txt"),
+//        new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"),
+//        new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"),
+        new Lang("^.*Afrikaans.*$", "AF", "afrikaans.data", "empty.txt"),
+        new Lang("^.*Arabic.*$", "AR", "".data, "empty.txt"),
+        new Lang("^.*Hebrew.*$", "HE"),
+        new Lang("^.*Hindi.*$", "HI"),
+        new Lang("^.*Icelandic.*$", "IS"),
+        new Lang("^.*Irish.*$", "GA"),
+        new Lang("^.*Korean.*$", "KO"),
+        new Lang("^.*Maori.*$", "MI"),
+        new Lang("^.*Norwegian.*$", "NO"),
+        new Lang("^.*Persian.*$", "FA"),
+        new Lang("^.*Portuguese.*$", "PT"),
+        new Lang("^.*Romanian.*$", "RO"),
+        new Lang("^.*Russian.*$", "RU"),
+        new Lang("^.*Sanskrit.*$", "SA"),
+        new Lang("^.*Serbian.*$", "SR"),
+        new Lang("^.*Swedish.*$", "SV"),
+        new Lang("^.*Tajik.*$", "TG"),
+        new Lang("^.*Thai.*$", "TH"),
+        new Lang("^.*Tibetan.*$", "BO"),
+        new Lang("^.*Turkish.*$", "TR"),
+        new Lang("^.*Ukranian.*$", "UK"),
+        new Lang("^.*Vietnamese.*$", "VI"),
+        new Lang("^.*Welsh.*$", "CY"),
+        new Lang("^.*Zulu.*$", "ZU"),
+        new Lang("^.*Croation.*$", "HR"),
+        new Lang("^.*Czech.*$", "CS"),
+        new Lang("^.*Dutch.*$", "NL"),
+        new Lang("^.*Finnish.*$", "FI"),
         /*
         new Lang("^German$", "DE"),
-        new Lang("^Afrikaans$", "AF"),
         new Lang("^Armenian$", "HY"),
-        new Lang("^Arabic$", "AR"),
-        new Lang("^Croation$", "HR"),
-        new Lang("^Czech$", "CS"),
-        new Lang("^Dutch$", "NL"),
         new Lang("^English$", "EN"),
-        new Lang("^Finnish$", "FI"),
-        new Lang("^Hebrew$", "HE"),
-        new Lang("^Hindi$", "HI"),
-        new Lang("^Icelandic$", "IS"),
-        new Lang("^Irish$", "GA"),
-        new Lang("^Korean$", "KO"),
         new Lang("^Kurdish$", "KU"),
         new Lang("^Lithuanian$", "LT"),
         new Lang("^Malay$", "MS"),
-        new Lang("^Maori$", "MI"),
         new Lang("^Mongolian$", "MN"),
-        new Lang("^Norwegian$", "NO"),
-        new Lang("^Persian$", "FA"),
-        new Lang("^Portuguese$", "PT"),
-        new Lang("^Romanian$", "RO"),
-        new Lang("^Russian$", "RU"),
-        new Lang("^Sanskrit$", "SA"),
-        new Lang("^Serbian$", "SR"),
         new Lang("^Somali$", "SO"),
         new Lang("^Sudanese$", "SU"),
-        new Lang("^Swedish$", "SV"),
-        new Lang("^Tajik$", "TG"),
-        new Lang("^Thai$", "TH"),
-        new Lang("^Tibetan$", "BO"),
-        new Lang("^Turkish$", "TR"),
-        new Lang("^Ukranian$", "UK"),
-        new Lang("^Vietnamese$", "VI"),
-        new Lang("^Welsh$", "CY"),
         new Lang("^Yiddish$", "YI"),
-        new Lang("^Zulu$", "ZU"),*/
+        */
     };
     
     for (final Lang lang1 : langs1) {
@@ -156,10 +157,10 @@ public class DictionaryBuilderMain extends TestCase {
     }  // langs1
 
     DictionaryBuilder.main(new String[] {
-        "--dictOut=" + OUTPUTS + "DE-EN_chemnitz.quickdic",
+        "--dictOut=" + OUTPUTS + "DE-EN_all_free.quickdic",
         "--lang1=DE",
         "--lang2=EN",
-        "--dictInfo=@" + INPUTS + "de-en_chemnitz.info",
+        "--dictInfo=@" + INPUTS + "de-en_all_free.info",
 
         "--input1=" + INPUTS + "de-en_chemnitz.txt",
         "--input1Name=chemnitz",
@@ -182,8 +183,6 @@ public class DictionaryBuilderMain extends TestCase {
         "--input3Name=dictcc",
         "--input3Charset=UTF8",
         "--input3Format=dictcc",
-        
-        // TODO: wiktionary
     });
 
   }
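diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
index 39addd58db7321d186503147cc6ff8bfc2e40f82..6dd043aa9ef69a8e38bd4a04e4984c858dfd9bd2 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
+++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java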
@@ -79,27 +79,50 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     if (selectors.isEmpty()) {
       selectors.addAll(Arrays.asList(
-          new Selector("../DictionaryData/inputs/enWikiSplit/arabic.data", ".*[Ar]rabic.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/croation.data", ".*[Cc]roation.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/czech.data", ".*[Cc]zech.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/dutch.data", ".*[Du]utch.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/english.data", ".*[Ee]nglish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/french.data", ".*[Ff]rench.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/german.data", ".*[Gg]erman.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/greek.data", ".*[Gg]reek.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/hindi.data", ".*[Hh]indi.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/italian.data", ".*[Ii]talian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/japanese.data", ".*[Jj]apanese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/korean.data", ".*[Kk]orean.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/persian.data", ".*[Pp]ersian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/portuguese.data", ".*[Pp]ortuguese.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/romanian.data", ".*[Rr]omanian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/russian.data", ".*[Rr]ussian.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/spanish.data", ".*[Ss]panish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/swedish.data", ".*[Ss]wedish.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/thai.data", ".*[Tt]hai.*"),
-          new Selector("../DictionaryData/inputs/enWikiSplit/vietnamese.data", ".*[Vv]ietnamese.*")
+          new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Mm]andarin|[Cc]hinese.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Du]utch.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]inish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ewbrew.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/SU.data", ".*[Ss]udanese.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]kranian.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"),
+          new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*")
           ));
     }
     
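diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
index c2e6e7c2726e1f7ea5d8b5c0aa3d33fbd295bc2e..600c6e7b8056fa92bcf0635857530e5f5b60f9b3 100644 (file)
--- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
+++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java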
@@ -40,7 +40,6 @@ public class EnWiktionaryXmlParser {
   
   static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName());
   
-  // TODO: look for {{ and [[ and <adf> <!-- in output.
   // TODO: process {{ttbc}} lines
   
   static final Pattern partOfSpeechHeader = Pattern.compile(
@@ -200,13 +199,13 @@ public class EnWiktionaryXmlParser {
           sense = null;
         } else if (functionName.equals("trans-mid")) {
         } else if (functionName.equals("trans-see")) {
-          // TODO
+          // TODO: would also be nice...
         } else if (functionName.startsWith("picdic")) {
         } else if (functionName.startsWith("checktrans")) {
         } else if (functionName.startsWith("ttbc")) {
           wikiTokenizer.nextLine();
-          // TODO: would be great to handle
-          //TODO: Check this: done = true;
+          // TODO: would be great to handle ttbc
+          // TODO: Check this: done = true;
         } else {
           LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
         }
@@ -230,7 +229,7 @@ public class EnWiktionaryXmlParser {
         if (rest.length() > 0) {
           doTranslationLine(line, title, pos, sense, rest);
         } else {
-          // TODO: do lines that are like Greek:
+          // TODO: do lines that are like "Greek:"
         }
         
       } else if (wikiTokenizer.remainderStartsWith("''See''")) {
@@ -467,7 +466,7 @@ public class EnWiktionaryXmlParser {
       }
       
       if (currentHeadingDepth > posDepth) {
-        // TODO
+        // TODO: deal with other neat info sections
         continue;
       }
       
@@ -556,7 +555,7 @@ public class EnWiktionaryXmlParser {
     
     } finally {
       // Here's where we exit.
-      // TODO: Should we make an entry even if there are no foreign list items?
+      // Should we make an entry even if there are no foreign list items?
       String foreign = foreignBuilder.toString().trim();
       if (!foreign.toLowerCase().startsWith(title.toLowerCase())) {
         foreign = title + " " + foreign;
@@ -666,50 +665,64 @@ public class EnWiktionaryXmlParser {
         mdashLen = 3;
       }
       
-      // TODO: index and clean these!!!
-      if (nextPrefix.equals("#:") && dash != -1) {
+      if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) {
         final String foreignEx = nextLine.substring(0, dash);
         final String englishEx = nextLine.substring(dash + mdashLen);
-        final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder), formatAndIndexExampleString(foreignEx, otherIndexBuilder), swap);
-        pairEntry.pairs.add(pair);
+        final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, otherIndexBuilder, indexedEntry), swap);
+        if (pair.lang1 != "--" && pair.lang1 != "--") {
+          pairEntry.pairs.add(pair);
+        }
         lastForeign = null;
-      } else if (nextPrefix.equals("#:")){
-        final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap);
+      } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")){
+        final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
         lastForeign = nextLine;
-        pairEntry.pairs.add(pair);
+        if (pair.lang1 != "--" && pair.lang1 != "--") {
+          pairEntry.pairs.add(pair);
+        }
       } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) {
         if (lastForeign != null) {
           pairEntry.pairs.remove(pairEntry.pairs.size() - 1);
-          final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder), formatAndIndexExampleString(lastForeign, otherIndexBuilder), swap);
-          pairEntry.pairs.add(pair);
+          final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap);
+          if (pair.lang1 != "--" && pair.lang1 != "--") {
+            pairEntry.pairs.add(pair);
+          }
         } else {
           LOG.warning("English example with no foreign: " + title + ", " + nextLine);
         }
       } else if (nextPrefix.equals("#*")) {
         // Can't really index these.
-        final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap);
+        final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
         lastForeign = nextLine;
-        pairEntry.pairs.add(pair);
-      } else if (nextPrefix.equals("#::*")) {
-        final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap);
-        pairEntry.pairs.add(pair);
-      } else {
-        assert false;
+        if (pair.lang1 != "--" && pair.lang1 != "--") {
+          pairEntry.pairs.add(pair);
+        }
+      } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) {
+        final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+        if (pair.lang1 != "--" && pair.lang1 != "--") {
+          pairEntry.pairs.add(pair);
+        }
+//      } else {
+//        assert false;
       }
     }
   }
   
-  private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder) {
+  private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) {
     final WikiTokenizer wikiTokenizer = new WikiTokenizer(example, false);
     final StringBuilder builder = new StringBuilder();
     boolean insideTripleQuotes = false;
     while (wikiTokenizer.nextToken() != null) {
       if (wikiTokenizer.isPlainText()) {
         builder.append(wikiTokenizer.token());
-        
+        if (indexBuilder != null) {
+          indexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.token(), EntryTypeName.WIKTIONARY_EXAMPLE);
+        }
       } else if (wikiTokenizer.isWikiLink()) {
-        builder.append(wikiTokenizer.wikiLinkText());
-        
+        final String text = wikiTokenizer.wikiLinkText().replaceAll("'", ""); 
+        builder.append(text);
+        if (indexBuilder != null) {
+          indexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_EXAMPLE);
+        }
       } else if (wikiTokenizer.isFunction()) {
         builder.append(wikiTokenizer.token());
       } else if (wikiTokenizer.isMarkup()) {
@@ -722,7 +735,8 @@ public class EnWiktionaryXmlParser {
         LOG.warning("unexpected token: " + wikiTokenizer.token());
       }
     }
-    return trim(builder.toString()); 
+    final String result = trim(builder.toString());
+    return result.length() > 0 ? result : "--";
   }