From 317fb0c2a57c997af6f7f6111d6f423ba3adf1a0 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Fri, 16 Dec 2011 22:14:56 -0800 Subject: [PATCH] Redo splitter language codes. --- bugs | 2 +- .../engine/DictionaryBuilderMain.java | 75 +++++++++---------- .../dictionary/engine/WiktionarySplitter.java | 65 ++++++++++------ .../parser/EnWiktionaryXmlParser.java | 70 ++++++++++------- 4 files changed, 124 insertions(+), 88 deletions(-) diff --git a/bugs b/bugs index 141f52c..4050d59 100644 --- a/bugs +++ b/bugs @@ -1,4 +1,4 @@ -handle examples. +icons handle word-info in English. diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 3479ec7..833e5e9 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -49,51 +49,52 @@ public class DictionaryBuilderMain extends TestCase { new Lang("^English$", "EN", null, "en.txt"), }; Lang[] langs2 = new Lang[] { - //new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"), - new Lang("^.*French.*$", "FR", "french.data", "empty.txt"), - new Lang("^.*Spanish.*$", "ES", "spanish.data", "empty.txt"), - new Lang("^.*Greek.*$", "EL", "greek.data", "empty.txt"), - new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"), - new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"), +// new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"), +// new Lang("^.*French.*$", "FR", "french.data", "empty.txt"), +// new Lang("^.*Spanish.*$", "ES", "spanish.data", "es.txt"), +// new Lang("^.*Greek.*$", "EL", "greek.data", "el.txt"), +// new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"), +// new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"), + new Lang("^.*Afrikaans.*$", "AF", "afrikaans.data", "empty.txt"), + new Lang("^.*Arabic.*$", "AR", "".data, "empty.txt"), + new Lang("^.*Hebrew.*$", "HE"), + new Lang("^.*Hindi.*$", "HI"), + new Lang("^.*Icelandic.*$", "IS"), + new Lang("^.*Irish.*$", "GA"), + new Lang("^.*Korean.*$", "KO"), + new Lang("^.*Maori.*$", "MI"), + new Lang("^.*Norwegian.*$", "NO"), + new Lang("^.*Persian.*$", "FA"), + new Lang("^.*Portuguese.*$", "PT"), + new Lang("^.*Romanian.*$", "RO"), + new Lang("^.*Russian.*$", "RU"), + new Lang("^.*Sanskrit.*$", "SA"), + new Lang("^.*Serbian.*$", "SR"), + new Lang("^.*Swedish.*$", "SV"), + new Lang("^.*Tajik.*$", "TG"), + new Lang("^.*Thai.*$", "TH"), + new Lang("^.*Tibetan.*$", "BO"), + new Lang("^.*Turkish.*$", "TR"), + new Lang("^.*Ukranian.*$", "UK"), + new Lang("^.*Vietnamese.*$", "VI"), + new Lang("^.*Welsh.*$", "CY"), + new Lang("^.*Zulu.*$", "ZU"), + new Lang("^.*Croation.*$", "HR"), + new Lang("^.*Czech.*$", "CS"), + new Lang("^.*Dutch.*$", "NL"), + new Lang("^.*Finnish.*$", "FI"), /* new Lang("^German$", "DE"), - new Lang("^Afrikaans$", "AF"), new Lang("^Armenian$", "HY"), - new Lang("^Arabic$", "AR"), - new Lang("^Croation$", "HR"), - new Lang("^Czech$", "CS"), - new Lang("^Dutch$", "NL"), new Lang("^English$", "EN"), - new Lang("^Finnish$", "FI"), - new Lang("^Hebrew$", "HE"), - new Lang("^Hindi$", "HI"), - new Lang("^Icelandic$", "IS"), - new Lang("^Irish$", "GA"), - new Lang("^Korean$", "KO"), new Lang("^Kurdish$", "KU"), new Lang("^Lithuanian$", "LT"), new Lang("^Malay$", "MS"), - new Lang("^Maori$", "MI"), new Lang("^Mongolian$", "MN"), - new Lang("^Norwegian$", "NO"), - new Lang("^Persian$", "FA"), - new Lang("^Portuguese$", "PT"), - new Lang("^Romanian$", "RO"), - new Lang("^Russian$", "RU"), - new Lang("^Sanskrit$", "SA"), - new Lang("^Serbian$", "SR"), new Lang("^Somali$", "SO"), new Lang("^Sudanese$", "SU"), - new Lang("^Swedish$", "SV"), - new Lang("^Tajik$", "TG"), - new Lang("^Thai$", "TH"), - new Lang("^Tibetan$", "BO"), - new Lang("^Turkish$", "TR"), - new Lang("^Ukranian$", "UK"), - new Lang("^Vietnamese$", "VI"), - new Lang("^Welsh$", "CY"), new Lang("^Yiddish$", "YI"), - new Lang("^Zulu$", "ZU"),*/ + */ }; for (final Lang lang1 : langs1) { @@ -156,10 +157,10 @@ public class DictionaryBuilderMain extends TestCase { } // langs1 DictionaryBuilder.main(new String[] { - "--dictOut=" + OUTPUTS + "DE-EN_chemnitz.quickdic", + "--dictOut=" + OUTPUTS + "DE-EN_all_free.quickdic", "--lang1=DE", "--lang2=EN", - "--dictInfo=@" + INPUTS + "de-en_chemnitz.info", + "--dictInfo=@" + INPUTS + "de-en_all_free.info", "--input1=" + INPUTS + "de-en_chemnitz.txt", "--input1Name=chemnitz", @@ -182,8 +183,6 @@ public class DictionaryBuilderMain extends TestCase { "--input3Name=dictcc", "--input3Charset=UTF8", "--input3Format=dictcc", - - // TODO: wiktionary }); } diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 39addd5..6dd043a 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -79,27 +79,50 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (selectors.isEmpty()) { selectors.addAll(Arrays.asList( - new Selector("../DictionaryData/inputs/enWikiSplit/arabic.data", ".*[Ar]rabic.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/croation.data", ".*[Cc]roation.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/czech.data", ".*[Cc]zech.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/dutch.data", ".*[Du]utch.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/english.data", ".*[Ee]nglish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/french.data", ".*[Ff]rench.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/german.data", ".*[Gg]erman.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/greek.data", ".*[Gg]reek.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/hindi.data", ".*[Hh]indi.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/italian.data", ".*[Ii]talian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/japanese.data", ".*[Jj]apanese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/korean.data", ".*[Kk]orean.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/persian.data", ".*[Pp]ersian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/portuguese.data", ".*[Pp]ortuguese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/romanian.data", ".*[Rr]omanian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/russian.data", ".*[Rr]ussian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/spanish.data", ".*[Ss]panish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/swedish.data", ".*[Ss]wedish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/thai.data", ".*[Tt]hai.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/vietnamese.data", ".*[Vv]ietnamese.*") + new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Mm]andarin|[Cc]hinese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Du]utch.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]inish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ewbrew.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/SU.data", ".*[Ss]udanese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]kranian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*") )); } diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index c2e6e7c..600c6e7 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -40,7 +40,6 @@ public class EnWiktionaryXmlParser { static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); - // TODO: look for {{ and [[ and