From: Thad Hughes Date: Sun, 18 Dec 2011 19:38:00 +0000 (-0800) Subject: Move test data, fix DictFileParser, fix splitter, fix crash during X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=a7ae2524281869de5aa756ae35524b21bab3e08a Move test data, fix DictFileParser, fix splitter, fix crash during weird qualifier. --- diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 04b72b4..888d5c8 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -131,7 +131,7 @@ public class DictionaryBuilder { } else if ("chemnitz".equals(inputFormat)) { new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file); } else if ("enwiktionary".equals(inputFormat)) { - final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern")); + final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE); final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern")); final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1; String pageLimit = keyValueArgs.remove(prefix + "PageLimit"); diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 833e5e9..36564ea 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -19,7 +19,9 @@ import java.io.PrintWriter; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import junit.framework.TestCase; @@ -28,116 +30,83 @@ public class DictionaryBuilderMain extends TestCase { static final String INPUTS = "../DictionaryData/inputs/"; static final String STOPLISTS = "../DictionaryData/inputs/stoplists/"; static final String OUTPUTS = "../DictionaryData/outputs/"; - - static class Lang { - final String nameRegex; - final String isoCode; - final String wikiSplit; - final String stoplistFile; - public Lang(String nameRegex, String code, final String wikiSplit, final String stoplistFile) { - this.nameRegex = nameRegex; - this.isoCode = code; - this.wikiSplit = wikiSplit; - this.stoplistFile = stoplistFile; - } - } - - + public static void main(final String[] args) throws Exception { - - Lang[] langs1 = new Lang[] { - new Lang("^English$", "EN", null, "en.txt"), - }; - Lang[] langs2 = new Lang[] { -// new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"), -// new Lang("^.*French.*$", "FR", "french.data", "empty.txt"), -// new Lang("^.*Spanish.*$", "ES", "spanish.data", "es.txt"), -// new Lang("^.*Greek.*$", "EL", "greek.data", "el.txt"), -// new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"), -// new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"), - new Lang("^.*Afrikaans.*$", "AF", "afrikaans.data", "empty.txt"), - new Lang("^.*Arabic.*$", "AR", "".data, "empty.txt"), - new Lang("^.*Hebrew.*$", "HE"), - new Lang("^.*Hindi.*$", "HI"), - new Lang("^.*Icelandic.*$", "IS"), - new Lang("^.*Irish.*$", "GA"), - new Lang("^.*Korean.*$", "KO"), - new Lang("^.*Maori.*$", "MI"), - new Lang("^.*Norwegian.*$", "NO"), - new Lang("^.*Persian.*$", "FA"), - new Lang("^.*Portuguese.*$", "PT"), - new Lang("^.*Romanian.*$", "RO"), - new Lang("^.*Russian.*$", "RU"), - new Lang("^.*Sanskrit.*$", "SA"), - new Lang("^.*Serbian.*$", "SR"), - new Lang("^.*Swedish.*$", "SV"), - new Lang("^.*Tajik.*$", "TG"), - new Lang("^.*Thai.*$", "TH"), - new Lang("^.*Tibetan.*$", "BO"), - new Lang("^.*Turkish.*$", "TR"), - new Lang("^.*Ukranian.*$", "UK"), - new Lang("^.*Vietnamese.*$", "VI"), - new Lang("^.*Welsh.*$", "CY"), - new Lang("^.*Zulu.*$", "ZU"), - new Lang("^.*Croation.*$", "HR"), - new Lang("^.*Czech.*$", "CS"), - new Lang("^.*Dutch.*$", "NL"), - new Lang("^.*Finnish.*$", "FI"), - /* - new Lang("^German$", "DE"), - new Lang("^Armenian$", "HY"), - new Lang("^English$", "EN"), - new Lang("^Kurdish$", "KU"), - new Lang("^Lithuanian$", "LT"), - new Lang("^Malay$", "MS"), - new Lang("^Mongolian$", "MN"), - new Lang("^Somali$", "SO"), - new Lang("^Sudanese$", "SU"), - new Lang("^Yiddish$", "YI"), - */ - }; - for (final Lang lang1 : langs1) { - for (final Lang lang2 : langs2) { - if (lang1.nameRegex.equals(lang2.nameRegex)) { - continue; - } + final Map isoToWikiName = new LinkedHashMap(Language.isoCodeToWikiName); + isoToWikiName.remove("EN"); + isoToWikiName.remove("DE"); + + final Map isoToDedication = new LinkedHashMap(); + isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); + isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja."); + isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau."); + // German handled in file. + isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge."); + isoToDedication.put("IT", "Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!"); + isoToDedication.put("JA", "Japanese dictionary dedicated to Akane Watanabe."); + isoToDedication.put("KO", "Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!"); + isoToDedication.put("PT", "Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder."); + isoToDedication.put("RO", "Romanian dictionary dedicated to Radu Teodorescu."); + isoToDedication.put("RU", "Russian dictionary dedicated to Maxim Aronin--best friend always!."); + isoToDedication.put("SR", "Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey."); + isoToDedication.put("ES", "Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!"); + isoToDedication.put("SV", "Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!"); + + final Map isoToStoplist = new LinkedHashMap(); + isoToStoplist.put("DE", "de.txt"); + isoToStoplist.put("EN", "en.txt"); + isoToStoplist.put("ES", "es.txt"); + isoToStoplist.put("IT", "it.txt"); + isoToStoplist.put("FR", "fr.txt"); + + final Map isoToRegex = new LinkedHashMap(); + isoToRegex.put("ZH", ".*Chinese.*|.*Mandarin.*|.*Cantonese.*"); + + boolean go = false; + isoToWikiName.clear(); + for (final String foreignIso : isoToWikiName.keySet()) { + if (foreignIso.equals("GA")) { + go = true; + } + if (!go) { + continue; + } + + final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso); + System.out.println("building dictFile: " + dictFile); - int enIndex = -1; - Lang nonEnglish = null; - if (lang2.isoCode.equals("EN")) { - enIndex = 2; - nonEnglish = lang1; + if (!isoToStoplist.containsKey(foreignIso)) { + isoToStoplist.put(foreignIso, "empty.txt"); } - if (lang1.isoCode.equals("EN")) { - enIndex = 1; - nonEnglish = lang2; + if (!isoToDedication.containsKey(foreignIso)) { + isoToDedication.put(foreignIso, ""); } - assert nonEnglish != null; - - final String dictFile = String.format(OUTPUTS + "/%s-%s_enwiktionary.quickdic", lang1.isoCode, lang2.isoCode); - System.out.println("building dictFile: " + dictFile); + if (!isoToRegex.containsKey(foreignIso)) { + isoToRegex.put(foreignIso, ".*" + isoToWikiName.get(foreignIso) + ".*"); + } + DictionaryBuilder.main(new String[] { String.format("--dictOut=%s", dictFile), - String.format("--lang1=%s", lang1.isoCode), - String.format("--lang2=%s", lang2.isoCode), - String.format("--lang1Stoplist=%s", STOPLISTS + lang1.stoplistFile), - String.format("--lang2Stoplist=%s", STOPLISTS + lang2.stoplistFile), - String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.isoCode, lang2.isoCode), - - "--input2=" + INPUTS + "enWikiSplit/" + nonEnglish.wikiSplit, - "--input2Name=enwiktionary." + nonEnglish.wikiSplit, + String.format("--lang1=EN"), + String.format("--lang2=%s", foreignIso), + String.format("--lang1Stoplist=%s", STOPLISTS + isoToStoplist.get("EN")), + String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)), + String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary. %s", foreignIso, isoToDedication.get(foreignIso)), + + "--input2=" + INPUTS + "enWikiSplit/" + foreignIso + ".data", + "--input2Name=enwiktionary." + foreignIso, "--input2Format=enwiktionary", - "--input2LangPattern=" + nonEnglish.nameRegex, - "--input2LangCodePattern=" + nonEnglish.isoCode.toLowerCase(), - "--input2EnIndex=" + enIndex, + "--input2LangPattern=" + isoToRegex.get(foreignIso), + "--input2LangCodePattern=" + foreignIso.toLowerCase(), + "--input2EnIndex=2", - "--input3=" + INPUTS + "enWikiSplit/english.data", + "--input3=" + INPUTS + "enWikiSplit/EN.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", - "--input3LangPattern=" + nonEnglish.nameRegex, - "--input3LangCodePattern=" + (enIndex == 1 ? lang2.isoCode : lang1.isoCode).toLowerCase(), - "--input3EnIndex=" + enIndex, + "--input3LangPattern=" + isoToRegex.get(foreignIso), + "--input3LangCodePattern=" + foreignIso.toLowerCase(), + "--input3EnIndex=2", }); @@ -153,36 +122,33 @@ public class DictionaryBuilderMain extends TestCase { textOut.close(); raf.close(); - } // langs2 - } // langs1 + } // foreignIso DictionaryBuilder.main(new String[] { - "--dictOut=" + OUTPUTS + "DE-EN_all_free.quickdic", + "--dictOut=" + OUTPUTS + "DE-EN_chemnitz_enwiktionary", "--lang1=DE", "--lang2=EN", - "--dictInfo=@" + INPUTS + "de-en_all_free.info", + "--dictInfo=@" + INPUTS + "de-en_chemnitz_enwiktionary.info", "--input1=" + INPUTS + "de-en_chemnitz.txt", "--input1Name=chemnitz", "--input1Charset=UTF8", "--input1Format=chemnitz", - }); - - DictionaryBuilder.main(new String[] { - "--dictOut=" + OUTPUTS + "de-en_all.quickdic", - "--lang1=DE", - "--lang2=EN", - "--dictInfo=@" + INPUTS + "de-en_all.info", - - "--input2=" + INPUTS + "de-en_chemnitz.txt", - "--input2Name=dictcc", - "--input2Charset=UTF8", - "--input2Format=chemnitz", - - "--input3=" + INPUTS + "/NONFREE/de-en_dictcc.txt", - "--input3Name=dictcc", - "--input3Charset=UTF8", - "--input3Format=dictcc", + + "--input2=" + INPUTS + "enWikiSplit/DE.data", + "--input2Name=enwiktionary.DE", + "--input2Format=enwiktionary", + "--input2LangPattern=" + isoToRegex.get("DE"), + "--input2LangCodePattern=de", + "--input2EnIndex=2", + + "--input3=" + INPUTS + "enWikiSplit/EN.data", + "--input3Name=enwiktionary.english", + "--input3Format=enwiktionary", + "--input3LangPattern=" + isoToRegex.get("DE"), + "--input3LangCodePattern=de", + "--input3EnIndex=2", + }); } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 20a0172..8059a1e 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -26,12 +26,12 @@ import junit.framework.TestCase; public class DictionaryBuilderTest extends TestCase { - public static final String TEST_INPUTS = "../DictionaryData/testdata/inputs/"; + public static final String TEST_INPUTS = "testdata/inputs/"; public static final String WIKISPLIT = "../DictionaryData/inputs/enWikiSplit/"; public static final String STOPLISTS = "../DictionaryData/inputs/stoplists/"; - public static final String GOLDENS = "../DictionaryData/testdata/goldens/"; + public static final String GOLDENS = "testdata/goldens/"; - public static final String TEST_OUTPUTS = "../DictionaryData/testdata/outputs/"; + public static final String TEST_OUTPUTS = "testdata/outputs/"; public void testWiktionaryItalianFromItalian() throws Exception { final String name = "wiktionary.it_it.quickdic"; @@ -45,7 +45,7 @@ public class DictionaryBuilderTest extends TestCase { "--lang2Stoplist=" + STOPLISTS + "en.txt", "--dictInfo=SomeWikiData", - "--input4=" + WIKISPLIT + "italian.data", + "--input4=" + WIKISPLIT + "IT.data", "--input4Name=enwiktionary.italian", "--input4Format=enwiktionary", "--input4LangPattern=Italian", @@ -71,7 +71,7 @@ public class DictionaryBuilderTest extends TestCase { "--lang2Stoplist=" + STOPLISTS + "en.txt", "--dictInfo=SomeWikiData", - "--input3=" + WIKISPLIT + "english.data", + "--input3=" + WIKISPLIT + "EN.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", "--input3LangPattern=Italian", diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 6dd043a..94f7e26 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -79,50 +79,52 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (selectors.isEmpty()) { selectors.addAll(Arrays.asList( - new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Mm]andarin|[Cc]hinese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Du]utch.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]inish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ewbrew.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/SU.data", ".*[Ss]udanese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]kranian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Cc]hinese.*|.*[Mm]andarin.*|.*Cantonese.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/DA.data", ".*[Dd]anish.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Dd]utch.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]innish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/haw.data", ".*[Hh]awaiian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ebrew.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/PL.data", ".*[Pp]olish.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]krainian.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*") )); } @@ -137,9 +139,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE); + int pageCount = 0; private void endPage() { final String title = titleBuilder.toString(); - System.out.println("endPage: " + title); + if (++pageCount % 1000 == 0) { + System.out.println("endPage: " + title + ", count=" + pageCount); + } String text = textBuilder.toString(); diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 67ca432..861c693 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -135,6 +135,16 @@ public class DictFileParser { for (int i = 0; i < subfields[0].length; ++i) { subfields[0][i] = subfields[0][i].trim(); subfields[1][i] = subfields[1][i].trim(); + if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { + logger.warning("Empty pair: " + line); + continue; + } + if (subfields[0][i].length() == 0) { + subfields[0][i] = "__"; + } + if (subfields[1][i].length() == 0) { + subfields[1][i] = "__"; + } pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); } final IndexedEntry entryData = new IndexedEntry(pairEntry); diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 600c6e7..6a6f438 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -306,12 +306,16 @@ public class EnWiktionaryXmlParser { } //} } else if (functionName.equals("qualifier")) { - String qualifier = args.get(0); - if (!namedArgs.isEmpty() || args.size() > 1) { - LOG.warning("weird qualifier: " + line); + if (args.size() == 0) { + otherText.append(wikiTokenizer.token()); + } else { + String qualifier = args.get(0); + if (!namedArgs.isEmpty() || args.size() > 1) { + LOG.warning("weird qualifier: " + line); + } + // Unindexed! + otherText.append("(").append(qualifier).append(")"); } - // Unindexed! - otherText.append("(").append(qualifier).append(")"); } else if (encodings.contains(functionName)) { otherText.append("").append(args.get(0)); otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); @@ -578,6 +582,7 @@ public class EnWiktionaryXmlParser { final String prefix = listSection.firstPrefix; if (prefix.length() > 1) { + // Could just get looser and say that any prefix longer than first is a sublist. LOG.warning("Prefix too long: " + listSection); return; }