From e55f0ff1e195e1238e91909d0366eac391395ab8 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Mon, 7 Mar 2011 18:17:54 -0800 Subject: [PATCH] go --- .../engine/DictionaryBuilderMain.java | 4 +- .../parser/EnWiktionaryXmlParser.java | 12 ++--- .../android/dictionary/parser/WikiParser.java | 54 +++++++++---------- .../dictionary/parser/WikiParserTest.java | 8 ++- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 7438dc2..c49305e 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -22,7 +22,6 @@ public class DictionaryBuilderMain extends TestCase { public static void main(final String[] args) throws Exception { - DictionaryBuilder.main(new String[] { "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic", "--lang1=DE", @@ -35,6 +34,7 @@ public class DictionaryBuilderMain extends TestCase { "--input1Format=chemnitz", }); + Lang[] langs1 = new Lang[] { new Lang("^English$", "EN"), new Lang("^German$", "DE"), @@ -108,7 +108,7 @@ public class DictionaryBuilderMain extends TestCase { String.format("--lang2=%s", lang2.code), String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.code, lang2.code), - "--input1=dictInputs/enwiktionary-20101015-pages-articles", + "--input1=dictInputs/enwiktionary-20110205-pages-articles.xml", "--input1Name=enwiktionary", "--input1Format=enwiktionary", String.format("--input1TranslationPattern1=%s", lang1.nameRegex), diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 761c5dc..5277ce4 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -186,9 +186,9 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im static final Set useRemainingArgTemplates = new LinkedHashSet(Arrays.asList( "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", - "zh-tsp", "zh-zh-p")); - static final Set ignoreTemplates = new LinkedHashSet(Arrays.asList("")); - static final Set grammarTemplates = new LinkedHashSet(Arrays.asList("impf", "pf")); + "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx")); + static final Set ignoreTemplates = new LinkedHashSet(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g")); + static final Set grammarTemplates = new LinkedHashSet(Arrays.asList("impf", "pf", "pf.", "indeclinable")); static final Set passThroughTemplates = new LinkedHashSet(Arrays.asList("zzzzzzzzzzzzzzz")); @Override @@ -267,7 +267,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im return; } - if (name.equals("audio") || name.equals("rhymes") || name.equals("hyphenation")) { + if (ignoreTemplates.contains(name)) { return; } @@ -358,7 +358,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im wikiBuilder.append("sg."); } else if (grammarTemplates.contains(name)) { - assert positionalArgs.size() == 1 && namedArgs.isEmpty(); + assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; wikiBuilder.append(name).append("."); } else if (name.equals("l")) { @@ -627,7 +627,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im @Override public void onUnterminated(String start, String rest) { - throw new RuntimeException(start + rest); + System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest); } @Override public void onInvalidHeaderEnd(String rest) { diff --git a/src/com/hughes/android/dictionary/parser/WikiParser.java b/src/com/hughes/android/dictionary/parser/WikiParser.java index 5b73b87..37c7a53 100644 --- a/src/com/hughes/android/dictionary/parser/WikiParser.java +++ b/src/com/hughes/android/dictionary/parser/WikiParser.java @@ -7,6 +7,8 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.hughes.util.StringUtil; + public class WikiParser { private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|(==+)\\s*$|"); if (end == -1) { callback.onUnterminated("