From 7573784eea75700436bb900861b93a6d53210fc8 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Mon, 12 Dec 2011 13:29:27 -0800 Subject: [PATCH] go --- .../engine/DictionaryBuilderMain.java | 26 +++++++++------ .../parser/EnWiktionaryXmlParser.java | 32 +++++++++++++------ .../dictionary/parser/WikiTokenizer.java | 9 ++++-- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 9d17081..90c4b88 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -42,7 +42,10 @@ public class DictionaryBuilderMain extends TestCase { //new Lang("^German$", "DE"), }; Lang[] langs2 = new Lang[] { - new Lang("^Italian$", "IT"), +// new Lang("^.*Greek.*$", "EL"), + new Lang("^.*Spanish.*$", "ES"), + new Lang("^.*Italian.*$", "IT"), + /* new Lang("^German$", "DE"), new Lang("^Afrikaans$", "AF"), new Lang("^Armenian$", "HY"), @@ -54,7 +57,6 @@ public class DictionaryBuilderMain extends TestCase { new Lang("^English$", "EN"), new Lang("^Finnish$", "FI"), new Lang("^French$", "FR"), - new Lang("^Greek$", "EL"), new Lang("^Hebrew$", "HE"), new Lang("^Hindi$", "HI"), new Lang("^Icelandic$", "IS"), @@ -74,7 +76,6 @@ public class DictionaryBuilderMain extends TestCase { new Lang("^Sanskrit$", "SA"), new Lang("^Serbian$", "SR"), new Lang("^Somali$", "SO"), - new Lang("^Spanish$", "ES"), new Lang("^Sudanese$", "SU"), new Lang("^Swedish$", "SV"), new Lang("^Tajik$", "TG"), @@ -85,7 +86,7 @@ public class DictionaryBuilderMain extends TestCase { new Lang("^Vietnamese$", "VI"), new Lang("^Welsh$", "CY"), new Lang("^Yiddish$", "YI"), - new Lang("^Zulu$", "ZU"), + new Lang("^Zulu$", "ZU"),*/ }; for (final Lang lang1 : langs1) { @@ -95,12 +96,16 @@ public class DictionaryBuilderMain extends TestCase { } int enIndex = -1; + Lang nonEnglish = null; if (lang2.code.equals("EN")) { enIndex = 2; + nonEnglish = lang1; } if (lang1.code.equals("EN")) { enIndex = 1; + nonEnglish = lang2; } + assert nonEnglish != null; final String dictFile = String.format("dictOutputs/%s-%s_enwiktionary.quickdic", lang1.code, lang2.code); System.out.println("building dictFile: " + dictFile); @@ -110,12 +115,13 @@ public class DictionaryBuilderMain extends TestCase { String.format("--lang2=%s", lang2.code), String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.code, lang2.code), - "--input1=dictInputs/enwiktionary-20110205-pages-articles.xml", - "--input1Name=enwiktionary", - "--input1Format=enwiktionary", - String.format("--input1TranslationPattern1=%s", lang1.nameRegex), - String.format("--input1TranslationPattern2=%s", lang2.nameRegex), - String.format("--input1EnIndex=%d", enIndex), + "--input3=wikiSplit/english.data", + "--input3Name=enwiktionary.english", + "--input3Format=enwiktionary", + "--input3LangPattern=" + nonEnglish.nameRegex, + "--input3LangCodePattern=" + (enIndex == 1 ? lang2.code : lang1.code).toLowerCase(), + "--input3EnIndex=" + enIndex, + }); // Print the entries for diffing. diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 4d94804..4746837 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -190,7 +190,12 @@ public class EnWiktionaryXmlParser { sense = null; } else if (functionName.equals("trans-mid")) { } else if (functionName.equals("trans-see")) { + // TODO + } else if (functionName.startsWith("picdic")) { } else if (functionName.startsWith("checktrans")) { + } else if (functionName.startsWith("ttbc")) { + wikiTokenizer.nextLine(); + // TODO: would be great to handle //TODO: Check this: done = true; } else { System.err.println("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); @@ -212,7 +217,11 @@ public class EnWiktionaryXmlParser { } String rest = line.substring(colonIndex + 1).trim(); - doTranslationLine(line, title, sense, rest); + if (rest.length() > 0) { + doTranslationLine(line, title, sense, rest); + } else { + // TODO: do lines that are like Greek: + } } else if (wikiTokenizer.remainderStartsWith("''See''")) { wikiTokenizer.nextLine(); @@ -244,9 +253,9 @@ public class EnWiktionaryXmlParser { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - + final StringBuilder otherText = new StringBuilder(); - final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest); + final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest, false); while (wikiTokenizer.nextToken() != null) { if (wikiTokenizer.isPlainText()) { @@ -264,13 +273,13 @@ public class EnWiktionaryXmlParser { final List args = wikiTokenizer.functionPositionArgs(); final Map namedArgs = wikiTokenizer.functionNamedArgs(); - if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø")) { + if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø") || functionName.equals("apdx-t")) { if (args.size() < 2) { System.err.println("{{t}} with too few args: " + line + ", title=" + title); continue; } final String langCode = get(args, 0); - if (this.langCodePattern.matcher(langCode).matches()) { + //if (this.langCodePattern.matcher(langCode).matches()) { final String word = get(args, 1); final String gender = get(args, 2); final String transliteration = namedArgs.get("tr"); @@ -286,7 +295,7 @@ public class EnWiktionaryXmlParser { otherText.append(String.format(" (tr. %s)", transliteration)); otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); } - } + //} } else if (functionName.equals("qualifier")) { String qualifier = args.get(0); if (!namedArgs.isEmpty() || args.size() > 1) { @@ -339,7 +348,10 @@ public class EnWiktionaryXmlParser { } else { System.err.println("Bad translation token: " + wikiTokenizer.token()); } - + } + if (otherText.length() == 0) { + System.err.println("Empty otherText: " + line); + return; } StringBuilder englishText = new StringBuilder(); @@ -356,7 +368,9 @@ public class EnWiktionaryXmlParser { final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap); pairEntry.pairs.add(pair); - assert (pairsAdded.add(pair.toString())); + if (!pairsAdded.add(pair.toString())) { + System.err.println("Duplicate pair: " + pair.toString()); + } if (pair.toString().equals("libero {m} :: free (adjective)")) { System.out.println(); } @@ -610,7 +624,7 @@ static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile( final String mainLine = listLines.get(0); - final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine); + final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); while (englishTokenizer.nextToken() != null) { // TODO handle form of.... if (englishTokenizer.isPlainText()) { diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 4377d8a..ad14bc0 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -64,10 +64,15 @@ public final class WikiTokenizer { public WikiTokenizer(final String wikiText) { + this(wikiText, true); + } + + public WikiTokenizer(final String wikiText, final boolean isNewline) { this.wikiText = wikiText; this.matcher = wikiTokenEvent.matcher(wikiText); + justReturnedNewline = false; } - + private void clear() { errors.clear(); tokenStack.clear(); @@ -211,7 +216,7 @@ public final class WikiTokenizer { } // Eat a newline if we're looking at one: - final boolean atNewline = wikiText.charAt(end) == '\n'; + final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028'; if (atNewline) { justReturnedNewline = true; ++end; -- 2.43.0