From: Thad Hughes Date: Wed, 4 Jan 2012 16:32:09 +0000 (-0800) Subject: Example splitting fixes, tokenizer newline handling, Chinese X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=05a90f5458cdc0a72a8104fac86f1ba4b39a06f1 Example splitting fixes, tokenizer newline handling, Chinese transliteration unit test. --- diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 13ab972..a16fe91 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -70,7 +70,7 @@ public class DictionaryBuilderMain extends TestCase { //isoToWikiName.clear(); boolean go = false; for (final String foreignIso : isoToWikiName.keySet()) { - if (foreignIso.equals("JA")) { + if (foreignIso.equals("SK")) { go = true; } if (!go) { diff --git a/src/com/hughes/android/dictionary/engine/LanguageTest.java b/src/com/hughes/android/dictionary/engine/LanguageTest.java index ea28e6f..2d9b6a0 100644 --- a/src/com/hughes/android/dictionary/engine/LanguageTest.java +++ b/src/com/hughes/android/dictionary/engine/LanguageTest.java @@ -142,15 +142,15 @@ public class LanguageTest extends TestCase { final Language zh = Language.lookup("zh"); final Transliterator transliterator = Transliterator.createFromRules("", zh.getDefaultNormalizerRules(), Transliterator.FORWARD); - assertEquals("xie xie", transliterator.transliterate("謝謝")); - assertEquals("xie xie", transliterator.transliterate("谢谢")); + assertEquals("xiexie", transliterator.transliterate("謝謝")); + assertEquals("xiexie", transliterator.transliterate("谢谢")); - assertEquals("dian nao", transliterator.transliterate("電腦")); - assertEquals("dian nao", transliterator.transliterate("电脑")); - assertEquals("ji suan ji", transliterator.transliterate("計算機")); - assertEquals("ji suan ji", transliterator.transliterate("计算机")); + assertEquals("diannao", transliterator.transliterate("電腦")); + assertEquals("diannao", transliterator.transliterate("电脑")); + assertEquals("jisuanji", transliterator.transliterate("計算機")); + assertEquals("jisuanji", transliterator.transliterate("计算机")); - assertEquals("cheng jiu", transliterator.transliterate("成就")); + assertEquals("chengjiu", transliterator.transliterate("成就")); } diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index b79013d..5ac7d45 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -228,17 +228,17 @@ public final class WikiTokenizer { assert isWikiLink(); // "[[.." if (lastUnescapedPipePos != -1) { - return wikiText.substring(lastUnescapedPipePos + 1, end - 2); + return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2)); } assert start + 2 < wikiText.length() && end >= 2: wikiText; - return wikiText.substring(start + 2, end - 2); + return trimNewlines(wikiText.substring(start + 2, end - 2)); } public String wikiLinkDest() { assert isWikiLink(); // "[[.." if (firstUnescapedPipePos != -1) { - return wikiText.substring(start + 2, firstUnescapedPipePos); + return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos)); } return null; } diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java index c49b1b6..c09bd4f 100644 --- a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java @@ -50,6 +50,7 @@ public class EnWiktionaryXmlParser { "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + + "\\{\\{abbreviation\\}\\}|" + // These are @deprecated: "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + @@ -161,7 +162,7 @@ public class EnWiktionaryXmlParser { } else if (headerName.equals("Translations")) { if (pos == null) { - LOG.warning("Translations without POS: " + title); + LOG.info("Translations without POS (but using anyway): " + title); } doTranslations(wikiTokenizer, pos); } else if (headerName.equals("Pronunciation")) { @@ -518,6 +519,8 @@ public class EnWiktionaryXmlParser { for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { final String nextPrefix = listSection.nextPrefixes.get(i); final String nextLine = listSection.nextLines.get(i); + + // TODO: This splitting is not sensitive to wiki code. int dash = nextLine.indexOf("—"); int mdashLen = 7; if (dash == -1) { @@ -585,7 +588,12 @@ public class EnWiktionaryXmlParser { appendAndIndexWikiCallback.reset(builder, indexedEntry); appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE; appendAndIndexWikiCallback.entryTypeNameSticks = true; - appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); + try { + // TODO: this is a hack needed because we don't safely split on the dash. + appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); + } catch (AssertionError e) { + return "--"; + } final String result = trim(builder.toString()); return result.length() > 0 ? result : "--"; } diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallbacksDefault.java b/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallbacksDefault.java index 7452645..7941a97 100644 --- a/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallbacksDefault.java +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallbacksDefault.java @@ -256,7 +256,11 @@ public final class FunctionCallbacksDefault { displayText = ListUtil.get(args, 1, null); } - appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName); + if (displayText != null) { + appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName); + } else { + LOG.warning("no display text: " + wikiTokenizer.token()); + } final String tr = namedArgs.remove("tr"); if (tr != null) { diff --git a/todo.txt b/todo.txt index 8f7e76b..2016c55 100644 --- a/todo.txt +++ b/todo.txt @@ -1,11 +1,9 @@ For next release: -refactor wiki parsing. "form of" to bottom handle examples like "asdf (asdf)" random word jump multiword find. dictionary update. -{{Arab}} ???italian verbs pronunciation @@ -60,13 +58,6 @@ Bad filing: under Arab? fare {{it-verb}} {{transitive}} :: To do - - -**** Wiktionary: - -in wiktionary - futurismo :: futurism () (noun) - done: {infl} @@ -80,4 +71,6 @@ always put defs in list... ! Check analytics ! Upload dics font size +refactor wiki parsing. +{{Arab}} \ No newline at end of file