From 5a79ed5f458a5c469d4b5fa81f25e83baabad57c Mon Sep 17 00:00:00 2001
From: Thad Hughes
Date: Fri, 16 Dec 2011 19:13:01 -0800
Subject: [PATCH] Tokenizer fixes.

---
 bugs                                          |  8 +++-----
 .../engine/DictionaryBuilderMain.java         |  2 +-
 .../parser/EnWiktionaryXmlParser.java         | 20 ++++++++++++--------
 .../dictionary/parser/WikiTokenizer.java      | 14 +++++++++++++-
 .../dictionary/parser/WikiTokenizerTest.java  | 16 +++++++++++++++-
 5 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/bugs b/bugs
index 0a5f46f..141f52c 100644
--- a/bugs
+++ b/bugs
@@ -1,15 +1,13 @@
-Setup new ICU
+handle examples.
+handle word-info in English.
+
 Bad ordering:
 ===do===
 do {{wikipedia|Do (nota)|lang=it}}{{infl|it|noun|g=m}} :: do, the musical note
 fare {{it-verb}} {{transitive}} :: To do
-
 sub-levels in translations.
-examples.
 
 in wiktionary
 futurismo :: futurism () (noun)
 
 
-
-
\ No newline at end of file
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
index ebf4ba7..3479ec7 100644
--- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
+++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
@@ -49,7 +49,7 @@ public class DictionaryBuilderMain extends TestCase {
       new Lang("^English$", "EN", null, "en.txt"),
   };
   Lang[] langs2 = new Lang[] {
-      new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
+      //new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
       new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
       new Lang("^.*Spanish.*$", "ES", "spanish.data", "empty.txt"),
       new Lang("^.*Greek.*$", "EL", "greek.data", "empty.txt"),
diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
index 41c7e41..e841e87 100644
--- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
+++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
@@ -127,10 +127,11 @@ public class EnWiktionaryXmlParser {
 
   // -------------------------------------------------------------------------
 
-  String pos = null;
-  int posDepth = -1;
-
   private void doEnglishWord(String title, String text) {
+
+    String pos = null;
+    int posDepth = -1;
+
     final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
 
     while (wikiTokenizer.nextToken() != null) {
@@ -146,7 +147,10 @@ public class EnWiktionaryXmlParser {
           posDepth = wikiTokenizer.headingDepth();
           pos = wikiTokenizer.headingWikiText();
         } else if (headerName.equals("Translations")) {
-          doTranslations(title, wikiTokenizer);
+          if (pos == null) {
+            LOG.warning("Translations without POS: " + title);
+          }
+          doTranslations(title, wikiTokenizer, pos);
         } else if (headerName.equals("Pronunciation")) {
           //doPronunciation(wikiLineReader);
         }
@@ -161,7 +165,7 @@ public class EnWiktionaryXmlParser {
       "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
       "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
 
-  private void doTranslations(final String title, final WikiTokenizer wikiTokenizer) {
+  private void doTranslations(final String title, final WikiTokenizer wikiTokenizer, final String pos) {
     if (title.equals("absolutely")) {
       System.out.println();
     }
@@ -207,7 +211,7 @@
        } else {
          LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
        }
-      } else if (wikiTokenizer.isListItem() && wikiTokenizer.listItemPrefix().startsWith("*")) {
+      } else if (wikiTokenizer.isListItem()) {
        final String line = wikiTokenizer.listItemWikiText();
        // This line could produce an output...
 
@@ -225,7 +229,7 @@
        String rest = line.substring(colonIndex + 1).trim();
        if (rest.length() > 0) {
-         doTranslationLine(line, title, sense, rest);
+         doTranslationLine(line, title, pos, sense, rest);
        } else {
          // TODO: do lines that are like Greek:
        }
 
@@ -256,7 +260,7 @@
     return index < list.size() ? list.get(index) : null;
   }
 
-  private void doTranslationLine(final String line, final String title, final String sense, final String rest) {
+  private void doTranslationLine(final String line, final String title, final String pos, final String sense, final String rest) {
     // Good chance we'll actually file this one...
     final PairEntry pairEntry = new PairEntry();
     final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java
index f80605d..403b27c 100644
--- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java
+++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java
@@ -337,7 +337,7 @@ public final class WikiTokenizer {
 
   public String token() {
     final String token = wikiText.substring(start, end);
-    assert token.equals("\n") || !token.endsWith("\n") : token;
+    assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
     return token;
   }
 
@@ -347,6 +347,7 @@
     final boolean insideFunction = toFind.equals("}}");
 
     int end = start;
+    int firstNewline = -1;
     while (end < wikiText.length()) {
       if (matcher.find(end)) {
         final String matchText = matcher.group();
@@ -355,6 +356,9 @@
         assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
         if (matchText.length() == 0) {
           assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
+          if (firstNewline == -1) {
+            firstNewline = matcher.end();
+          }
           if (tokenStack.isEmpty() && toFind.equals("\n")) {
             return matchStart;
           }
@@ -413,6 +417,14 @@
       // Inside the while loop. Just go forward.
       end = Math.max(end, matcher.end());
     }
+    if (toFind.equals("\n") && tokenStack.isEmpty()) {
+      // We were looking for the end, we got it.
+      return end;
+    }
+    if (firstNewline != -1) {
+      errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+      return firstNewline;
+    }
     return end;
   }
 
diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java
index 4079f1b..9142bd8 100644
--- a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java
+++ b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java
@@ -91,6 +91,19 @@ public class WikiTokenizerTest extends TestCase {
     assertEquals("202", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg2"));
     assertEquals("{{n1|n2=7|n3}}", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg3"));
 
+    wikiText = "{{gloss|asdf}\nAsdf\n\n";
+    assertEquals("{{gloss|asdf}", new WikiTokenizer(wikiText).nextToken().token());
+
+    wikiText = "#*{{quote-book|year=1960|author={{w|P. G. Wodehouse}}\n" +
+        "|title={{w|Jeeves in the Offing}}\n" +
+        "|section=chapter XI\n" +
+        "|passage=“I'm sorely beset, Jeeves. Do you recall telling me once about someone who told somebody he could tell him something which would make him think a bit? Knitted socks and porcu\n" +
+        "pines entered into it, I remember.” “I think you may be referring to the ghost of the father of Hamlet, Prince of Denmark, sir. Addressing his son, he said ‘I could a tale unfold whos\n" +
+        "e lightest word would harrow up thy soul, freeze thy young blood, make thy two eyes, like stars, start from their spheres, thy knotted and combined locks to part and each particular h\n" +
+        "air to stand on end like quills upon the fretful '''porpentine'''.’ ” “That's right. Locks, of course, not socks. Odd that he should have said '''porpentine''' when he meant porc\n" +
+        "upine. Slip of the tongue, no doubt, as so often happens with ghosts.”}}";
+    assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+
   }
 
@@ -142,7 +155,7 @@ public class WikiTokenizerTest extends TestCase {
     assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
     assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
     assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
-    assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+    assertEquals(2, new WikiTokenizer(wikiText).nextToken().errors.size());
 
     wikiText = "=a==";
     assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token());
@@ -168,6 +181,7 @@ public class WikiTokenizerTest extends TestCase {
     assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
     assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText());
     assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
+
   }
-- 
2.43.0
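
Usage sketch (illustrative, not part of the patch): a minimal caller that drives WikiTokenizer the same way doEnglishWord does and then inspects the errors collected by the new firstNewline fallback. Only methods that appear in the hunks above are used; the class name WikiTokenizerSketch, the package declaration, and direct access to the errors field (assumed visible to same-package callers, as in WikiTokenizerTest) are assumptions.

package com.hughes.android.dictionary.parser;

public class WikiTokenizerSketch {
  public static void main(final String[] args) {
    // The unterminated template from the new test case: with this patch the
    // tokenizer stops at the first newline instead of swallowing the rest of
    // the text, and records an error rather than failing.
    final String wikiText = "{{gloss|asdf}\nAsdf\n\n";
    final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
    while (tokenizer.nextToken() != null) {
      if (tokenizer.isHeading()) {
        System.out.println("heading(" + tokenizer.headingDepth() + "): " + tokenizer.headingWikiText());
      } else if (tokenizer.isListItem()) {
        System.out.println("list item: " + tokenizer.listItemWikiText());
      } else {
        System.out.println("token: '" + tokenizer.token() + "'");
      }
    }
    // Errors such as "Couldn't find: }}, ..." accumulate here instead of
    // aborting tokenization.
    for (final String error : tokenizer.errors) {
      System.out.println("error: " + error);
    }
  }
}

The point of the firstNewline fallback, as the diff reads, is that a malformed "{{" or "[[" that never closes gets reported through errors and truncated at the end of its own line, so the rest of the entry still tokenizes; that is presumably also why one existing heading test now expects two errors instead of one.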