From 52123581b0c4aa46298b9d6cbc4697accffc1cc7 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Fri, 30 Dec 2011 11:36:40 -0800 Subject: [PATCH] Refactoring wiki parsing, bigtime. Underway, so lots of errors.... --- .../dictionary/engine/DictionaryBuilder.java | 2 +- .../dictionary/engine/WiktionarySplitter.java | 1 - .../parser/EnWiktionaryXmlParser.java.old | 661 ------------------ .../parser/GeneralFunctionCallbacks.java | 7 - .../dictionary/parser/ItWiktionaryParser.java | 5 - .../parser/WikiFunctionCallback.java | 10 - .../dictionary/parser/WikiTokenizer.java | 2 +- .../parser/WikiTokenizerCallback.java | 5 - .../dictionary/parser/WikiWord.java.old | 353 ---------- .../AppendAndIndexWikiCallback.java | 105 +++ .../EnWiktionaryXmlParser.java | 135 +--- .../parser/enwiktionary/FunctionCallback.java | 80 +++ 12 files changed, 197 insertions(+), 1169 deletions(-) delete mode 100644 src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old delete mode 100644 src/com/hughes/android/dictionary/parser/GeneralFunctionCallbacks.java delete mode 100644 src/com/hughes/android/dictionary/parser/ItWiktionaryParser.java delete mode 100644 src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java delete mode 100644 src/com/hughes/android/dictionary/parser/WikiTokenizerCallback.java delete mode 100644 src/com/hughes/android/dictionary/parser/WikiWord.java.old create mode 100644 src/com/hughes/android/dictionary/parser/enwiktionary/AppendAndIndexWikiCallback.java rename src/com/hughes/android/dictionary/parser/{ => enwiktionary}/EnWiktionaryXmlParser.java (89%) create mode 100644 src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallback.java diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 888d5c8..a3cc7c0 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -31,7 +31,7 @@ import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import com.hughes.android.dictionary.parser.DictFileParser; -import com.hughes.android.dictionary.parser.EnWiktionaryXmlParser; +import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryXmlParser; import com.hughes.util.Args; import com.hughes.util.FileUtil; diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 2628700..2e732f0 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -20,7 +20,6 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.regex.Matcher; diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old deleted file mode 100644 index a3d76ee..0000000 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old +++ /dev/null @@ -1,661 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.hughes.android.dictionary.parser; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -import com.hughes.android.dictionary.engine.DictionaryBuilder; -import com.hughes.android.dictionary.engine.IndexBuilder; -import com.hughes.android.dictionary.parser.WikiWord.FormOf; -import com.hughes.android.dictionary.parser.WikiWord.Translation; -import com.hughes.util.ListUtil; -import com.hughes.util.StringUtil; - -public class EnWiktionaryXmlParserOld extends org.xml.sax.helpers.DefaultHandler implements WikiCallback { - - static final Pattern partOfSpeechHeader = Pattern.compile( - "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + - "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + - "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|" + - // These are @deprecated: - "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + - "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + - // These are extras I found: - "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + - "Particle|Interjection|Pronominal adverb" + - "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); - - static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+"); - - final DictionaryBuilder dictBuilder; - - final IndexBuilder[] indexBuilders; - final Pattern[] langPatterns; - final int enIndexBuilder; - - StringBuilder titleBuilder; - StringBuilder textBuilder; - StringBuilder currentBuilder = null; - - static void assertTrue(final boolean condition) { - assertTrue(condition, ""); - } - - static void assertTrue(final boolean condition, final String message) { - if (!condition) { - System.err.println("Assertion failed, message: " + message); - new RuntimeException().printStackTrace(System.err); - } - } - - public EnWiktionaryXmlParserOld(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) { - assertTrue(langPatterns.length == 2); - this.dictBuilder = dictBuilder; - this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]); - this.langPatterns = langPatterns; - this.enIndexBuilder = enIndexBuilder; - } - - @Override - public void startElement(String uri, String localName, String qName, - Attributes attributes) { - currentBuilder = null; - if ("page".equals(qName)) { - titleBuilder = new StringBuilder(); - - // Start with "\n" to better match certain strings. 
- textBuilder = new StringBuilder("\n"); - } else if ("title".equals(qName)) { - currentBuilder = titleBuilder; - } else if ("text".equals(qName)) { - currentBuilder = textBuilder; - } - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - if (currentBuilder != null) { - currentBuilder.append(ch, start, length); - } - } - - @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - currentBuilder = null; - if ("page".equals(qName)) { - endPage(); - } - } - - - public void parse(final File file) throws ParserConfigurationException, - SAXException, IOException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); - parser.parse(file, this); - } - - int pageCount = 0; - private void endPage() { - title = titleBuilder.toString(); - ++pageCount; - if (pageCount % 1000 == 0) { - System.out.println("pageCount=" + pageCount); - } - if (title.startsWith("Wiktionary:") || - title.startsWith("Template:") || - title.startsWith("Appendix:") || - title.startsWith("Category:") || - title.startsWith("Index:") || - title.startsWith("MediaWiki:") || - title.startsWith("TransWiki:") || - title.startsWith("Citations:") || - title.startsWith("Concordance:") || - title.startsWith("Help:")) { - return; - } - currentDepth = 0; - words.clear(); - currentHeading = null; - insidePartOfSpeech = false; -// System.err.println("Working on page: " + title); - try { - WikiParser.parse(textBuilder.toString(), this); - } catch (Throwable e) { - System.err.println("Failure on page: " + title); - e.printStackTrace(System.err); - } - - for (final WikiWord word : words) { - word.wikiWordToQuickDic(dictBuilder, enIndexBuilder); - } // WikiWord - - } // endPage() - - - // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ - - /** - * Two things can happen: - * - * We can be in a ==German== section. There we will see English definitions. - * Each POS should get its own QuickDic entry. Pretty much everything goes - * in. - * - * Or we can be in an ==English== section with English definitions - * and maybe see translations for languages we care about. - * - * In either case, we need to differentiate the subsections (Noun, Verb, etc.) - * into separate QuickDic entries, but that's tricky--how do we know when we - * found a subsection? Just ignore anything containing pronunciation and - * etymology? - * - * How do we decide when to seal the deal on an entry? - * - * Would be nice if the parser told us about leaving sections.... - * - * - */ - - String title; - String currentHeading; - int currentDepth; - final List words = new ArrayList(); - WikiWord currentWord; - WikiWord.PartOfSpeech currentPartOfSpeech; - WikiWord.TranslationSense currentTranslationSense; - boolean insidePartOfSpeech; - - StringBuilder wikiBuilder = null; - - @Override - public void onWikiLink(String[] args) { - if (wikiBuilder == null) { - return; - } - wikiBuilder.append(args[args.length - 1]); - } - - // ttbc: translations to be checked. 
- static final Set useRemainingArgTemplates = new LinkedHashSet(Arrays.asList( - "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", - "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", - "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx")); - static final Set ignoreTemplates = new LinkedHashSet(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g")); - static final Set grammarTemplates = new LinkedHashSet(Arrays.asList("impf", "pf", "pf.", "indeclinable")); - static final Set passThroughTemplates = new LinkedHashSet(Arrays.asList("zzzzzzzzzzzzzzz")); - - @Override - public void onTemplate(final List positionalArgs, final Map namedArgs) { - if (positionalArgs.isEmpty()) { - // This happens very rarely with special templates. - return; - } - final String name = positionalArgs.get(0); - - namedArgs.remove("lang"); - namedArgs.remove("nocat"); - namedArgs.remove("nocap"); - namedArgs.remove("sc"); - - // Pronunciation - if (currentWord != null) { - if (name.equals("a")) { - // accent tag - currentWord.currentPronunciation = new StringBuilder(); - currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation); - return; - } - - if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) { - namedArgs.remove("lang"); - for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) { - final String pron = namedArgs.remove("" + i); - if (pron != null) { - positionalArgs.add(pron); - } else { - if (i > 10) { - break; - } - } - } - if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) { - System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString()); - } - if (currentWord.currentPronunciation == null) { - currentWord.currentPronunciation = new StringBuilder(); - currentWord.accentToPronunciation.put("", currentWord.currentPronunciation); - } - if (currentWord.currentPronunciation.length() > 0) { - currentWord.currentPronunciation.append("; "); - } - for (int i = 1; i < positionalArgs.size(); ++i) { - if (i > 1) { - currentWord.currentPronunciation.append(","); - } - final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll(""); - currentWord.currentPronunciation.append(pron).append(""); - } - currentWord.currentPronunciation.append(" (").append(name).append(")"); - return; - } - - if (name.equals("qualifier")) { - //assertTrue(positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString()); - if (wikiBuilder == null) { - return; - } - wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); - return; - } - - if (name.equals("...")) { - // Skipping any elided text for brevity. 
- wikiBuilder.append("..."); - return; - } - - if (passThroughTemplates.contains(name)) { - assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs); - wikiBuilder.append(name); - return; - } - - if (ignoreTemplates.contains(name)) { - return; - } - - if ("Pronunciation".equals(currentHeading)) { - System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs); - return; - } - } // Pronunciation - - // Part of speech - if (insidePartOfSpeech) { - - // form of - if (name.equals("form of")) { - namedArgs.remove("sc"); - if (positionalArgs.size() < 3 || positionalArgs.size() > 4) { - System.err.println("Invalid form of."); - } - final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3); - final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1)); - currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token)); - return; - } - - // The fallback plan: append the template! - if (wikiBuilder != null) { - wikiBuilder.append("{"); - boolean first = true; - for (final String arg : positionalArgs) { - if (!first) { - wikiBuilder.append(", "); - } - first = false; - wikiBuilder.append(arg); - } - // This one isn't so useful. - for (final Map.Entry entry : namedArgs.entrySet()) { - if (!first) { - wikiBuilder.append(", "); - } - first = false; - wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue()); - } - wikiBuilder.append("}"); - } - - //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs); - return; - } // Part of speech - - - // Translations - if (name.equals("trans-top")) { - assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs + title); - - if (currentPartOfSpeech == null) { - assertTrue(currentWord != null && !currentWord.partsOfSpeech.isEmpty(), title); - System.err.println("Assuming last part of speech for non-nested translation section: " + title); - currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech); - } - - currentTranslationSense = new WikiWord.TranslationSense(); - currentPartOfSpeech.translationSenses.add(currentTranslationSense); - if (positionalArgs.size() > 1) { - currentTranslationSense.sense = positionalArgs.get(1); - } - return; - } // Translations - - if (wikiBuilder == null) { - return; - } - if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) { - assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs.toString()); - wikiBuilder.append("{"); - for (int i = 1; i < positionalArgs.size(); ++i) { - wikiBuilder.append(i > 1 ? "," : ""); - wikiBuilder.append(positionalArgs.get(i)); - } - wikiBuilder.append(name).append("}"); - - } else if (name.equals("p")) { - assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty()); - wikiBuilder.append("pl."); - - } else if (name.equals("s")) { - assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"), title); - wikiBuilder.append("sg."); - - } else if (grammarTemplates.contains(name)) { - assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append(name).append("."); - - } else if (name.equals("l")) { - // This template is designed to generate a link to a specific language-section on the target page. - wikiBuilder.append(positionalArgs.size() >= 4 ? 
positionalArgs.get(3) : positionalArgs.get(2)); - - } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) { - if (positionalArgs.size() > 2) { - wikiBuilder.append(positionalArgs.get(2)); - } - for (int i = 3; i < positionalArgs.size(); ++i) { - wikiBuilder.append(i == 3 ? " {" : ","); - wikiBuilder.append(positionalArgs.get(i)); - wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : ""); - } - final String transliteration = namedArgs.remove("tr"); - if (transliteration != null) { - wikiBuilder.append(" (").append(transliteration).append(")"); - } - - } else if (name.equals("trreq")) { - wikiBuilder.append("{{trreq}}"); - - } else if (name.equals("qualifier")) { - //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); - wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); - - } else if (useRemainingArgTemplates.contains(name)) { - for (int i = 1; i < positionalArgs.size(); ++i) { - if (i != 1) { - wikiBuilder.append(", "); - } - wikiBuilder.append(positionalArgs.get(i)); - } - } else if (ignoreTemplates.contains(name)) { - // Do nothing. - - } else if (name.equals("initialism")) { - assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append("Initialism"); - } else if (name.equals("abbreviation")) { - assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append("Abbreviation"); - } else if (name.equals("acronym")) { - assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append("Acronym"); - } else { - if (currentTranslationSense != null) { - System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs); - } - } - } - - @Override - public void onText(String text) { - if (wikiBuilder != null) { - wikiBuilder.append(text); - return; - } - } - - @Override - public void onHeadingStart(int depth) { - wikiBuilder = new StringBuilder(); - currentDepth = depth; - if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { - currentPartOfSpeech = null; - insidePartOfSpeech = false; - } - if (currentWord != null && depth <= currentWord.depth) { - currentWord = null; - } - - currentHeading = null; - } - - @Override - public void onHeadingEnd(int depth) { - final String name = wikiBuilder.toString().trim(); - wikiBuilder = null; - currentTranslationSense = null; - currentHeading = name; - - final boolean lang0 = langPatterns[0].matcher(name).matches(); - final boolean lang1 = langPatterns[1].matcher(name).matches(); - if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) { - currentWord = new WikiWord(title, depth); - if (lang0 && lang1) { - System.err.println("Word is indexed in both index1 and index2: " + title); - } - currentWord.language = name; - currentWord.index = lang0 ? 0 : (lang1 ? 
1 : -1); - words.add(currentWord); - return; - } - - if (currentWord == null) { - return; - } - - if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { - currentPartOfSpeech = null; - } - - insidePartOfSpeech = false; - if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) { - currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name); - currentWord.partsOfSpeech.add(currentPartOfSpeech); - insidePartOfSpeech = true; - return; - } - - if (name.equals("Translations")) { - if (currentWord == null || - !currentWord.language.equals("English") || - currentPartOfSpeech == null) { - System.err.println("Unexpected Translations section: " + title); - return; - } - currentTranslationSense = new WikiWord.TranslationSense(); - } - - } - - @Override - public void onListItemStart(String header, int[] section) { - wikiBuilder = new StringBuilder(); - if (currentWord != null) { - currentWord.currentPronunciation = null; - } - } - - - @Override - public void onListItemEnd(String header, int[] section) { - String item = wikiBuilder.toString().trim(); - if (item.length() == 0) { - return; - } - item = WikiParser.simpleParse(item); - wikiBuilder = null; - - // Part of speech - if (insidePartOfSpeech) { - assert currentPartOfSpeech != null : title + item; - if (header.equals("#") || - header.equals("##") || - header.equals("###") || - header.equals("####") || - header.equals(":#") || - header.equals("::") || - header.equals(":::*")) { - // Definition. - // :: should append, probably. - currentPartOfSpeech.newMeaning().meaning = item; - - // Source - } else if (header.equals("#*") || - header.equals("##*") || - header.equals("###*")) { - currentPartOfSpeech.lastMeaning().newExample().source = item; - - // Example - } else if (header.equals("#:") || - header.equals("#*:") || - header.equals("#:*") || - header.equals("##:") || - header.equals("##*:") || - header.equals("#:*:") || - header.equals("#:*#") || - header.equals("#*:") || - header.equals("*:") || - header.equals("#:::") || - header.equals("#**") || - header.equals("#*:::") || - header.equals("#:#") || - header.equals(":::") || - header.equals("##:*") || - header.equals("###*:")) { - StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item); - - // Example in English - } else if (header.equals("#::") || - header.equals("#*::") || - header.equals("#:**") || - header.equals("#*#") || - header.equals("##*::")) { - StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item); - - // Skip - } else if (header.equals("*") || - header.equals("**") || - header.equals("***") || - header.equals("*#") || - header.equals(":") || - header.equals("::*") || - header.equals("#**") || - header.equals(":*") || - header.equals("#*:*") || - header.equals("#*:**") || - header.equals("#*:#") || - header.equals("#*:*:") || - header.equals("#*:*") || - header.equals(";")) { - // might have: * {{seeCites}} - // * [[w:Arabic numerals|Arabic numerals]]: 2 - //assert item.trim().length() == 0; - System.err.println("Skipping meaning: " + header + " " + item); - } else { - if (title.equals("Yellowknife")) { - return; - } - System.err.println("Busted heading: " + title + " "+ header + " " + item); - } - return; - } - // Part of speech - - // Translation - if (currentTranslationSense != null) { - if (item.indexOf("{{[trreq]{}}}") != -1) { - return; - } - - if (currentPartOfSpeech.translationSenses.isEmpty()) { - currentPartOfSpeech.translationSenses.add(currentTranslationSense); - } 
- - final int colonPos = item.indexOf(':'); - if (colonPos == -1) { - System.err.println("Invalid translation: title=" + title + ", item=" + item); - return; - } - final String lang = item.substring(0, colonPos); - final String trans = item.substring(colonPos + 1).trim(); - for (int i = 0; i < 2; ++i) { - if (langPatterns[i].matcher(lang).find()) { - currentTranslationSense.translations.get(i).add(new Translation(lang, trans)); - } - } - } // Translation - } - - @Override - public void onNewLine() { - } - - @Override - public void onNewParagraph() { - } - - // ---------------------------------------------------------------------- - - @Override - public void onComment(String text) { - } - - @Override - public void onFormatBold(boolean boldOn) { - } - - @Override - public void onFormatItalic(boolean italicOn) { - } - - @Override - public void onUnterminated(String start, String rest) { - System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest); - } - @Override - public void onInvalidHeaderEnd(String rest) { - throw new RuntimeException(rest); - } - -} diff --git a/src/com/hughes/android/dictionary/parser/GeneralFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/GeneralFunctionCallbacks.java deleted file mode 100644 index 626a17a..0000000 --- a/src/com/hughes/android/dictionary/parser/GeneralFunctionCallbacks.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.hughes.android.dictionary.parser; - -public class GeneralFunctionCallbacks { - - - -} diff --git a/src/com/hughes/android/dictionary/parser/ItWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/ItWiktionaryParser.java deleted file mode 100644 index be5c94c..0000000 --- a/src/com/hughes/android/dictionary/parser/ItWiktionaryParser.java +++ /dev/null @@ -1,5 +0,0 @@ -package com.hughes.android.dictionary.parser; - -public class ItWiktionaryParser { - -} diff --git a/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java b/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java deleted file mode 100644 index 5ae62d4..0000000 --- a/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.hughes.android.dictionary.parser; - -import java.util.List; -import java.util.Map; - -public interface WikiFunctionCallback { - - void onWikiFunction(final String name, final List args, final Map namedArgs); - -} diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 4243501..6c81749 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -23,7 +23,7 @@ import java.util.regex.Pattern; public final class WikiTokenizer { - static interface Callback { + public static interface Callback { void onPlainText(WikiTokenizer wikiTokenizer); void onMarkup(WikiTokenizer wikiTokenizer); void onWikiLink(WikiTokenizer wikiTokenizer); diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizerCallback.java b/src/com/hughes/android/dictionary/parser/WikiTokenizerCallback.java deleted file mode 100644 index 926374a..0000000 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizerCallback.java +++ /dev/null @@ -1,5 +0,0 @@ -package com.hughes.android.dictionary.parser; - -public interface WikiTokenizerCallback { - -} diff --git a/src/com/hughes/android/dictionary/parser/WikiWord.java.old b/src/com/hughes/android/dictionary/parser/WikiWord.java.old deleted file mode 100644 index 
b4ed2d1..0000000 --- a/src/com/hughes/android/dictionary/parser/WikiWord.java.old +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.hughes.android.dictionary.parser; - -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -import com.hughes.android.dictionary.engine.DictionaryBuilder; -import com.hughes.android.dictionary.engine.IndexedEntry; -import com.hughes.android.dictionary.engine.EntryTypeName; -import com.hughes.android.dictionary.engine.IndexBuilder; -import com.hughes.android.dictionary.engine.PairEntry; -import com.hughes.android.dictionary.engine.PairEntry.Pair; -import com.hughes.util.ListUtil; - -public class WikiWord { - final int depth; - - final String title; - String language; - - int index; - - final Map accentToPronunciation = new LinkedHashMap(); - StringBuilder currentPronunciation = null; - - final List partsOfSpeech = new ArrayList(); - - public WikiWord(final String title, int depth) { - this.title = title.intern(); - this.depth = depth; - } - - static class PartOfSpeech { - final int depth; - final String name; - - final List meanings = new ArrayList(); - - final List translationSenses = new ArrayList(); - - final List formOfs = new ArrayList(); - - public PartOfSpeech(final int depth, String name) { - this.depth = depth; - this.name = name.intern(); - } - - public Meaning newMeaning() { - final Meaning meaning = new Meaning(); - meanings.add(meaning); - return meaning; - } - - public Meaning lastMeaning() { - return meanings.isEmpty() ? newMeaning() : ListUtil.getLast(meanings); - } - } - - static class TranslationSense { - String sense; - List> translations = new ArrayList>(); - { - translations.add(new ArrayList()); - translations.add(new ArrayList()); - } - } - - static class Translation { - String language; - String text; - - public Translation(final String language, final String text) { - this.language = language; - this.text = text; - } - - @Override - public String toString() { - return language + ": " + text; - } - } - - static class FormOf { - final String grammarForm; - final String target; - - public FormOf(final String grammarForm, final String token) { - this.grammarForm = grammarForm; - this.target = token; - } - } - - static class Meaning { - String meaning; - final List examples = new ArrayList(); - - public Example newExample() { - final Example example = new Example(); - this.examples.add(example); - return example; - } - - public Example lastExample() { - return examples.isEmpty() ? 
newExample() : ListUtil.getLast(examples); - } - } - - static class Example { - String source; - final StringBuilder example = new StringBuilder(); - final StringBuilder exampleInEnglish = new StringBuilder(); - } - - // ------------------------------------------------------------------------- - - void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) { - //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation); - if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) { - System.err.println("Word with no POS: " + title); - } - for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) { - partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech); - } // PartOfSpeech - - // Pronunciation. - if (index != -1) { - final PairEntry pronEntry = new PairEntry(); - for (final Map.Entry accentToPron : accentToPronunciation.entrySet()) { - String accent = accentToPron.getKey(); - if (accent.length() > 0) { - accent = accent + ": "; - } - pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0)); - } - if (pronEntry.pairs.size() > 0) { - final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pronEntry); - dictBuilder.dictionary.pairEntries.add(pronEntry); - final Set tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR); - dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION); - } - } - } - - - static final Pattern templateName = Pattern.compile("\\{[^,]*,"); - private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder, - final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) { - //System.out.println(" pos: " + partOfSpeech.name); - - for (final WikiWord.Meaning meaning : partOfSpeech.meanings) { - //System.out.println(" meaning: " + meaning.meaning); - for (final WikiWord.Example example : meaning.examples) { - if (example.example.length() > 0) { - //System.out.println(" example: " + example.example); - } - if (example.exampleInEnglish.length() > 0) { - //System.out.println(" exampleInEnglish: " + example.exampleInEnglish); - } - } - } - - if (index != -1) { - final boolean formOfSwap = index != 0; - for (final FormOf formOf : partOfSpeech.formOfs) { - final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap); - final PairEntry pairEntry = new PairEntry(); - pairEntry.pairs.add(pair); - final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry); - dictBuilder.dictionary.pairEntries.add(pairEntry); - - // File under title token. - final Set tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR); - dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF); - } - } - - - if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) { - final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")"; - final boolean swap = enIndexBuilder == 1; - - // Meanings. 
- for (final Meaning meaning : partOfSpeech.meanings) { - final PairEntry pairEntry = new PairEntry(); - final List pairs = pairEntry.pairs; - - final List> exampleTokens = new ArrayList>(); - exampleTokens.add(new LinkedHashSet()); - exampleTokens.add(new LinkedHashSet()); - - if (meaning.meaning != null && meaning.meaning.length() > 0) { - final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap); - pairs.add(meaningPair); - } else { - System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name); - } - - // Examples - for (final Example example : meaning.examples) { - final int dashIndex = example.example.indexOf("—"); - if (example.exampleInEnglish.length() == 0 && dashIndex != -1) { - System.out.println("Splitting example: title=" + title + ", "+ example.example); - example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim()); - example.example.delete(dashIndex, example.example.length()); - } - - if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) { - final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap); - pairs.add(pair); - - for (int i = 0; i < 2; ++i) { - exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR)); - } - } - } - - // Create EntryData with the PairEntry. - final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry); - dictBuilder.dictionary.pairEntries.add(pairEntry); - - // File under title token. - final Set titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR); - dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD); - - // File under the meaning tokens (English): - if (meaning.meaning != null) { - // If the meaning contains any templates, strip out the template name - // so we don't index it. - final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll(""); - final Set meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR); - dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD); - } - - // File under other tokens that we saw. - for (int i = 0; i < 2; ++i) { - dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS); - } - - - } // Meanings. - - } - - translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech); - } - - - private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder, - final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) { - if (!partOfSpeech.translationSenses.isEmpty()) { - if (!language.equals("English")) { - System.err.println("Translation sections not in English."); - } - - final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)"; - - for (final TranslationSense translationSense : partOfSpeech.translationSenses) { - //System.out.println(" sense: " + translationSense.sense); - if (translationSense.sense == null) { - //System.err.println(" null sense: " + title); - } - String englishSense = String.format(englishBase, translationSense.sense != null ? 
(": " + translationSense.sense) : ""); - - final StringBuilder[] sideBuilders = new StringBuilder[2]; - final List>> sideTokens = new ArrayList>>(); - for (int i = 0; i < 2; ++i) { - sideBuilders[i] = new StringBuilder(); - sideTokens.add(new LinkedHashMap>()); - } - - if (enIndexBuilder != -1) { - sideBuilders[enIndexBuilder].append(englishSense); - addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD); - } - - // Get the entries from the translation section. - for (int i = 0; i < 2; ++i) { - //System.out.println(" lang: " + i); - for (final Translation translation : translationSense.translations.get(i)) { - //System.out.println(" translation: " + translation); - sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : ""); - if (translationSense.translations.get(i).size() > 1) { - sideBuilders[i].append(translation.language).append(": "); - } - sideBuilders[i].append(translation.text); - - // TODO: Don't index {m}, {f} - // TODO: Don't even show: (1), (1-2), etc. - addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD); - } - } - - // Construct the Translations-based QuickDic entry for this TranslationSense. - if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) { - final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString()); - final PairEntry pairEntry = new PairEntry(); - pairEntry.pairs.add(pair); - final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry); - dictBuilder.dictionary.pairEntries.add(pairEntry); - - // Add the EntryData to the indices under the correct tokens. - for (int i = 0; i < 2; ++i) { - final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i); - for (final Map.Entry> entry : sideTokens.get(i).entrySet()) { - for (final String token : entry.getValue()) { - final List entries = indexBuilder.getOrCreateEntries(token, entry.getKey()); - entries.add(entryData); - } - } - - } - - } - } // Senses - } // Translations - } - - - static void addTokens(final String text, final Map> map, - EntryTypeName entryTypeName) { - final Set tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR); - if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) { - entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD; - } - List tokenList = map.get(entryTypeName); - if (tokenList == null) { - tokenList = new ArrayList(); - map.put(entryTypeName, tokenList); - } - tokenList.addAll(tokens); - } - - - -} diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/AppendAndIndexWikiCallback.java b/src/com/hughes/android/dictionary/parser/enwiktionary/AppendAndIndexWikiCallback.java new file mode 100644 index 0000000..2a5e7c6 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/AppendAndIndexWikiCallback.java @@ -0,0 +1,105 @@ +package com.hughes.android.dictionary.parser.enwiktionary; + +import java.util.List; +import java.util.Map; + +import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.parser.WikiTokenizer; + +final class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { + + final EnWiktionaryXmlParser parser; + final StringBuilder builder; + final IndexedEntry indexedEntry; + IndexBuilder defaultIndexBuilder; + final Map functionCallbacks; + + // TODO: the classes of text are wrong.... 
+ + public AppendAndIndexWikiCallback( + final EnWiktionaryXmlParser parser, + final String title, + final StringBuilder builder, + final IndexedEntry indexedEntry, + final IndexBuilder defaultIndexBuilder, + final Map functionCallbacks) { + this.parser = parser; + this.indexedEntry = indexedEntry; + this.defaultIndexBuilder = defaultIndexBuilder; + this.builder = builder; + this.functionCallbacks = functionCallbacks; + } + + @Override + public void onPlainText(WikiTokenizer wikiTokenizer) { + // The only non-recursive callback. Just appends to the builder, and indexes. + final String plainText = wikiTokenizer.token(); + builder.append(plainText); + defaultIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + } + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + + final String linkDest = wikiTokenizer.wikiLinkDest(); + if (linkDest != null) { + System.out.println("linkDest: " + linkDest); + // TODO: Check for English before appending. + // TODO: Could also index under link dest, too. + } + // TODO: save, set, restore text type... + new WikiTokenizer(wikiText, false).dispatch(this); + } + + @Override + public void onFunction( + final String name, + final List args, + final Map namedArgs) { + + final FunctionCallback functionCallback = functionCallbacks.get(name); + if (functionCallback == null || !functionCallback.onWikiFunction(name, args, namedArgs, parser, title)) { + // Default function handling: + builder.append("{{").append(name); + for (int i = 0; i < args.size(); ++i) { + builder.append("|"); + new WikiTokenizer(args.get(i), false).dispatch(this); + } + for (final Map.Entry entry : namedArgs.entrySet()) { + builder.append("|"); + new WikiTokenizer(entry.getKey(), false).dispatch(this); + builder.append("="); + new WikiTokenizer(entry.getValue(), false).dispatch(this); + } + builder.append("}}"); + } + } + + @Override + public void onMarkup(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public void onComment(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public void onNewline(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onHeading(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + assert false; + } +} \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java similarity index 89% rename from src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java rename to src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java index b9dbc7d..cc93049 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package com.hughes.android.dictionary.parser; +package com.hughes.android.dictionary.parser.enwiktionary; import java.io.BufferedInputStream; import java.io.DataInputStream; @@ -35,6 +35,7 @@ import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; +import com.hughes.android.dictionary.parser.WikiTokenizer; public class EnWiktionaryXmlParser { @@ -156,6 +157,11 @@ public class EnWiktionaryXmlParser { } else if (headerName.equals("Pronunciation")) { //doPronunciation(wikiLineReader); } + } else if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + if (name.equals("head")) { + LOG.warning("{{head}} without POS: " + title); + } } } } @@ -274,99 +280,6 @@ public class EnWiktionaryXmlParser { } } - private static T get(final List list, final int index, final T defaultValue) { - return index < list.size() ? list.get(index) : defaultValue; - } - - private static T get(final List list, final int index) { - return get(list, index, null); - } - - private static T remove(final List list, final int index, final T defaultValue) { - return index < list.size() ? list.remove(index) : defaultValue; - } - - - static final class AppendAndIndexCallback implements WikiTokenizer.Callback { - public AppendAndIndexCallback( - final StringBuilder builder, - final IndexedEntry indexedEntry, - final IndexBuilder defaultIndexBuilder, - final Map functionCallbacks) { - this.indexedEntry = indexedEntry; - this.defaultIndexBuilder = defaultIndexBuilder; - this.builder = builder; - this.functionCallbacks = functionCallbacks; - } - - final StringBuilder builder; - final IndexedEntry indexedEntry; - IndexBuilder defaultIndexBuilder; - final Map functionCallbacks; - - // TODO: the classes of text are wrong.... - - @Override - public void onPlainText(WikiTokenizer wikiTokenizer) { - // The only non-recursive callback. Just appends to the builder, and - final String plainText = wikiTokenizer.token(); - builder.append(plainText); - defaultIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - } - - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - final String plainText = wikiTokenizer.wikiLinkText(); - builder.append(plainText); - // TODO: should check for English before appending. - defaultIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT); - } - - @Override - public void onFunction(final String name, - final List args, final Map namedArgs) { - final WikiFunctionCallback functionCallback = functionCallbacks.get(name); - if (functionCallback != null) { - // Dispatch the handling elsewhere. - functionCallback.onWikiFunction(name, args, namedArgs); - } else { - // Default function handling: - for (int i = 0; i < args.size(); ++i) { - args.set(i, WikiTokenizer.toPlainText(args.get(i))); - } - for (final Map.Entry entry : namedArgs.entrySet()) { - entry.setValue(WikiTokenizer.toPlainText(entry.getValue())); - } - WikiTokenizer.appendFunction(builder, name, args, namedArgs); - } - } - - @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - // Do nothing. - } - - @Override - public void onComment(WikiTokenizer wikiTokenizer) { - // Do nothing. 
- } - - @Override - public void onNewline(WikiTokenizer wikiTokenizer) { - assert false; - } - - @Override - public void onHeading(WikiTokenizer wikiTokenizer) { - assert false; - } - - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - assert false; - } - } - private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(); @@ -393,37 +306,7 @@ public class EnWiktionaryXmlParser { final Map namedArgs = wikiTokenizer.functionNamedArgs(); if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø") || functionName.equals("apdx-t")) { - if (args.size() < 2) { - LOG.warning("{{t}} with too few args: " + line + ", title=" + title); - continue; - } - final String langCode = get(args, 0); - final String word = get(args, 1); - final String gender = get(args, 2); - final String transliteration = namedArgs.get("tr"); - if (foreignText.length() > 0) { - foreignText.append(""); - } - foreignText.append(word); - foreignIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); - if (gender != null) { - foreignText.append(String.format(" {%s}", gender)); - } - if (transliteration != null) { - foreignText.append(String.format(TRANSLITERATION_FORMAT, transliteration)); - foreignIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); - } } else if (functionName.equals("qualifier")) { - if (args.size() == 0) { - foreignText.append(wikiTokenizer.token()); - } else { - String qualifier = args.get(0); - if (!namedArgs.isEmpty() || args.size() > 1) { - LOG.warning("weird qualifier: " + line); - } - // Unindexed! - foreignText.append("(").append(qualifier).append(")"); - } } else if (encodings.contains(functionName)) { foreignText.append("").append(args.get(0)); foreignIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); @@ -465,7 +348,9 @@ public class EnWiktionaryXmlParser { } else { LOG.warning("Bad translation token: " + wikiTokenizer.token()); } - } + } // while-token loop. 
+ + if (foreignText.length() == 0) { LOG.warning("Empty foreignText: " + line); return; diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallback.java b/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallback.java new file mode 100644 index 0000000..e9c9fb4 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/FunctionCallback.java @@ -0,0 +1,80 @@ +package com.hughes.android.dictionary.parser.enwiktionary; + +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; + +import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.util.ListUtil; + +public interface FunctionCallback { + + static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); + + boolean onWikiFunction( + final String name, + final List args, + final Map namedArgs, + final EnWiktionaryXmlParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback, + final String title); + + static final class TranslationCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final String name, final List args, + final Map namedArgs, final EnWiktionaryXmlParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback, + final String title) { + + final String transliteration = namedArgs.remove("tr"); + + if (args.size() < 2 || args.size() > 3 || namedArgs.isEmpty()) { + LOG.warning("{{t}} with too few args: " + ", title=" + title); + return false; + } + final String langCode = ListUtil.get(args, 0); + final String word = ListUtil.get(args, 1); + final String gender = ListUtil.get(args, 2); + +// TODO appendAndIndexWikiCallback we're inside translation.... + //EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI + new WikiTokenizer(word, false).dispatch(appendAndIndexWikiCallback); + + if (gender != null) { + appendAndIndexWikiCallback.builder.append(String.format(" {%s}", gender)); + } + if (transliteration != null) { + // TODO appendAndIndexWikiCallback we're inside translation.... + // EntryTypeName.WIKTIONARY_TRANSLITERATION + appendAndIndexWikiCallback.builder.append("(tr. "); + new WikiTokenizer(transliteration).dispatch(appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append(")"); + } + return true; + } + + } + + // ------------------------------------------------------------------ + + static final class QualifierCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final String name, final List args, + final Map namedArgs, + final EnWiktionaryXmlParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback, + final String title) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + LOG.warning("weird qualifier: "); + return false; + } + String qualifier = args.get(0); + // Unindexed! + appendAndIndexWikiCallback.builder.append("("); + new WikiTokenizer(qualifier, false).dispatch(appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append(")"); + return true; + } + } + +} -- 2.43.0
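The new enwiktionary package splits template handling out of the parser: AppendAndIndexWikiCallback walks WikiTokenizer tokens, appends rendered text to a StringBuilder while indexing it, and delegates each {{template}} to a FunctionCallback looked up by name. Since the commit is explicitly work in progress, its call sites do not all match the declared FunctionCallback interface yet; the sketch below is a hypothetical illustration of the intended wiring based only on the constructors and classes introduced in this patch. The CallbackWiringSketch class, the renderTranslationLine name, and the idea of passing in an existing IndexedEntry/IndexBuilder are illustrative assumptions, not part of the patch, and the helper is placed in the same package because the new callback classes are package-private.

    package com.hughes.android.dictionary.parser.enwiktionary;

    import java.util.HashMap;
    import java.util.Map;

    import com.hughes.android.dictionary.engine.IndexBuilder;
    import com.hughes.android.dictionary.engine.IndexedEntry;
    import com.hughes.android.dictionary.parser.WikiTokenizer;

    // Hypothetical wiring example; not part of the patch.
    final class CallbackWiringSketch {

      // Renders one line of translation wikitext into a StringBuilder, indexing
      // tokens into the given IndexBuilder as it goes. The IndexedEntry and
      // IndexBuilder are assumed to come from an existing DictionaryBuilder,
      // as elsewhere in the project.
      static void renderTranslationLine(
          final EnWiktionaryXmlParser parser,
          final String title,
          final String translationLineWikiText,
          final IndexedEntry indexedEntry,
          final IndexBuilder defaultIndexBuilder) {

        // Map template names to handlers; anything not in the map falls through
        // to the default "{{name|...}}" rendering in
        // AppendAndIndexWikiCallback.onFunction().
        final Map<String, FunctionCallback> functionCallbacks =
            new HashMap<String, FunctionCallback>();
        final FunctionCallback translationCallback =
            new FunctionCallback.TranslationCallback();
        for (final String t : new String[] {"t", "t+", "t-", "tø", "apdx-t"}) {
          functionCallbacks.put(t, translationCallback);
        }
        functionCallbacks.put("qualifier", new FunctionCallback.QualifierCallback());

        final StringBuilder builder = new StringBuilder();
        final AppendAndIndexWikiCallback callback = new AppendAndIndexWikiCallback(
            parser, title, builder, indexedEntry, defaultIndexBuilder,
            functionCallbacks);

        // dispatch() walks the tokens and invokes the callback for plain text,
        // wiki links, and {{template}} functions; the callback re-tokenizes link
        // text and template arguments, so rendering is effectively recursive.
        new WikiTokenizer(translationLineWikiText, false).dispatch(callback);

        System.out.println(title + ": " + builder);
      }
    }

Keeping the handlers in a name-keyed map presumably lets different page sections (translations, qualifiers, pronunciation templates, and so on) install different handler sets while sharing the same append-and-index rendering path, which is what the old monolithic onTemplate switch in EnWiktionaryXmlParser.java.old was doing by hand.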