X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fenwiktionary%2FEnWiktionaryXmlParser.java;h=7ccbf9b6d55eda39997a75701de71f21df37c0d6;hb=52887b59a691a06638ce7ecf75064dc34c55701b;hp=9ec660186ad1635c3ecd575cdeacdf076d5e02a6;hpb=90ec4974a7834567b5721528e87a2ce857cce53b;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java index 9ec6601..7ccbf9b 100644 --- a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java @@ -36,12 +36,9 @@ import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; import com.hughes.android.dictionary.parser.WikiTokenizer; -import com.hughes.util.ListUtil; public class EnWiktionaryXmlParser { - private static final String TRANSLITERATION_FORMAT = " (tr. %s)"; - static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); // TODO: process {{ttbc}} lines @@ -51,6 +48,7 @@ public class EnWiktionaryXmlParser { "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + + "\\{\\{abbreviation\\}\\}|" + // These are @deprecated: "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + @@ -162,7 +160,7 @@ public class EnWiktionaryXmlParser { } else if (headerName.equals("Translations")) { if (pos == null) { - LOG.warning("Translations without POS: " + title); + LOG.info("Translations without POS (but using anyway): " + title); } doTranslations(wikiTokenizer, pos); } else if (headerName.equals("Pronunciation")) { @@ -170,7 +168,7 @@ public class EnWiktionaryXmlParser { } } else if (wikiTokenizer.isFunction()) { final String name = wikiTokenizer.functionName(); - if (name.equals("head")) { + if (name.equals("head") && pos == null) { LOG.warning("{{head}} without POS: " + title); } } @@ -178,6 +176,9 @@ public class EnWiktionaryXmlParser { } final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexWikiCallback(this); + { + appendAndIndexWikiCallback.functionCallbacks.putAll(FunctionCallbacksDefault.DEFAULT); + } private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) { if (title.equals("absolutely")) { @@ -294,8 +295,6 @@ public class EnWiktionaryXmlParser { final StringBuilder foreignText = new StringBuilder(); appendAndIndexWikiCallback.reset(foreignText, indexedEntry); - appendAndIndexWikiCallback.functionCallbacks.clear(); - appendAndIndexWikiCallback.functionCallbacks.putAll(FunctionCallbacksDefault.DEFAULT); appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); if (foreignText.length() == 0) { @@ -324,10 +323,6 @@ public class EnWiktionaryXmlParser { if (!pairsAdded.add(pair.toString())) { LOG.warning("Duplicate pair: " + pair.toString()); } - if (pair.toString().equals("libero {m} :: free (adjective)")) { - System.out.println(); - } - } @@ -371,19 +366,25 @@ public class EnWiktionaryXmlParser { int foreignCount = 0; + final Collection wordForms = new ArrayList(); + boolean titleAppended = false; + private void doForeignPartOfSpeech(final String lang, String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { if (++foreignCount % 1000 == 0) { LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); } - if (title.equals("moro")) { + if (title.equals("6")) { System.out.println(); } - boolean titleAppended = false; final StringBuilder foreignBuilder = new StringBuilder(); - final Collection wordForms = new ArrayList(); final List listSections = new ArrayList(); + appendAndIndexWikiCallback.reset(foreignBuilder, null); + this.state = State.ENGLISH_DEF_OF_FOREIGN; // TODO: this is wrong, need new category.... + titleAppended = false; + wordForms.clear(); + try { ListSection lastListSection = null; @@ -421,112 +422,9 @@ public class EnWiktionaryXmlParser { // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) // for the conjugation table from "fa". // Would like to be able to link to a lang#token. - if (FunctionCallbacksDefault.DEFAULT.get(name) instanceof FunctionCallbacksDefault.Gender) { - // TODO: Fix hack! - appendAndIndexWikiCallback.reset(foreignBuilder, null); - FunctionCallbacksDefault.DEFAULT.get(name).onWikiFunction(wikiTokenizer, name, args, namedArgs, this, appendAndIndexWikiCallback); - } else if (name.equals("wikipedia")) { - namedArgs.remove("lang"); - if (args.size() > 1 || !namedArgs.isEmpty()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.token()); - } else if (args.size() == 1) { - foreignBuilder.append(wikiTokenizer.token()); - } else { - //foreignBuilder.append(title); - } - } else if (name.equals("attention") || name.equals("zh-attention")) { - // See: http://en.wiktionary.org/wiki/Template:attention - // Ignore these. - } else if (name.equals("infl") || name.equals("head")) { - // See: http://en.wiktionary.org/wiki/Template:infl - final String langCode = ListUtil.get(args, 0); - String head = namedArgs.remove("head"); - if (head == null) { - head = namedArgs.remove("title"); // Bug - } - if (head == null) { - head = title; - } else { - head = WikiTokenizer.toPlainText(head); - } - titleAppended = true; - - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - - final String tr = namedArgs.remove("tr"); - String g = namedArgs.remove("g"); - if (g == null) { - g = namedArgs.remove("gender"); - } - final String g2 = namedArgs.remove("g2"); - final String g3 = namedArgs.remove("g3"); - - foreignBuilder.append(head); - - if (g != null) { - foreignBuilder.append(" {").append(g); - if (g2 != null) { - foreignBuilder.append("|").append(g2); - } - if (g3 != null) { - foreignBuilder.append("|").append(g3); - } - foreignBuilder.append("}"); - } - - if (tr != null) { - foreignBuilder.append(String.format(TRANSLITERATION_FORMAT, tr)); - wordForms.add(tr); - } - - final String pos = ListUtil.get(args, 1); - if (pos != null) { - foreignBuilder.append(" (").append(pos).append(")"); - } - for (int i = 2; i < args.size(); i += 2) { - final String inflName = ListUtil.get(args, i); - final String inflValue = ListUtil.get(args, i + 1); - foreignBuilder.append(", ").append(WikiTokenizer.toPlainText(inflName)); - if (inflValue != null && inflValue.length() > 0) { - foreignBuilder.append(": ").append(WikiTokenizer.toPlainText(inflValue)); - wordForms.add(inflValue); - } - } - for (final String key : namedArgs.keySet()) { - final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); - foreignBuilder.append(" ").append(key).append("=").append(value); - wordForms.add(value); - } - } else if (name.equals("it-noun")) { - titleAppended = true; - final String base = ListUtil.get(args, 0); - final String gender = ListUtil.get(args, 1); - final String singular = base + ListUtil.get(args, 2, null); - final String plural = base + ListUtil.get(args, 3, null); - foreignBuilder.append(String.format(" %s {%s}, %s {pl}", singular, gender, plural, plural)); - wordForms.add(singular); - wordForms.add(plural); - if (!namedArgs.isEmpty() || args.size() > 4) { - LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); - } - } else if (name.equals("it-proper noun")) { - foreignBuilder.append(wikiTokenizer.token()); - } else if (name.equals("it-adj")) { - foreignBuilder.append(wikiTokenizer.token()); - } else if (name.startsWith("it-conj")) { - if (name.equals("it-conj-are")) { - itConjAre(args, namedArgs); - } else if (name.equals("it-conj-ere")) { - } else if (name.equals("it-conj-ire")) { - } else { - LOG.warning("Unknown conjugation: " + wikiTokenizer.token()); - } - } else { - // Unindexed! - foreignBuilder.append(wikiTokenizer.token()); - // LOG.warning("Unknown function: " + wikiTokenizer.token()); - } + + appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs); + } else if (wikiTokenizer.isListItem()) { final String prefix = wikiTokenizer.listItemPrefix(); if (lastListSection != null && @@ -566,7 +464,7 @@ public class EnWiktionaryXmlParser { foreign = String.format("(%s) %s", lang, foreign); } for (final ListSection listSection : listSections) { - doForeignListItem(foreign, title, wordForms, listSection); + doForeignListSection(foreign, title, wordForms, listSection); } } } @@ -579,9 +477,12 @@ public class EnWiktionaryXmlParser { "sc", "sort", "cat", - "xs")); + "xs", + "nodot")); + + public boolean entryIsFormOfSomething = false; - private void doForeignListItem(final String foreignText, String title, final Collection forms, final ListSection listSection) { + private void doForeignListSection(final String foreignText, String title, final Collection forms, final ListSection listSection) { state = State.ENGLISH_DEF_OF_FOREIGN; final String prefix = listSection.firstPrefix; if (prefix.length() > 1) { @@ -592,19 +493,18 @@ public class EnWiktionaryXmlParser { final PairEntry pairEntry = new PairEntry(); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - + + entryIsFormOfSomething = false; final StringBuilder englishBuilder = new StringBuilder(); final String mainLine = listSection.firstLine; - appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry); - appendAndIndexWikiCallback.functionCallbacks.putAll(FunctionCallbacksDefault.DEFAULT); appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); final String english = trim(englishBuilder.toString()); if (english.length() > 0) { final Pair pair = new Pair(english, trim(foreignText), this.swap); pairEntry.pairs.add(pair); - foreignIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); + foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI); for (final String form : forms) { foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); } @@ -615,6 +515,8 @@ public class EnWiktionaryXmlParser { for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { final String nextPrefix = listSection.nextPrefixes.get(i); final String nextLine = listSection.nextLines.get(i); + + // TODO: This splitting is not sensitive to wiki code. int dash = nextLine.indexOf("—"); int mdashLen = 7; if (dash == -1) { @@ -674,32 +576,19 @@ public class EnWiktionaryXmlParser { } private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) { - final WikiTokenizer wikiTokenizer = new WikiTokenizer(example, false); + // TODO: +// if (wikiTokenizer.token().equals("'''")) { +// insideTripleQuotes = !insideTripleQuotes; +// } final StringBuilder builder = new StringBuilder(); - boolean insideTripleQuotes = false; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isPlainText()) { - builder.append(wikiTokenizer.token()); - if (indexBuilder != null) { - indexBuilder.addEntryWithStringNoSingle(indexedEntry, wikiTokenizer.token(), EntryTypeName.WIKTIONARY_EXAMPLE); - } - } else if (wikiTokenizer.isWikiLink()) { - final String text = wikiTokenizer.wikiLinkText().replaceAll("'", ""); - builder.append(text); - if (indexBuilder != null) { - indexBuilder.addEntryWithStringNoSingle(indexedEntry, text, EntryTypeName.WIKTIONARY_EXAMPLE); - } - } else if (wikiTokenizer.isFunction()) { - builder.append(wikiTokenizer.token()); - } else if (wikiTokenizer.isMarkup()) { - if (wikiTokenizer.token().equals("'''")) { - insideTripleQuotes = !insideTripleQuotes; - } - } else if (wikiTokenizer.isComment() || wikiTokenizer.isNewline()) { - // Do nothing. - } else { - LOG.warning("unexpected token: " + wikiTokenizer.token()); - } + appendAndIndexWikiCallback.reset(builder, indexedEntry); + appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE; + appendAndIndexWikiCallback.entryTypeNameSticks = true; + try { + // TODO: this is a hack needed because we don't safely split on the dash. + appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); + } catch (AssertionError e) { + return "--"; } final String result = trim(builder.toString()); return result.length() > 0 ? result : "--";