X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FEnWiktionaryXmlParser.java;h=01af90c9da4d821d5b2369af11a805d54660ed4e;hb=7819736ae570bf597936f0dc640f60644da15fc8;hp=6a6f43856a2bb34aa3e720d9ae839bc80401f6b3;hpb=a7ae2524281869de5aa756ae35524b21bab3e08a;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 6a6f438..01af90c 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -38,6 +38,8 @@ import com.hughes.android.dictionary.engine.PairEntry.Pair; public class EnWiktionaryXmlParser { + private static final String TRANSLITERATION_FORMAT = " (tr. %s)"; + static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); // TODO: process {{ttbc}} lines @@ -46,7 +48,7 @@ public class EnWiktionaryXmlParser { "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|" + + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + // These are @deprecated: "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + @@ -117,8 +119,8 @@ public class EnWiktionaryXmlParser { heading = heading.replaceAll("=", "").trim(); if (heading.equals("English")) { doEnglishWord(title, text); - } else if (langPattern.matcher(heading).matches()){ - doForeignWord(title, text); + } else if (langPattern.matcher(heading).find()){ + doForeignWord(heading, title, text); } } // endPage() @@ -144,6 +146,8 @@ public class EnWiktionaryXmlParser { if (partOfSpeechHeader.matcher(headerName).matches()) { posDepth = wikiTokenizer.headingDepth(); pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + } else if (headerName.equals("Translations")) { if (pos == null) { LOG.warning("Translations without POS: " + title); @@ -165,9 +169,10 @@ public class EnWiktionaryXmlParser { private void doTranslations(final String title, final WikiTokenizer wikiTokenizer, final String pos) { if (title.equals("absolutely")) { - System.out.println(); + //System.out.println(); } + String topLevelLang = null; String sense = null; boolean done = false; while (wikiTokenizer.nextToken() != null) { @@ -181,8 +186,6 @@ public class EnWiktionaryXmlParser { // Check whether we care about this line: - //line = WikiLineReader.removeSquareBrackets(line); - if (wikiTokenizer.isFunction()) { final String functionName = wikiTokenizer.functionName(); final List positionArgs = wikiTokenizer.functionPositionArgs(); @@ -202,6 +205,7 @@ public class EnWiktionaryXmlParser { // TODO: would also be nice... } else if (functionName.startsWith("picdic")) { } else if (functionName.startsWith("checktrans")) { + done = true; } else if (functionName.startsWith("ttbc")) { wikiTokenizer.nextLine(); // TODO: would be great to handle ttbc @@ -213,6 +217,10 @@ public class EnWiktionaryXmlParser { final String line = wikiTokenizer.listItemWikiText(); // This line could produce an output... + if (line.contains("ich hoan dich gear")) { + //System.out.println(); + } + // First strip the language and check whether it matches. // And hold onto it for sub-lines. final int colonIndex = line.indexOf(":"); @@ -220,16 +228,28 @@ public class EnWiktionaryXmlParser { continue; } - final String lang = line.substring(0, colonIndex); - if (!this.langPattern.matcher(lang).find()) { + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); } String rest = line.substring(colonIndex + 1).trim(); if (rest.length() > 0) { - doTranslationLine(line, title, pos, sense, rest); - } else { - // TODO: do lines that are like "Greek:" + doTranslationLine(line, appendLang ? lang : null, title, pos, sense, rest); } } else if (wikiTokenizer.remainderStartsWith("''See''")) { @@ -254,11 +274,19 @@ public class EnWiktionaryXmlParser { } } + private static T get(final List list, final int index, final T defaultValue) { + return index < list.size() ? list.get(index) : defaultValue; + } + private static T get(final List list, final int index) { - return index < list.size() ? list.get(index) : null; + return get(list, index, null); } - - private void doTranslationLine(final String line, final String title, final String pos, final String sense, final String rest) { + + private static T remove(final List list, final int index, final T defaultValue) { + return index < list.size() ? list.remove(index) : defaultValue; + } + + private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); @@ -301,7 +329,7 @@ public class EnWiktionaryXmlParser { otherText.append(String.format(" {%s}", gender)); } if (transliteration != null) { - otherText.append(String.format(" (tr. %s)", transliteration)); + otherText.append(String.format(TRANSLITERATION_FORMAT, transliteration)); otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); } //} @@ -347,7 +375,8 @@ public class EnWiktionaryXmlParser { // American sign language } else { // Unindexed! - otherText.append(wikiTokenizer.token()); + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + WikiTokenizer.appendFunction(otherText.append("{{"), functionName, args, namedArgs).append("}}"); } } else if (wikiTokenizer.isNewline()) { @@ -363,6 +392,10 @@ public class EnWiktionaryXmlParser { return; } + if (lang != null) { + otherText.insert(0, String.format("(%s) ", lang)); + } + StringBuilder englishText = new StringBuilder(); englishText.append(title); @@ -406,7 +439,7 @@ public class EnWiktionaryXmlParser { // ------------------------------------------------------------------------- - private void doForeignWord(final String title, final String text) { + private void doForeignWord(final String lang, final String title, final String text) { final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); while (wikiTokenizer.nextToken() != null) { if (wikiTokenizer.isHeading()) { @@ -416,7 +449,7 @@ public class EnWiktionaryXmlParser { } else if (headingName.equals("Pronunciation")) { //doPronunciation(wikiLineReader); } else if (partOfSpeechHeader.matcher(headingName).matches()) { - doForeignPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + doForeignPartOfSpeech(lang, title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); } } else { } @@ -442,14 +475,15 @@ public class EnWiktionaryXmlParser { int foreignCount = 0; - private void doForeignPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { + private void doForeignPartOfSpeech(final String lang, String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { if (++foreignCount % 1000 == 0) { - LOG.info("***" + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); + LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); } if (title.equals("moro")) { System.out.println(); } + boolean titleAppended = false; final StringBuilder foreignBuilder = new StringBuilder(); final Collection wordForms = new ArrayList(); final List listSections = new ArrayList(); @@ -491,19 +525,83 @@ public class EnWiktionaryXmlParser { // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) // for the conjugation table from "fa". // Would like to be able to link to a lang#token. - if (isGender(name)) { - appendGender(foreignBuilder, name, args); - } else if (name.equals("wikipedia")) { - namedArgs.remove("lang"); - if (args.size() > 1 || !namedArgs.isEmpty()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.token()); - } else if (args.size() == 1) { - foreignBuilder.append(wikiTokenizer.token()); - } else { - //foreignBuilder.append(title); - } - } else if (name.equals("it-noun")) { + if (isGender(name)) { + appendGender(foreignBuilder, name, args); + } else if (name.equals("wikipedia")) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + } else if (args.size() == 1) { + foreignBuilder.append(wikiTokenizer.token()); + } else { + //foreignBuilder.append(title); + } + } else if (name.equals("attention") || name.equals("zh-attention")) { + // See: http://en.wiktionary.org/wiki/Template:attention + // Ignore these. + } else if (name.equals("infl")) { + // See: http://en.wiktionary.org/wiki/Template:infl + final String langCode = get(args, 0); + String head = namedArgs.remove("head"); + if (head == null) { + head = namedArgs.remove("title"); // Bug + } + if (head == null) { + head = title; + } else { + head = WikiTokenizer.toPlainText(head); + } + titleAppended = true; + + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + + final String tr = namedArgs.remove("tr"); + String g = namedArgs.remove("g"); + if (g == null) { + g = namedArgs.remove("gender"); + } + final String g2 = namedArgs.remove("g2"); + final String g3 = namedArgs.remove("g3"); + + foreignBuilder.append(head); + + if (g != null) { + foreignBuilder.append(" {").append(g); + if (g2 != null) { + foreignBuilder.append("|").append(g2); + } + if (g3 != null) { + foreignBuilder.append("|").append(g3); + } + foreignBuilder.append("}"); + } + + if (tr != null) { + foreignBuilder.append(String.format(TRANSLITERATION_FORMAT, tr)); + wordForms.add(tr); + } + + final String pos = get(args, 1); + if (pos != null) { + foreignBuilder.append(" (").append(pos).append(")"); + } + for (int i = 2; i < args.size(); i += 2) { + final String inflName = get(args, i); + final String inflValue = get(args, i + 1); + foreignBuilder.append(", ").append(WikiTokenizer.toPlainText(inflName)); + if (inflValue != null && inflValue.length() > 0) { + foreignBuilder.append(": ").append(WikiTokenizer.toPlainText(inflValue)); + wordForms.add(inflValue); + } + } + for (final String key : namedArgs.keySet()) { + final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); + foreignBuilder.append(" ").append(key).append("=").append(value); + wordForms.add(value); + } + } else if (name.equals("it-noun")) { + titleAppended = true; final String base = get(args, 0); final String gender = get(args, 1); final String singular = base + get(args, 2); @@ -511,6 +609,9 @@ public class EnWiktionaryXmlParser { foreignBuilder.append(String.format(" %s {%s}, %s {pl}", singular, gender, plural, plural)); wordForms.add(singular); wordForms.add(plural); + if (!namedArgs.isEmpty() || args.size() > 4) { + LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); + } } else if (name.equals("it-proper noun")) { foreignBuilder.append(wikiTokenizer.token()); } else if (name.equals("it-adj")) { @@ -528,7 +629,6 @@ public class EnWiktionaryXmlParser { foreignBuilder.append(wikiTokenizer.token()); // LOG.warning("Unknown function: " + wikiTokenizer.token()); } - } else if (wikiTokenizer.isListItem()) { final String prefix = wikiTokenizer.listItemPrefix(); if (lastListSection != null && @@ -561,8 +661,11 @@ public class EnWiktionaryXmlParser { // Here's where we exit. // Should we make an entry even if there are no foreign list items? String foreign = foreignBuilder.toString().trim(); - if (!foreign.toLowerCase().startsWith(title.toLowerCase())) { - foreign = title + " " + foreign; + if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) { + foreign = String.format("%s %s", title, foreign); + } + if (!langPattern.matcher(lang).matches()) { + foreign = String.format("(%s) %s", lang, foreign); } for (final ListSection listSection : listSections) { doForeignListItem(foreign, title, wordForms, listSection); @@ -577,6 +680,8 @@ public class EnWiktionaryXmlParser { "imperative" ); + // Might only want to remove "lang" if it's equal to "zh", for example. + static final Set USELESS_WIKI_ARGS = new LinkedHashSet(Arrays.asList("lang", "sc", "sort", "cat")); private void doForeignListItem(final String foreignText, String title, final Collection forms, final ListSection listSection) { @@ -593,7 +698,6 @@ public class EnWiktionaryXmlParser { final StringBuilder englishBuilder = new StringBuilder(); final String mainLine = listSection.firstLine; - final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); while (englishTokenizer.nextToken() != null) { // TODO handle form of.... @@ -626,20 +730,40 @@ public class EnWiktionaryXmlParser { } } else if (englishTokenizer.isFunction()) { final String name = englishTokenizer.functionName(); - if (name.contains("conjugation of ") || - name.contains("form of ") || - name.contains("feminine of ") || - name.contains("plural of ")) { - // Ignore these in the index, they're really annoying.... - englishBuilder.append(englishTokenizer.token()); + final List args = englishTokenizer.functionPositionArgs(); + final Map namedArgs = englishTokenizer.functionNamedArgs(); + + if ( + name.equals("form of") || + name.contains("conjugation of") || + name.contains("participle of") || + name.contains("gerund of") || + name.contains("feminine of") || + name.contains("plural of")) { + String formName = name; + if (name.equals("form of")) { + formName = args.remove(0); + } + String baseForm = get(args, 1, ""); + if ("".equals(baseForm)) { + baseForm = get(args, 0, null); + remove(args, 1, ""); + } else { + remove(args, 0, null); + } + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + WikiTokenizer.appendFunction(englishBuilder.append("{"), formName, args, namedArgs).append("}"); + otherIndexBuilder.addEntryWithString(indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_SINGLE, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI); + } else { - englishBuilder.append(englishTokenizer.token()); + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + WikiTokenizer.appendFunction(englishBuilder.append("{{"), name, args, namedArgs).append("}}"); // LOG.warning("Unexpected function: " + englishTokenizer.token()); } } else { if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { } else { - LOG.warning("Unexpected definition text: " + englishTokenizer.token()); + LOG.warning("Unexpected definition type: " + englishTokenizer.token()); } } } @@ -650,7 +774,7 @@ public class EnWiktionaryXmlParser { pairEntry.pairs.add(pair); otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); for (final String form : forms) { - otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI); + otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTD_FORM_SINGLE, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); } } @@ -685,25 +809,30 @@ public class EnWiktionaryXmlParser { pairEntry.pairs.add(pair); } } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { - if (lastForeign != null) { + if (lastForeign != null && pairEntry.pairs.size() > 0) { pairEntry.pairs.remove(pairEntry.pairs.size() - 1); final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap); - if (pair.lang1 != "--" && pair.lang1 != "--") { + if (pair.lang1 != "--" || pair.lang2 != "--") { pairEntry.pairs.add(pair); } + lastForeign = null; } else { - LOG.warning("English example with no foreign: " + title + ", " + nextLine); + LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } } } else if (nextPrefix.equals("#*")) { // Can't really index these. final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); lastForeign = nextLine; - if (pair.lang1 != "--" && pair.lang1 != "--") { + if (pair.lang1 != "--" || pair.lang2 != "--") { pairEntry.pairs.add(pair); } } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - if (pair.lang1 != "--" && pair.lang1 != "--") { + if (pair.lang1 != "--" || pair.lang2 != "--") { pairEntry.pairs.add(pair); } // } else {