X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FEnWiktionaryXmlParser.java;h=c6aee3b97d28cea455aef0d80e5483b483d308a3;hb=57d93b56ca2ffa3be718469a9f89f66b4716ad4e;hp=95e910d9656a342eac1bc8b3799e238e91014f8f;hpb=eeb5667c56b2074b7eeac531589c9f1bf55ba738;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 95e910d..c6aee3b 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -1,3 +1,17 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.parser; import java.io.BufferedInputStream; @@ -6,12 +20,16 @@ import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.logging.Logger; import java.util.regex.Pattern; -import com.hughes.android.dictionary.engine.DictionaryBuilder; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; @@ -20,11 +38,17 @@ import com.hughes.android.dictionary.engine.PairEntry.Pair; public class EnWiktionaryXmlParser { + private static final String TRANSLITERATION_FORMAT = " (tr. %s)"; + + static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); + + // TODO: process {{ttbc}} lines + static final Pattern partOfSpeechHeader = Pattern.compile( "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|" + + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + // These are @deprecated: "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + @@ -32,7 +56,7 @@ public class EnWiktionaryXmlParser { "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + "Particle|Interjection|Pronominal adverb" + "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); - + final IndexBuilder enIndexBuilder; final IndexBuilder otherIndexBuilder; final Pattern langPattern; @@ -73,12 +97,12 @@ public class EnWiktionaryXmlParser { ++pageCount; if (pageCount % 1000 == 0) { - System.out.println("pageCount=" + pageCount); + LOG.info("pageCount=" + pageCount); } } } - private void parseSection(final String title, final String heading, final String text) { + private void parseSection(final String title, String heading, final String text) { if (title.startsWith("Wiktionary:") || title.startsWith("Template:") || title.startsWith("Appendix:") || @@ -92,37 +116,44 @@ public class EnWiktionaryXmlParser { return; } - if (heading.replaceAll("=", "").equals("English")) { + heading = heading.replaceAll("=", "").trim(); + if (heading.equals("English")) { doEnglishWord(title, text); - } else { - doForeignWord(title, text); + } else if (langPattern.matcher(heading).find()){ + doForeignWord(heading, title, text); } } // endPage() // ------------------------------------------------------------------------- - String pos = null; - int posDepth = -1; - private void doEnglishWord(String title, String text) { - final WikiLineReader wikiLineReader = new WikiLineReader(text); - String line; - while ((line = wikiLineReader.readLine()) != null) { - final WikiHeading wikiHeading = WikiHeading.getHeading(line); - if (wikiHeading != null) { + + String pos = null; + int posDepth = -1; + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + + if (wikiTokenizer.isHeading()) { + final String headerName = wikiTokenizer.headingWikiText(); - if (wikiHeading.depth <= posDepth) { + if (wikiTokenizer.headingDepth() <= posDepth) { pos = null; posDepth = -1; } - if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) { - posDepth = wikiHeading.depth; - pos = wikiHeading.name; - } else if (wikiHeading.name.equals("Translations")) { - doTranslations(title, wikiLineReader); - } else if (wikiHeading.name.equals("Pronunciation")) { + if (partOfSpeechHeader.matcher(headerName).matches()) { + posDepth = wikiTokenizer.headingDepth(); + pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + + } else if (headerName.equals("Translations")) { + if (pos == null) { + LOG.warning("Translations without POS: " + title); + } + doTranslations(title, wikiTokenizer, pos); + } else if (headerName.equals("Pronunciation")) { //doPronunciation(wikiLineReader); } } @@ -136,13 +167,17 @@ public class EnWiktionaryXmlParser { "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs", "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); - private void doTranslations(final String title, final WikiLineReader wikiLineReader) { - String line; + private void doTranslations(final String title, final WikiTokenizer wikiTokenizer, final String pos) { + if (title.equals("absolutely")) { + //System.out.println(); + } + + String topLevelLang = null; String sense = null; boolean done = false; - while ((line = wikiLineReader.readLine()) != null) { - if (WikiHeading.getHeading(line) != null) { - wikiLineReader.stuffLine(line); + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + wikiTokenizer.returnToLineStart(); return; } if (done) { @@ -151,34 +186,41 @@ public class EnWiktionaryXmlParser { // Check whether we care about this line: - //line = WikiLineReader.removeSquareBrackets(line); - - if (line.startsWith("{{")) { + if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List positionArgs = wikiTokenizer.functionPositionArgs(); - WikiFunction wikiFunction; - while ((wikiFunction = WikiFunction.getFunction(line)) != null) { - if (wikiFunction.name.equals("trans-top")) { - sense = null; - if (wikiFunction.args.size() >= 1) { - sense = wikiFunction.args.get(0); - //System.out.println("Sense: " + sense); - } - } else if (wikiFunction.name.equals("trans-bottom")) { - sense = null; - } else if (wikiFunction.name.equals("trans-mid")) { - } else if (wikiFunction.name.equals("trans-see")) { - } else if (wikiFunction.name.startsWith("checktrans")) { - done = true; - } else { - System.err.println("Unexpected translation wikifunction: " + line + ", title=" + title); + if (functionName.equals("trans-top")) { + sense = null; + if (wikiTokenizer.functionPositionArgs().size() >= 1) { + sense = positionArgs.get(0); + // TODO: could emphasize words in [[brackets]] inside sense. + sense = WikiTokenizer.toPlainText(sense); + //LOG.info("Sense: " + sense); } - line = wikiFunction.replaceWith(line, ""); - + } else if (functionName.equals("trans-bottom")) { + sense = null; + } else if (functionName.equals("trans-mid")) { + } else if (functionName.equals("trans-see")) { + // TODO: would also be nice... + } else if (functionName.startsWith("picdic")) { + } else if (functionName.startsWith("checktrans")) { + done = true; + } else if (functionName.startsWith("ttbc")) { + wikiTokenizer.nextLine(); + // TODO: would be great to handle ttbc + // TODO: Check this: done = true; + } else { + LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); } - - } else if (line.startsWith("*")) { + } else if (wikiTokenizer.isListItem()) { + final String line = wikiTokenizer.listItemWikiText(); // This line could produce an output... + if (line.contains("ich hoan dich gear")) { + //System.out.println(); + } + // First strip the language and check whether it matches. // And hold onto it for sub-lines. final int colonIndex = line.indexOf(":"); @@ -186,116 +228,164 @@ public class EnWiktionaryXmlParser { continue; } - final String lang = line.substring(0, colonIndex); - if (!this.langPattern.matcher(lang).find()) { + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); } String rest = line.substring(colonIndex + 1).trim(); - doTranslationLine(line, title, sense, rest); + if (rest.length() > 0) { + doTranslationLine(line, appendLang ? lang : null, title, pos, sense, rest); + } - } else if (line.equals("")) { - } else if (line.startsWith(":")) { - } else if (line.startsWith("[[") && line.endsWith("]]")) { - } else if (line.startsWith("''See''")) { - } else if (line.startsWith("''")) { - } else if (line.equals("----")) { + } else if (wikiTokenizer.remainderStartsWith("''See''")) { + wikiTokenizer.nextLine(); + LOG.fine("Skipping line: " + wikiTokenizer.token()); + } else if (wikiTokenizer.isWikiLink()) { + final String wikiLink = wikiTokenizer.wikiLinkText(); + if (wikiLink.contains(":") && wikiLink.contains(title)) { + } else if (wikiLink.contains("Category:")) { + } else { + LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { } else { - System.err.println("Unexpected translation line: " + line + ", title=" + title); + final String token = wikiTokenizer.token(); + if (token.equals("----")) { + } else { + LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); + } } } - } - private void doTranslationLine(final String line, final String title, final String sense, String rest) { - + private static T get(final List list, final int index) { + return index < list.size() ? list.get(index) : null; + } + + private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - final StringBuilder otherText = new StringBuilder(); - WikiFunction wikiFunction; - while ((wikiFunction = WikiFunction.getFunction(rest)) != null) { - if (wikiFunction.start > 0) { - String plainText = rest.substring(0, wikiFunction.start); - otherText.append("").append(plainText); - otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT); - } - rest = rest.substring(wikiFunction.end); + final StringBuilder otherText = new StringBuilder(); + final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest, false); + while (wikiTokenizer.nextToken() != null) { - if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) { - if (wikiFunction.args.size() < 2) { - System.err.println("{{t}} with too few args: " + line + ", title=" + title); - continue; - } - final String langCode = wikiFunction.getArg(0); - if (this.langCodePattern.matcher(langCode).matches()) { - final String word = wikiFunction.getArg(1); - final String gender = wikiFunction.getArg(2); - final String transliteration = wikiFunction.getNamedArg("tr"); - if (otherText.length() > 0) { - otherText.append(""); - } - otherText.append(word); - otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); - if (gender != null) { - otherText.append(String.format(" {%s}", gender)); + if (wikiTokenizer.isPlainText()) { + final String plainText = wikiTokenizer.token(); + otherText.append("").append(plainText); + otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + } else if (wikiTokenizer.isWikiLink()) { + final String plainText = wikiTokenizer.wikiLinkText(); + otherText.append("").append(plainText); + otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT); + + } else if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List args = wikiTokenizer.functionPositionArgs(); + final Map namedArgs = wikiTokenizer.functionNamedArgs(); + + if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø") || functionName.equals("apdx-t")) { + if (args.size() < 2) { + LOG.warning("{{t}} with too few args: " + line + ", title=" + title); + continue; } - if (transliteration != null) { - otherText.append(String.format(" (tr. %s)", transliteration)); - otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + final String langCode = get(args, 0); + //if (this.langCodePattern.matcher(langCode).matches()) { + final String word = get(args, 1); + final String gender = get(args, 2); + final String transliteration = namedArgs.get("tr"); + if (otherText.length() > 0) { + otherText.append(""); + } + otherText.append(word); + otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + if (gender != null) { + otherText.append(String.format(" {%s}", gender)); + } + if (transliteration != null) { + otherText.append(String.format(TRANSLITERATION_FORMAT, transliteration)); + otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + } + //} + } else if (functionName.equals("qualifier")) { + if (args.size() == 0) { + otherText.append(wikiTokenizer.token()); + } else { + String qualifier = args.get(0); + if (!namedArgs.isEmpty() || args.size() > 1) { + LOG.warning("weird qualifier: " + line); + } + // Unindexed! + otherText.append("(").append(qualifier).append(")"); } + } else if (encodings.contains(functionName)) { + otherText.append("").append(args.get(0)); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + } else if (isGender(functionName)) { + appendGender(otherText, functionName, args); + } else if (functionName.equals("g")) { + otherText.append("{g}"); + } else if (functionName.equals("l")) { + // encodes text in various langs. + // lang is arg 0. + otherText.append("").append(args.get(1)); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(1), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + // TODO: transliteration + } else if (functionName.equals("term")) { + // cross-reference to another dictionary + otherText.append("").append(args.get(0)); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + // TODO: transliteration + } else if (functionName.equals("italbrac") || functionName.equals("gloss")) { + // TODO: put this text aside to use it. + otherText.append("[").append(args.get(0)).append("]"); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + } else if (functionName.equals("ttbc")) { + LOG.warning("Unexpected {{ttbc}}"); + } else if (functionName.equals("trreq")) { + } else if (functionName.equals("not used")) { + otherText.append("(not used)"); + } else if (functionName.equals("t-image")) { + // American sign language + } else { + // Unindexed! + otherText.append(wikiTokenizer.token()); } - } else if (wikiFunction.name.equals("qualifier")) { - String qualifier = wikiFunction.getArg(0); - if (!wikiFunction.namedArgs.isEmpty() || wikiFunction.args.size() > 1) { - System.err.println("weird qualifier: " + line); - } - otherText.append("(").append(qualifier).append(")"); - } else if (encodings.contains(wikiFunction.name)) { - otherText.append("").append(wikiFunction.getArg(0)); - otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT); - } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) { - otherText.append("{"); - otherText.append(wikiFunction.name); - for (int i = 0; i < wikiFunction.args.size(); ++i) { - otherText.append("|").append(wikiFunction.getArg(i)); - } - otherText.append("}"); - } else if (wikiFunction.name.equals("g")) { - otherText.append("{g}"); - } else if (wikiFunction.name.equals("l")) { - // encodes text in various langs. - // lang is arg 0. - otherText.append("").append(wikiFunction.getArg(1)); - otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(1), EntryTypeName.WIKTIONARY_OTHER_TEXT); - // TODO: transliteration - } else if (wikiFunction.name.equals("term")) { - // cross-reference to another dictionary - otherText.append("").append(wikiFunction.getArg(0)); - otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT); - // TODO: transliteration - } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) { - // TODO: put this text aside to use it. - otherText.append("[").append(wikiFunction.getArg(0)).append("]"); - otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT); - } else if (wikiFunction.name.equals("ttbc")) { - } else if (wikiFunction.name.equals("trreq")) { - } else if (wikiFunction.name.equals("not used")) { - otherText.append("(not used)"); - } else if (wikiFunction.name.equals("t-image")) { - // American sign language - } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) { - otherText.append("{UNK. FUNC.: ").append(wikiFunction.name).append("}"); + + } else if (wikiTokenizer.isNewline()) { + assert false; + } else if (wikiTokenizer.isComment()) { + } else if (wikiTokenizer.isMarkup()) { } else { - System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title); + LOG.warning("Bad translation token: " + wikiTokenizer.token()); } } - String plainText = rest; - otherText.append("").append(plainText); - otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT); + if (otherText.length() == 0) { + LOG.warning("Empty otherText: " + line); + return; + } + + if (lang != null) { + otherText.insert(0, String.format("(%s) ", lang)); + } StringBuilder englishText = new StringBuilder(); @@ -309,54 +399,533 @@ public class EnWiktionaryXmlParser { } enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); - final Pair pair = new Pair(englishText.toString(), WikiParser.simpleParse(otherText.toString()), swap); + final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap); pairEntry.pairs.add(pair); - assert (pairsAdded.add(pair.toString())); + if (!pairsAdded.add(pair.toString())) { + LOG.warning("Duplicate pair: " + pair.toString()); + } if (pair.toString().equals("libero {m} :: free (adjective)")) { System.out.println(); } } + + + private void appendGender(final StringBuilder otherText, + final String functionName, final List args) { + otherText.append("{"); + otherText.append(functionName); + for (int i = 0; i < args.size(); ++i) { + otherText.append("|").append(args.get(i)); + } + otherText.append("}"); + } + + + private boolean isGender(final String functionName) { + return functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p"); + } Set pairsAdded = new LinkedHashSet(); // ------------------------------------------------------------------------- - private void doForeignWord(String title, String text) { - final WikiLineReader wikiLineReader = new WikiLineReader(text); - String line; - while ((line = wikiLineReader.readLine()) != null) { - final WikiHeading wikiHeading = WikiHeading.getHeading(line); - if (wikiHeading != null) { - if (wikiHeading.name.equals("Translations")) { - System.err.println("Translations not in English section: " + title); - } else if (wikiHeading.name.equals("Pronunciation")) { + private void doForeignWord(final String lang, final String title, final String text) { + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + final String headingName = wikiTokenizer.headingWikiText(); + if (headingName.equals("Translations")) { + LOG.warning("Translations not in English section: " + title); + } else if (headingName.equals("Pronunciation")) { //doPronunciation(wikiLineReader); - } else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) { - doPartOfSpeech(title, wikiHeading, wikiLineReader); + } else if (partOfSpeechHeader.matcher(headingName).matches()) { + doForeignPartOfSpeech(lang, title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); } + } else { } } } + + static final class ListSection { + final String firstPrefix; + final String firstLine; + final List nextPrefixes = new ArrayList(); + final List nextLines = new ArrayList(); + + public ListSection(String firstPrefix, String firstLine) { + this.firstPrefix = firstPrefix; + this.firstLine = firstLine; + } + + @Override + public String toString() { + return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; + } + } - private void doPartOfSpeech(String title, final WikiHeading posHeading, WikiLineReader wikiLineReader) { - String line; - System.out.println("***" + title); - System.out.println(posHeading.name); - while ((line = wikiLineReader.readLine()) != null) { - WikiHeading heading = WikiHeading.getHeading(line); - if (heading != null) { - if (heading.depth <= posHeading.depth) { - wikiLineReader.stuffLine(line); + int foreignCount = 0; + private void doForeignPartOfSpeech(final String lang, String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { + if (++foreignCount % 1000 == 0) { + LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); + } + if (title.equals("moro")) { + System.out.println(); + } + + final StringBuilder foreignBuilder = new StringBuilder(); + final Collection wordForms = new ArrayList(); + final List listSections = new ArrayList(); + + try { + + ListSection lastListSection = null; + + int currentHeadingDepth = posDepth; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + currentHeadingDepth = wikiTokenizer.headingDepth(); + + if (currentHeadingDepth <= posDepth) { + wikiTokenizer.returnToLineStart(); return; } } - System.out.println(line); + if (currentHeadingDepth > posDepth) { + // TODO: deal with other neat info sections + continue; + } + + if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + final List args = wikiTokenizer.functionPositionArgs(); + final Map namedArgs = wikiTokenizer.functionNamedArgs(); + // First line is generally a repeat of the title with some extra information. + // We need to build up the left side (foreign text, tokens) separately from the + // right side (English). The left-side may get paired with multiple right sides. + // The left side should get filed under every form of the word in question (singular, plural). + + // For verbs, the conjugation comes later on in a deeper section. + // Ideally, we'd want to file every English entry with the verb + // under every verb form coming from the conjugation. + // Ie. under "fa": see: "make :: fare" and "do :: fare" + // But then where should we put the conjugation table? + // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) + // for the conjugation table from "fa". + // Would like to be able to link to a lang#token. + if (isGender(name)) { + appendGender(foreignBuilder, name, args); + } else if (name.equals("wikipedia")) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + } else if (args.size() == 1) { + foreignBuilder.append(wikiTokenizer.token()); + } else { + //foreignBuilder.append(title); + } + } else if (name.equals("attention") || name.equals("zh-attention")) { + // See: http://en.wiktionary.org/wiki/Template:attention + // Ignore these. + } else if (name.equals("infl")) { + // See: http://en.wiktionary.org/wiki/Template:infl + final String langCode = get(args, 0); + namedArgs.remove("sc"); + final String tr = namedArgs.remove("tr"); + final String g = namedArgs.remove("g"); + final String g2 = namedArgs.remove("g2"); + final String g3 = namedArgs.remove("g3"); + if (!namedArgs.isEmpty()) { + LOG.warning("Didn't parse infl: " + wikiTokenizer.token()); + foreignBuilder.append(wikiTokenizer.token()); + } else { + String head = namedArgs.get("head"); + if (head == null) { + head = title; + } else { + head = WikiTokenizer.toPlainText(head); + } + foreignBuilder.append(head); + + if (g != null) { + foreignBuilder.append(" {").append(g); + if (g2 != null) { + foreignBuilder.append("|").append(g2); + } + if (g3 != null) { + foreignBuilder.append("|").append(g3); + } + foreignBuilder.append("}"); + } + + if (tr != null) { + foreignBuilder.append(String.format(TRANSLITERATION_FORMAT, tr)); + wordForms.add(tr); + } + + final String pos = get(args, 1); + if (pos != null) { + foreignBuilder.append(" (").append(pos).append(")"); + } + for (int i = 2; i < args.size(); i += 2) { + final String inflName = get(args, i); + final String inflValue = get(args, i + 1); + foreignBuilder.append(", ").append(WikiTokenizer.toPlainText(inflName)); + if (inflValue != null && inflValue.length() > 0) { + foreignBuilder.append(": ").append(WikiTokenizer.toPlainText(inflValue)); + wordForms.add(inflValue); + } + } + } + } else if (name.equals("it-noun")) { + final String base = get(args, 0); + final String gender = get(args, 1); + final String singular = base + get(args, 2); + final String plural = base + get(args, 3); + foreignBuilder.append(String.format(" %s {%s}, %s {pl}", singular, gender, plural, plural)); + wordForms.add(singular); + wordForms.add(plural); + } else if (name.equals("it-proper noun")) { + foreignBuilder.append(wikiTokenizer.token()); + } else if (name.equals("it-adj")) { + foreignBuilder.append(wikiTokenizer.token()); + } else if (name.startsWith("it-conj")) { + if (name.equals("it-conj-are")) { + itConjAre(args, namedArgs); + } else if (name.equals("it-conj-ere")) { + } else if (name.equals("it-conj-ire")) { + } else { + LOG.warning("Unknown conjugation: " + wikiTokenizer.token()); + } + } else { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + // LOG.warning("Unknown function: " + wikiTokenizer.token()); + } + + } else if (wikiTokenizer.isListItem()) { + final String prefix = wikiTokenizer.listItemPrefix(); + if (lastListSection != null && + prefix.startsWith(lastListSection.firstPrefix) && + prefix.length() > lastListSection.firstPrefix.length()) { + lastListSection.nextPrefixes.add(prefix); + lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); + } else { + lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); + listSections.add(lastListSection); + } + } else if (lastListSection != null) { + // Don't append anything after the lists, because there's crap. + } else if (wikiTokenizer.isWikiLink()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.wikiLinkText()); + + } else if (wikiTokenizer.isPlainText()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + + } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) { + // Do nothing. + } else { + LOG.warning("Unexpected token: " + wikiTokenizer.token()); + } + } + + } finally { + // Here's where we exit. + // Should we make an entry even if there are no foreign list items? + String foreign = foreignBuilder.toString().trim(); + if (!foreign.toLowerCase().startsWith(title.toLowerCase())) { + foreign = String.format("%s %s", title, foreign); + } + if (!langPattern.matcher(lang).matches()) { + foreign = String.format("(%s) %s", lang, foreign); + } + for (final ListSection listSection : listSections) { + doForeignListItem(foreign, title, wordForms, listSection); + } + } + } + + + static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile( + "(first|second|third)-person (singular|plural)|" + + "present tense|" + + "imperative" + ); + + + private void doForeignListItem(final String foreignText, String title, final Collection forms, final ListSection listSection) { + + final String prefix = listSection.firstPrefix; + if (prefix.length() > 1) { + // Could just get looser and say that any prefix longer than first is a sublist. + LOG.warning("Prefix too long: " + listSection); + return; + } + + final PairEntry pairEntry = new PairEntry(); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + + final StringBuilder englishBuilder = new StringBuilder(); + + final String mainLine = listSection.firstLine; + + final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); + while (englishTokenizer.nextToken() != null) { + // TODO handle form of.... + if (englishTokenizer.isPlainText()) { + englishBuilder.append(englishTokenizer.token()); + enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF); + } else if (englishTokenizer.isWikiLink()) { + final String text = englishTokenizer.wikiLinkText(); + final String link = englishTokenizer.wikiLinkDest(); + if (link != null) { + if (link.contains("#English")) { + englishBuilder.append(text); + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } else if (link.contains("#") && this.langPattern.matcher(link).find()) { + englishBuilder.append(text); + otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else if (link.equals("plural")) { + englishBuilder.append(text); + } else { + //LOG.warning("Special link: " + englishTokenizer.token()); + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + englishBuilder.append(text); + } + } else { + // link == null + englishBuilder.append(text); + if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) { + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } + } + } else if (englishTokenizer.isFunction()) { + final String name = englishTokenizer.functionName(); + if (name.contains("conjugation of ") || + name.contains("form of ") || + name.contains("feminine of ") || + name.contains("plural of ")) { + // Ignore these in the index, they're really annoying.... + englishBuilder.append(englishTokenizer.token()); + } else { + englishBuilder.append(englishTokenizer.token()); +// LOG.warning("Unexpected function: " + englishTokenizer.token()); + } + } else { + if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { + } else { + LOG.warning("Unexpected definition text: " + englishTokenizer.token()); + } + } + } + + final String english = trim(englishBuilder.toString()); + if (english.length() > 0) { + final Pair pair = new Pair(english, trim(foreignText), this.swap); + pairEntry.pairs.add(pair); + otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + for (final String form : forms) { + otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI); + } + } + + // Do examples. + String lastForeign = null; + for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { + final String nextPrefix = listSection.nextPrefixes.get(i); + final String nextLine = listSection.nextLines.get(i); + int dash = nextLine.indexOf("—"); + int mdashLen = 7; + if (dash == -1) { + dash = nextLine.indexOf("—"); + mdashLen = 1; + } + if (dash == -1) { + dash = nextLine.indexOf(" - "); + mdashLen = 3; + } + if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { + final String foreignEx = nextLine.substring(0, dash); + final String englishEx = nextLine.substring(dash + mdashLen); + final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, otherIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")){ + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { + if (lastForeign != null && pairEntry.pairs.size() > 0) { + pairEntry.pairs.remove(pairEntry.pairs.size() - 1); + final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else { + LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); + // TODO: add something. + } + } else if (nextPrefix.equals("#*")) { + // Can't really index these. + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } +// } else { +// assert false; + } + } + } + + private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) { + final WikiTokenizer wikiTokenizer = new WikiTokenizer(example, false); + final StringBuilder builder = new StringBuilder(); + boolean insideTripleQuotes = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isPlainText()) { + builder.append(wikiTokenizer.token()); + if (indexBuilder != null) { + indexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.token(), EntryTypeName.WIKTIONARY_EXAMPLE); + } + } else if (wikiTokenizer.isWikiLink()) { + final String text = wikiTokenizer.wikiLinkText().replaceAll("'", ""); + builder.append(text); + if (indexBuilder != null) { + indexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_EXAMPLE); + } + } else if (wikiTokenizer.isFunction()) { + builder.append(wikiTokenizer.token()); + } else if (wikiTokenizer.isMarkup()) { + if (wikiTokenizer.token().equals("'''")) { + insideTripleQuotes = !insideTripleQuotes; + } + } else if (wikiTokenizer.isComment() || wikiTokenizer.isNewline()) { + // Do nothing. + } else { + LOG.warning("unexpected token: " + wikiTokenizer.token()); + } + } + final String result = trim(builder.toString()); + return result.length() > 0 ? result : "--"; + } + + + private void itConjAre(List args, Map namedArgs) { + final String base = args.get(0); + final String aux = args.get(1); + + putIfMissing(namedArgs, "inf", base + "are"); + putIfMissing(namedArgs, "aux", aux); + putIfMissing(namedArgs, "ger", base + "ando"); + putIfMissing(namedArgs, "presp", base + "ante"); + putIfMissing(namedArgs, "pastp", base + "ato"); + // Present + putIfMissing(namedArgs, "pres1s", base + "o"); + putIfMissing(namedArgs, "pres2s", base + "i"); + putIfMissing(namedArgs, "pres3s", base + "a"); + putIfMissing(namedArgs, "pres1p", base + "iamo"); + putIfMissing(namedArgs, "pres2p", base + "ate"); + putIfMissing(namedArgs, "pres3p", base + "ano"); + // Imperfect + putIfMissing(namedArgs, "imperf1s", base + "avo"); + putIfMissing(namedArgs, "imperf2s", base + "avi"); + putIfMissing(namedArgs, "imperf3s", base + "ava"); + putIfMissing(namedArgs, "imperf1p", base + "avamo"); + putIfMissing(namedArgs, "imperf2p", base + "avate"); + putIfMissing(namedArgs, "imperf3p", base + "avano"); + // Passato remoto + putIfMissing(namedArgs, "prem1s", base + "ai"); + putIfMissing(namedArgs, "prem2s", base + "asti"); + putIfMissing(namedArgs, "prem3s", base + "ò"); + putIfMissing(namedArgs, "prem1p", base + "ammo"); + putIfMissing(namedArgs, "prem2p", base + "aste"); + putIfMissing(namedArgs, "prem3p", base + "arono"); + // Future + putIfMissing(namedArgs, "fut1s", base + "erò"); + putIfMissing(namedArgs, "fut2s", base + "erai"); + putIfMissing(namedArgs, "fut3s", base + "erà"); + putIfMissing(namedArgs, "fut1p", base + "eremo"); + putIfMissing(namedArgs, "fut2p", base + "erete"); + putIfMissing(namedArgs, "fut3p", base + "eranno"); + // Conditional + putIfMissing(namedArgs, "cond1s", base + "erei"); + putIfMissing(namedArgs, "cond2s", base + "eresti"); + putIfMissing(namedArgs, "cond3s", base + "erebbe"); + putIfMissing(namedArgs, "cond1p", base + "eremmo"); + putIfMissing(namedArgs, "cond2p", base + "ereste"); + putIfMissing(namedArgs, "cond3p", base + "erebbero"); + // Subjunctive / congiuntivo + putIfMissing(namedArgs, "sub123s", base + "i"); + putIfMissing(namedArgs, "sub1p", base + "iamo"); + putIfMissing(namedArgs, "sub2p", base + "iate"); + putIfMissing(namedArgs, "sub3p", base + "ino"); + // Imperfect subjunctive + putIfMissing(namedArgs, "impsub12s", base + "assi"); + putIfMissing(namedArgs, "impsub3s", base + "asse"); + putIfMissing(namedArgs, "impsub1p", base + "assimo"); + putIfMissing(namedArgs, "impsub2p", base + "aste"); + putIfMissing(namedArgs, "impsub3p", base + "assero"); + // Imperative + putIfMissing(namedArgs, "imp2s", base + "a"); + putIfMissing(namedArgs, "imp3s", base + "i"); + putIfMissing(namedArgs, "imp1p", base + "iamo"); + putIfMissing(namedArgs, "imp2p", base + "ate"); + putIfMissing(namedArgs, "imp3p", base + "ino"); + + + itConj(args, namedArgs); + } + + + private void itConj(List args, Map namedArgs) { + // TODO Auto-generated method stub + + } + + + private static void putIfMissing(final Map namedArgs, final String key, + final String value) { + final String oldValue = namedArgs.get(key); + if (oldValue == null || oldValue.length() == 0) { + namedArgs.put(key, value); } } + + // TODO: check how ='' and =| are manifested.... + // TODO: get this right in -are + private static void putOrNullify(final Map namedArgs, final String key, + final String value) { + final String oldValue = namedArgs.get(key); + if (oldValue == null/* || oldValue.length() == 0*/) { + namedArgs.put(key, value); + } else { + if (oldValue.equals("''")) { + namedArgs.put(key, ""); + } + } + } + + static final Pattern whitespace = Pattern.compile("\\s+"); + static String trim(final String s) { + return whitespace.matcher(s).replaceAll(" ").trim(); + } }