X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FEnWiktionaryXmlParser.java;h=c6aee3b97d28cea455aef0d80e5483b483d308a3;hb=57d93b56ca2ffa3be718469a9f89f66b4716ad4e;hp=d5b90067c471774a18ec59992bece5ecc7bda1ce;hpb=6acbfe7858aee5af1329aff380055e05d6642292;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index d5b9006..c6aee3b 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -1,36 +1,54 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.parser; +import java.io.BufferedInputStream; +import java.io.DataInputStream; +import java.io.EOFException; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.logging.Logger; import java.util.regex.Pattern; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; -import com.hughes.android.dictionary.parser.WikiWord.FormOf; -import com.hughes.android.dictionary.parser.WikiWord.Translation; -import com.hughes.util.ListUtil; -import com.hughes.util.StringUtil; +import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.PairEntry; +import com.hughes.android.dictionary.engine.PairEntry.Pair; + +public class EnWiktionaryXmlParser { + + private static final String TRANSLITERATION_FORMAT = " (tr. %s)"; -public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback { + static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); + + // TODO: process {{ttbc}} lines static final Pattern partOfSpeechHeader = Pattern.compile( "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|" + + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + // These are @deprecated: "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + @@ -38,73 +56,53 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + "Particle|Interjection|Pronominal adverb" + "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); - - static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+"); - - final DictionaryBuilder dictBuilder; - final IndexBuilder[] indexBuilders; - final Pattern[] langPatterns; - final int enIndexBuilder; - - StringBuilder titleBuilder; - StringBuilder textBuilder; - StringBuilder currentBuilder = null; - - public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) { - assert langPatterns.length == 2; - this.dictBuilder = dictBuilder; - this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]); - this.langPatterns = langPatterns; + final IndexBuilder enIndexBuilder; + final IndexBuilder otherIndexBuilder; + final Pattern langPattern; + final Pattern langCodePattern; + final boolean swap; + + public EnWiktionaryXmlParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { this.enIndexBuilder = enIndexBuilder; + this.otherIndexBuilder = otherIndexBuilder; + this.langPattern = langPattern; + this.langCodePattern = langCodePattern; + this.swap = swap; } - @Override - public void startElement(String uri, String localName, String qName, - Attributes attributes) { - currentBuilder = null; - if ("page".equals(qName)) { - titleBuilder = new StringBuilder(); + + public void parse(final File file, final int pageLimit) throws IOException { + int pageCount = 0; + final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); + while (true) { + if (pageLimit >= 0 && pageCount >= pageLimit) { + return; + } - // Start with "\n" to better match certain strings. - textBuilder = new StringBuilder("\n"); - } else if ("title".equals(qName)) { - currentBuilder = titleBuilder; - } else if ("text".equals(qName)) { - currentBuilder = textBuilder; - } - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - if (currentBuilder != null) { - currentBuilder.append(ch, start, length); - } - } + final String title; + try { + title = dis.readUTF(); + } catch (EOFException e) { + dis.close(); + return; + } + final String heading = dis.readUTF(); + final int bytesLength = dis.readInt(); + final byte[] bytes = new byte[bytesLength]; + dis.readFully(bytes); + final String text = new String(bytes, "UTF8"); + + parseSection(title, heading, text); - @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - currentBuilder = null; - if ("page".equals(qName)) { - endPage(); + ++pageCount; + if (pageCount % 1000 == 0) { + LOG.info("pageCount=" + pageCount); + } } } - - public void parse(final File file) throws ParserConfigurationException, - SAXException, IOException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); - parser.parse(file, this); - } - - int pageCount = 0; - private void endPage() { - title = titleBuilder.toString(); - ++pageCount; - if (pageCount % 1000 == 0) { - System.out.println("pageCount=" + pageCount); - } + private void parseSection(final String title, String heading, final String text) { if (title.startsWith("Wiktionary:") || title.startsWith("Template:") || title.startsWith("Appendix:") || @@ -117,520 +115,817 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im title.startsWith("Help:")) { return; } - currentDepth = 0; - words.clear(); - currentHeading = null; - insidePartOfSpeech = false; -// System.err.println("Working on page: " + title); - try { - WikiParser.parse(textBuilder.toString(), this); - } catch (Throwable e) { - System.err.println("Failure on page: " + title); - e.printStackTrace(System.err); + + heading = heading.replaceAll("=", "").trim(); + if (heading.equals("English")) { + doEnglishWord(title, text); + } else if (langPattern.matcher(heading).find()){ + doForeignWord(heading, title, text); } - - for (final WikiWord word : words) { - word.wikiWordToQuickDic(dictBuilder, enIndexBuilder); - } // WikiWord - + } // endPage() - - - // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ - // ------------------------------------------------------------------------ - - /** - * Two things can happen: - * - * We can be in a ==German== section. There we will see English definitions. - * Each POS should get its own QuickDic entry. Pretty much everything goes - * in. - * - * Or we can be in an ==English== section with English definitions - * and maybe see translations for languages we care about. - * - * In either case, we need to differentiate the subsections (Noun, Verb, etc.) - * into separate QuickDic entries, but that's tricky--how do we know when we - * found a subsection? Just ignore anything containing pronunciation and - * etymology? - * - * How do we decide when to seal the deal on an entry? - * - * Would be nice if the parser told us about leaving sections.... - * - * - */ - - String title; - String currentHeading; - int currentDepth; - final List words = new ArrayList(); - WikiWord currentWord; - WikiWord.PartOfSpeech currentPartOfSpeech; - WikiWord.TranslationSense currentTranslationSense; - boolean insidePartOfSpeech; - StringBuilder wikiBuilder = null; + // ------------------------------------------------------------------------- - @Override - public void onWikiLink(String[] args) { - if (wikiBuilder == null) { - return; + private void doEnglishWord(String title, String text) { + + String pos = null; + int posDepth = -1; + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + + if (wikiTokenizer.isHeading()) { + final String headerName = wikiTokenizer.headingWikiText(); + + if (wikiTokenizer.headingDepth() <= posDepth) { + pos = null; + posDepth = -1; + } + + if (partOfSpeechHeader.matcher(headerName).matches()) { + posDepth = wikiTokenizer.headingDepth(); + pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + + } else if (headerName.equals("Translations")) { + if (pos == null) { + LOG.warning("Translations without POS: " + title); + } + doTranslations(title, wikiTokenizer, pos); + } else if (headerName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } + } } - wikiBuilder.append(args[args.length - 1]); } + + + private static Set encodings = new LinkedHashSet(Arrays.asList("zh-ts", + "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai", + "fa-Arab", "Khmr", "zh-tsp", "Cyrl", "IPAchar", "ug-Arab", "ko-inline", + "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs", + "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); - // ttbc: translations to be checked. - static final Set useRemainingArgTemplates = new LinkedHashSet(Arrays.asList( - "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", - "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", - "zh-tsp", "zh-zh-p")); - static final Set ignoreTemplates = new LinkedHashSet(Arrays.asList("")); - static final Set grammarTemplates = new LinkedHashSet(Arrays.asList("impf", "pf")); - static final Set passThroughTemplates = new LinkedHashSet(Arrays.asList("zzzzzzzzzzzzzzz")); - - @Override - public void onTemplate(final List positionalArgs, final Map namedArgs) { - if (positionalArgs.isEmpty()) { - // This happens very rarely with special templates. - return; + private void doTranslations(final String title, final WikiTokenizer wikiTokenizer, final String pos) { + if (title.equals("absolutely")) { + //System.out.println(); } - final String name = positionalArgs.get(0); - namedArgs.remove("lang"); - namedArgs.remove("nocat"); - namedArgs.remove("sc"); - - // Pronunciation - if (currentWord != null) { - if (name.equals("a")) { - // accent tag - currentWord.currentPronunciation = new StringBuilder(); - currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation); + String topLevelLang = null; + String sense = null; + boolean done = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + wikiTokenizer.returnToLineStart(); return; } + if (done) { + continue; + } - if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA")|| name.equals("enPR")) { - namedArgs.remove("lang"); - for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) { - final String pron = namedArgs.remove("" + i); - if (pron != null) { - positionalArgs.add(pron); - } else { - if (i > 10) { - break; - } + // Check whether we care about this line: + + if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List positionArgs = wikiTokenizer.functionPositionArgs(); + + if (functionName.equals("trans-top")) { + sense = null; + if (wikiTokenizer.functionPositionArgs().size() >= 1) { + sense = positionArgs.get(0); + // TODO: could emphasize words in [[brackets]] inside sense. + sense = WikiTokenizer.toPlainText(sense); + //LOG.info("Sense: " + sense); } + } else if (functionName.equals("trans-bottom")) { + sense = null; + } else if (functionName.equals("trans-mid")) { + } else if (functionName.equals("trans-see")) { + // TODO: would also be nice... + } else if (functionName.startsWith("picdic")) { + } else if (functionName.startsWith("checktrans")) { + done = true; + } else if (functionName.startsWith("ttbc")) { + wikiTokenizer.nextLine(); + // TODO: would be great to handle ttbc + // TODO: Check this: done = true; + } else { + LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); } - if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) { - System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString()); - } - if (currentWord.currentPronunciation == null) { - currentWord.currentPronunciation = new StringBuilder(); - currentWord.accentToPronunciation.put("", currentWord.currentPronunciation); + } else if (wikiTokenizer.isListItem()) { + final String line = wikiTokenizer.listItemWikiText(); + // This line could produce an output... + + if (line.contains("ich hoan dich gear")) { + //System.out.println(); } - if (currentWord.currentPronunciation.length() > 0) { - currentWord.currentPronunciation.append("; "); + + // First strip the language and check whether it matches. + // And hold onto it for sub-lines. + final int colonIndex = line.indexOf(":"); + if (colonIndex == -1) { + continue; } - for (int i = 1; i < positionalArgs.size(); ++i) { - if (i > 1) { - currentWord.currentPronunciation.append(","); + + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; } - final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll(""); - currentWord.currentPronunciation.append(pron).append(""); + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { + continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); } - currentWord.currentPronunciation.append(" (").append(name).append(")"); - return; - } - - if (name.equals("qualifier")) { - //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); - if (wikiBuilder == null) { - return; + + String rest = line.substring(colonIndex + 1).trim(); + if (rest.length() > 0) { + doTranslationLine(line, appendLang ? lang : null, title, pos, sense, rest); + } + + } else if (wikiTokenizer.remainderStartsWith("''See''")) { + wikiTokenizer.nextLine(); + LOG.fine("Skipping line: " + wikiTokenizer.token()); + } else if (wikiTokenizer.isWikiLink()) { + final String wikiLink = wikiTokenizer.wikiLinkText(); + if (wikiLink.contains(":") && wikiLink.contains(title)) { + } else if (wikiLink.contains("Category:")) { + } else { + LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { + } else { + final String token = wikiTokenizer.token(); + if (token.equals("----")) { + } else { + LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); } - wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); - return; - } - - if (name.equals("...")) { - // Skipping any elided text for brevity. - wikiBuilder.append("..."); - return; - } - - if (passThroughTemplates.contains(name)) { - assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append(name); - return; - } - - if (name.equals("audio") || name.equals("rhymes") || name.equals("hyphenation")) { - return; } - if ("Pronunciation".equals(currentHeading)) { - System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs); - return; - } - } // Pronunciation + } + } + + private static T get(final List list, final int index) { + return index < list.size() ? list.get(index) : null; + } + + private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) { + // Good chance we'll actually file this one... + final PairEntry pairEntry = new PairEntry(); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - // Part of speech - if (insidePartOfSpeech) { + final StringBuilder otherText = new StringBuilder(); + final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest, false); + while (wikiTokenizer.nextToken() != null) { - // form of - if (name.equals("form of")) { - namedArgs.remove("sc"); - if (positionalArgs.size() < 3 || positionalArgs.size() > 4) { - System.err.println("Invalid form of."); - } - final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3); - final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1)); - currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token)); - return; - } - - // The fallback plan: append the template! - if (wikiBuilder != null) { - wikiBuilder.append("{"); - boolean first = true; - for (final String arg : positionalArgs) { - if (!first) { - wikiBuilder.append(", "); + if (wikiTokenizer.isPlainText()) { + final String plainText = wikiTokenizer.token(); + otherText.append("").append(plainText); + otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + } else if (wikiTokenizer.isWikiLink()) { + final String plainText = wikiTokenizer.wikiLinkText(); + otherText.append("").append(plainText); + otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT); + + } else if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List args = wikiTokenizer.functionPositionArgs(); + final Map namedArgs = wikiTokenizer.functionNamedArgs(); + + if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø") || functionName.equals("apdx-t")) { + if (args.size() < 2) { + LOG.warning("{{t}} with too few args: " + line + ", title=" + title); + continue; } - first = false; - wikiBuilder.append(arg); - } - // This one isn't so useful. - for (final Map.Entry entry : namedArgs.entrySet()) { - if (!first) { - wikiBuilder.append(", "); + final String langCode = get(args, 0); + //if (this.langCodePattern.matcher(langCode).matches()) { + final String word = get(args, 1); + final String gender = get(args, 2); + final String transliteration = namedArgs.get("tr"); + if (otherText.length() > 0) { + otherText.append(""); + } + otherText.append(word); + otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + if (gender != null) { + otherText.append(String.format(" {%s}", gender)); + } + if (transliteration != null) { + otherText.append(String.format(TRANSLITERATION_FORMAT, transliteration)); + otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + } + //} + } else if (functionName.equals("qualifier")) { + if (args.size() == 0) { + otherText.append(wikiTokenizer.token()); + } else { + String qualifier = args.get(0); + if (!namedArgs.isEmpty() || args.size() > 1) { + LOG.warning("weird qualifier: " + line); + } + // Unindexed! + otherText.append("(").append(qualifier).append(")"); } - first = false; - wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue()); + } else if (encodings.contains(functionName)) { + otherText.append("").append(args.get(0)); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + } else if (isGender(functionName)) { + appendGender(otherText, functionName, args); + } else if (functionName.equals("g")) { + otherText.append("{g}"); + } else if (functionName.equals("l")) { + // encodes text in various langs. + // lang is arg 0. + otherText.append("").append(args.get(1)); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(1), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + // TODO: transliteration + } else if (functionName.equals("term")) { + // cross-reference to another dictionary + otherText.append("").append(args.get(0)); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + // TODO: transliteration + } else if (functionName.equals("italbrac") || functionName.equals("gloss")) { + // TODO: put this text aside to use it. + otherText.append("[").append(args.get(0)).append("]"); + otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + } else if (functionName.equals("ttbc")) { + LOG.warning("Unexpected {{ttbc}}"); + } else if (functionName.equals("trreq")) { + } else if (functionName.equals("not used")) { + otherText.append("(not used)"); + } else if (functionName.equals("t-image")) { + // American sign language + } else { + // Unindexed! + otherText.append(wikiTokenizer.token()); } - wikiBuilder.append("}"); + + } else if (wikiTokenizer.isNewline()) { + assert false; + } else if (wikiTokenizer.isComment()) { + } else if (wikiTokenizer.isMarkup()) { + } else { + LOG.warning("Bad translation token: " + wikiTokenizer.token()); } - - //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs); + } + if (otherText.length() == 0) { + LOG.warning("Empty otherText: " + line); return; - } // Part of speech - + } - // Translations - if (name.equals("trans-top")) { - assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs + title; - - if (currentPartOfSpeech == null) { - assert !currentWord.partsOfSpeech.isEmpty() : title; - System.err.println("Assuming last part of speech for non-nested translation section: " + title); - currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech); - } - - currentTranslationSense = new WikiWord.TranslationSense(); - currentPartOfSpeech.translationSenses.add(currentTranslationSense); - if (positionalArgs.size() > 1) { - currentTranslationSense.sense = positionalArgs.get(1); - } - return; - } // Translations + if (lang != null) { + otherText.insert(0, String.format("(%s) ", lang)); + } + + StringBuilder englishText = new StringBuilder(); + + englishText.append(title); + if (sense != null) { + englishText.append(" (").append(sense).append(")"); + enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); + } + if (pos != null) { + englishText.append(" (").append(pos.toLowerCase()).append(")"); + } + enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap); + pairEntry.pairs.add(pair); + if (!pairsAdded.add(pair.toString())) { + LOG.warning("Duplicate pair: " + pair.toString()); + } + if (pair.toString().equals("libero {m} :: free (adjective)")) { + System.out.println(); + } - if (wikiBuilder == null) { - return; - } - if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) { - assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); - wikiBuilder.append("{"); - for (int i = 1; i < positionalArgs.size(); ++i) { - wikiBuilder.append(i > 1 ? "," : ""); - wikiBuilder.append(positionalArgs.get(i)); - } - wikiBuilder.append(name).append("}"); - - } else if (name.equals("p")) { - assert positionalArgs.size() == 1 && namedArgs.isEmpty(); - wikiBuilder.append("pl."); + } - } else if (name.equals("s")) { - assert positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"); - wikiBuilder.append("sg."); - - } else if (grammarTemplates.contains(name)) { - assert positionalArgs.size() == 1 && namedArgs.isEmpty(); - wikiBuilder.append(name).append("."); - } else if (name.equals("l")) { - // This template is designed to generate a link to a specific language-section on the target page. - wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2)); - - } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) { - if (positionalArgs.size() > 2) { - wikiBuilder.append(positionalArgs.get(2)); - } - for (int i = 3; i < positionalArgs.size(); ++i) { - wikiBuilder.append(i == 3 ? " {" : ","); - wikiBuilder.append(positionalArgs.get(i)); - wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : ""); - } - final String transliteration = namedArgs.remove("tr"); - if (transliteration != null) { - wikiBuilder.append(" (").append(transliteration).append(")"); - } - - } else if (name.equals("trreq")) { - wikiBuilder.append("{{trreq}}"); - - } else if (name.equals("qualifier")) { - //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); - wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); - - } else if (useRemainingArgTemplates.contains(name)) { - for (int i = 1; i < positionalArgs.size(); ++i) { - if (i != 1) { - wikiBuilder.append(", "); + private void appendGender(final StringBuilder otherText, + final String functionName, final List args) { + otherText.append("{"); + otherText.append(functionName); + for (int i = 0; i < args.size(); ++i) { + otherText.append("|").append(args.get(i)); + } + otherText.append("}"); + } + + + private boolean isGender(final String functionName) { + return functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p"); + } + + Set pairsAdded = new LinkedHashSet(); + + // ------------------------------------------------------------------------- + + private void doForeignWord(final String lang, final String title, final String text) { + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + final String headingName = wikiTokenizer.headingWikiText(); + if (headingName.equals("Translations")) { + LOG.warning("Translations not in English section: " + title); + } else if (headingName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } else if (partOfSpeechHeader.matcher(headingName).matches()) { + doForeignPartOfSpeech(lang, title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); } - wikiBuilder.append(positionalArgs.get(i)); - } - } else if (ignoreTemplates.contains(name)) { - // Do nothing. - - } else if (name.equals("initialism")) { - assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append("Initialism"); - } else if (name.equals("abbreviation")) { - assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append("Abbreviation"); - } else if (name.equals("acronym")) { - assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; - wikiBuilder.append("Acronym"); - } else { - if (currentTranslationSense != null) { - System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs); + } else { } } } + + static final class ListSection { + final String firstPrefix; + final String firstLine; + final List nextPrefixes = new ArrayList(); + final List nextLines = new ArrayList(); + + public ListSection(String firstPrefix, String firstLine) { + this.firstPrefix = firstPrefix; + this.firstLine = firstLine; + } - @Override - public void onText(String text) { - if (wikiBuilder != null) { - wikiBuilder.append(text); - return; + @Override + public String toString() { + return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; } } - @Override - public void onHeadingStart(int depth) { - wikiBuilder = new StringBuilder(); - currentDepth = depth; - if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { - currentPartOfSpeech = null; - insidePartOfSpeech = false; + + int foreignCount = 0; + private void doForeignPartOfSpeech(final String lang, String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { + if (++foreignCount % 1000 == 0) { + LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); } - if (currentWord != null && depth <= currentWord.depth) { - currentWord = null; + if (title.equals("moro")) { + System.out.println(); } - currentHeading = null; - } - - @Override - public void onHeadingEnd(int depth) { - final String name = wikiBuilder.toString().trim(); - wikiBuilder = null; - currentTranslationSense = null; - currentHeading = name; + final StringBuilder foreignBuilder = new StringBuilder(); + final Collection wordForms = new ArrayList(); + final List listSections = new ArrayList(); - final boolean lang0 = langPatterns[0].matcher(name).matches(); - final boolean lang1 = langPatterns[1].matcher(name).matches(); - if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) { - currentWord = new WikiWord(title, depth); - if (lang0 && lang1) { - System.err.println("Word is indexed in both index1 and index2: " + title); - } - currentWord.language = name; - currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1); - words.add(currentWord); - return; - } + try { - if (currentWord == null) { - return; - } + ListSection lastListSection = null; - if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { - currentPartOfSpeech = null; - } + int currentHeadingDepth = posDepth; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + currentHeadingDepth = wikiTokenizer.headingDepth(); + + if (currentHeadingDepth <= posDepth) { + wikiTokenizer.returnToLineStart(); + return; + } + } + + if (currentHeadingDepth > posDepth) { + // TODO: deal with other neat info sections + continue; + } + + if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + final List args = wikiTokenizer.functionPositionArgs(); + final Map namedArgs = wikiTokenizer.functionNamedArgs(); + // First line is generally a repeat of the title with some extra information. + // We need to build up the left side (foreign text, tokens) separately from the + // right side (English). The left-side may get paired with multiple right sides. + // The left side should get filed under every form of the word in question (singular, plural). + + // For verbs, the conjugation comes later on in a deeper section. + // Ideally, we'd want to file every English entry with the verb + // under every verb form coming from the conjugation. + // Ie. under "fa": see: "make :: fare" and "do :: fare" + // But then where should we put the conjugation table? + // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) + // for the conjugation table from "fa". + // Would like to be able to link to a lang#token. + if (isGender(name)) { + appendGender(foreignBuilder, name, args); + } else if (name.equals("wikipedia")) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + } else if (args.size() == 1) { + foreignBuilder.append(wikiTokenizer.token()); + } else { + //foreignBuilder.append(title); + } + } else if (name.equals("attention") || name.equals("zh-attention")) { + // See: http://en.wiktionary.org/wiki/Template:attention + // Ignore these. + } else if (name.equals("infl")) { + // See: http://en.wiktionary.org/wiki/Template:infl + final String langCode = get(args, 0); + namedArgs.remove("sc"); + final String tr = namedArgs.remove("tr"); + final String g = namedArgs.remove("g"); + final String g2 = namedArgs.remove("g2"); + final String g3 = namedArgs.remove("g3"); + if (!namedArgs.isEmpty()) { + LOG.warning("Didn't parse infl: " + wikiTokenizer.token()); + foreignBuilder.append(wikiTokenizer.token()); + } else { + String head = namedArgs.get("head"); + if (head == null) { + head = title; + } else { + head = WikiTokenizer.toPlainText(head); + } + foreignBuilder.append(head); - insidePartOfSpeech = false; - if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) { - currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name); - currentWord.partsOfSpeech.add(currentPartOfSpeech); - insidePartOfSpeech = true; - return; - } + if (g != null) { + foreignBuilder.append(" {").append(g); + if (g2 != null) { + foreignBuilder.append("|").append(g2); + } + if (g3 != null) { + foreignBuilder.append("|").append(g3); + } + foreignBuilder.append("}"); + } - if (name.equals("Translations")) { - if (currentWord == null || - !currentWord.language.equals("English") || - currentPartOfSpeech == null) { - System.err.println("Unexpected Translations section: " + title); - return; + if (tr != null) { + foreignBuilder.append(String.format(TRANSLITERATION_FORMAT, tr)); + wordForms.add(tr); + } + + final String pos = get(args, 1); + if (pos != null) { + foreignBuilder.append(" (").append(pos).append(")"); + } + for (int i = 2; i < args.size(); i += 2) { + final String inflName = get(args, i); + final String inflValue = get(args, i + 1); + foreignBuilder.append(", ").append(WikiTokenizer.toPlainText(inflName)); + if (inflValue != null && inflValue.length() > 0) { + foreignBuilder.append(": ").append(WikiTokenizer.toPlainText(inflValue)); + wordForms.add(inflValue); + } + } + } + } else if (name.equals("it-noun")) { + final String base = get(args, 0); + final String gender = get(args, 1); + final String singular = base + get(args, 2); + final String plural = base + get(args, 3); + foreignBuilder.append(String.format(" %s {%s}, %s {pl}", singular, gender, plural, plural)); + wordForms.add(singular); + wordForms.add(plural); + } else if (name.equals("it-proper noun")) { + foreignBuilder.append(wikiTokenizer.token()); + } else if (name.equals("it-adj")) { + foreignBuilder.append(wikiTokenizer.token()); + } else if (name.startsWith("it-conj")) { + if (name.equals("it-conj-are")) { + itConjAre(args, namedArgs); + } else if (name.equals("it-conj-ere")) { + } else if (name.equals("it-conj-ire")) { + } else { + LOG.warning("Unknown conjugation: " + wikiTokenizer.token()); + } + } else { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + // LOG.warning("Unknown function: " + wikiTokenizer.token()); + } + + } else if (wikiTokenizer.isListItem()) { + final String prefix = wikiTokenizer.listItemPrefix(); + if (lastListSection != null && + prefix.startsWith(lastListSection.firstPrefix) && + prefix.length() > lastListSection.firstPrefix.length()) { + lastListSection.nextPrefixes.add(prefix); + lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); + } else { + lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); + listSections.add(lastListSection); + } + } else if (lastListSection != null) { + // Don't append anything after the lists, because there's crap. + } else if (wikiTokenizer.isWikiLink()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.wikiLinkText()); + + } else if (wikiTokenizer.isPlainText()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + + } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) { + // Do nothing. + } else { + LOG.warning("Unexpected token: " + wikiTokenizer.token()); } - currentTranslationSense = new WikiWord.TranslationSense(); } - } - - @Override - public void onListItemStart(String header, int[] section) { - wikiBuilder = new StringBuilder(); - if (currentWord != null) { - currentWord.currentPronunciation = null; + } finally { + // Here's where we exit. + // Should we make an entry even if there are no foreign list items? + String foreign = foreignBuilder.toString().trim(); + if (!foreign.toLowerCase().startsWith(title.toLowerCase())) { + foreign = String.format("%s %s", title, foreign); + } + if (!langPattern.matcher(lang).matches()) { + foreign = String.format("(%s) %s", lang, foreign); + } + for (final ListSection listSection : listSections) { + doForeignListItem(foreign, title, wordForms, listSection); + } } } + + static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile( + "(first|second|third)-person (singular|plural)|" + + "present tense|" + + "imperative" + ); + - @Override - public void onListItemEnd(String header, int[] section) { - String item = wikiBuilder.toString().trim(); - final String oldItem = item; - if (item.length() == 0) { + private void doForeignListItem(final String foreignText, String title, final Collection forms, final ListSection listSection) { + + final String prefix = listSection.firstPrefix; + if (prefix.length() > 1) { + // Could just get looser and say that any prefix longer than first is a sublist. + LOG.warning("Prefix too long: " + listSection); return; } - item = WikiParser.simpleParse(item); - wikiBuilder = null; - - // Part of speech - if (insidePartOfSpeech) { - assert currentPartOfSpeech != null : title + item; - if (header.equals("#") || - header.equals("##") || - header.equals("###") || - header.equals("####") || - header.equals(":#") || - header.equals("::") || - header.equals(":::*")) { - // Definition. - // :: should append, probably. - currentPartOfSpeech.newMeaning().meaning = item; - - // Source - } else if (header.equals("#*") || - header.equals("##*") || - header.equals("###*")) { - currentPartOfSpeech.lastMeaning().newExample().source = item; - - // Example - } else if (header.equals("#:") || - header.equals("#*:") || - header.equals("#:*") || - header.equals("##:") || - header.equals("##*:") || - header.equals("#:*:") || - header.equals("#:*#") || - header.equals("#*:") || - header.equals("*:") || - header.equals("#:::") || - header.equals("#**") || - header.equals("#*:::") || - header.equals("#:#") || - header.equals(":::") || - header.equals("##:*") || - header.equals("###*:")) { - StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item); - - // Example in English - } else if (header.equals("#::") || - header.equals("#*::") || - header.equals("#:**") || - header.equals("#*#") || - header.equals("##*::")) { - StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item); - - // Skip - } else if (header.equals("*") || - header.equals("**") || - header.equals("***") || - header.equals("*#") || - header.equals(":") || - header.equals("::*") || - header.equals("#**") || - header.equals(":*") || - header.equals("#*:*") || - header.equals("#*:**") || - header.equals("#*:#") || - header.equals("#*:*:") || - header.equals("#*:*") || - header.equals(";")) { - // might have: * {{seeCites}} - // * [[w:Arabic numerals|Arabic numerals]]: 2 - //assert item.trim().length() == 0; - System.err.println("Skipping meaning: " + header + " " + item); + + final PairEntry pairEntry = new PairEntry(); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + + final StringBuilder englishBuilder = new StringBuilder(); + + final String mainLine = listSection.firstLine; + + final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); + while (englishTokenizer.nextToken() != null) { + // TODO handle form of.... + if (englishTokenizer.isPlainText()) { + englishBuilder.append(englishTokenizer.token()); + enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF); + } else if (englishTokenizer.isWikiLink()) { + final String text = englishTokenizer.wikiLinkText(); + final String link = englishTokenizer.wikiLinkDest(); + if (link != null) { + if (link.contains("#English")) { + englishBuilder.append(text); + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } else if (link.contains("#") && this.langPattern.matcher(link).find()) { + englishBuilder.append(text); + otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else if (link.equals("plural")) { + englishBuilder.append(text); + } else { + //LOG.warning("Special link: " + englishTokenizer.token()); + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + englishBuilder.append(text); + } + } else { + // link == null + englishBuilder.append(text); + if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) { + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } + } + } else if (englishTokenizer.isFunction()) { + final String name = englishTokenizer.functionName(); + if (name.contains("conjugation of ") || + name.contains("form of ") || + name.contains("feminine of ") || + name.contains("plural of ")) { + // Ignore these in the index, they're really annoying.... + englishBuilder.append(englishTokenizer.token()); + } else { + englishBuilder.append(englishTokenizer.token()); +// LOG.warning("Unexpected function: " + englishTokenizer.token()); + } } else { - if (title.equals("Yellowknife")) { - return; + if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { + } else { + LOG.warning("Unexpected definition text: " + englishTokenizer.token()); } - System.err.println("Busted heading: " + title + " "+ header + " " + item); } - return; } - // Part of speech + + final String english = trim(englishBuilder.toString()); + if (english.length() > 0) { + final Pair pair = new Pair(english, trim(foreignText), this.swap); + pairEntry.pairs.add(pair); + otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + for (final String form : forms) { + otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI); + } + } - // Translation - if (currentTranslationSense != null) { - if (item.indexOf("{{[trreq]{}}}") != -1) { - return; + // Do examples. + String lastForeign = null; + for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { + final String nextPrefix = listSection.nextPrefixes.get(i); + final String nextLine = listSection.nextLines.get(i); + int dash = nextLine.indexOf("—"); + int mdashLen = 7; + if (dash == -1) { + dash = nextLine.indexOf("—"); + mdashLen = 1; } - - if (currentPartOfSpeech.translationSenses.isEmpty()) { - currentPartOfSpeech.translationSenses.add(currentTranslationSense); + if (dash == -1) { + dash = nextLine.indexOf(" - "); + mdashLen = 3; } - - final int colonPos = item.indexOf(':'); - if (colonPos == -1) { - System.err.println("Invalid translation: title=" + title + ", item=" + item); - return; + + if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { + final String foreignEx = nextLine.substring(0, dash); + final String englishEx = nextLine.substring(dash + mdashLen); + final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, otherIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")){ + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { + if (lastForeign != null && pairEntry.pairs.size() > 0) { + pairEntry.pairs.remove(pairEntry.pairs.size() - 1); + final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else { + LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); + // TODO: add something. + } + } else if (nextPrefix.equals("#*")) { + // Can't really index these. + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } +// } else { +// assert false; } - final String lang = item.substring(0, colonPos); - final String trans = item.substring(colonPos + 1).trim(); - for (int i = 0; i < 2; ++i) { - if (langPatterns[i].matcher(lang).find()) { - currentTranslationSense.translations.get(i).add(new Translation(lang, trans)); + } + } + + private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) { + final WikiTokenizer wikiTokenizer = new WikiTokenizer(example, false); + final StringBuilder builder = new StringBuilder(); + boolean insideTripleQuotes = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isPlainText()) { + builder.append(wikiTokenizer.token()); + if (indexBuilder != null) { + indexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.token(), EntryTypeName.WIKTIONARY_EXAMPLE); } + } else if (wikiTokenizer.isWikiLink()) { + final String text = wikiTokenizer.wikiLinkText().replaceAll("'", ""); + builder.append(text); + if (indexBuilder != null) { + indexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_EXAMPLE); + } + } else if (wikiTokenizer.isFunction()) { + builder.append(wikiTokenizer.token()); + } else if (wikiTokenizer.isMarkup()) { + if (wikiTokenizer.token().equals("'''")) { + insideTripleQuotes = !insideTripleQuotes; + } + } else if (wikiTokenizer.isComment() || wikiTokenizer.isNewline()) { + // Do nothing. + } else { + LOG.warning("unexpected token: " + wikiTokenizer.token()); } - } // Translation + } + final String result = trim(builder.toString()); + return result.length() > 0 ? result : "--"; } - @Override - public void onNewLine() { - } - @Override - public void onNewParagraph() { + private void itConjAre(List args, Map namedArgs) { + final String base = args.get(0); + final String aux = args.get(1); + + putIfMissing(namedArgs, "inf", base + "are"); + putIfMissing(namedArgs, "aux", aux); + putIfMissing(namedArgs, "ger", base + "ando"); + putIfMissing(namedArgs, "presp", base + "ante"); + putIfMissing(namedArgs, "pastp", base + "ato"); + // Present + putIfMissing(namedArgs, "pres1s", base + "o"); + putIfMissing(namedArgs, "pres2s", base + "i"); + putIfMissing(namedArgs, "pres3s", base + "a"); + putIfMissing(namedArgs, "pres1p", base + "iamo"); + putIfMissing(namedArgs, "pres2p", base + "ate"); + putIfMissing(namedArgs, "pres3p", base + "ano"); + // Imperfect + putIfMissing(namedArgs, "imperf1s", base + "avo"); + putIfMissing(namedArgs, "imperf2s", base + "avi"); + putIfMissing(namedArgs, "imperf3s", base + "ava"); + putIfMissing(namedArgs, "imperf1p", base + "avamo"); + putIfMissing(namedArgs, "imperf2p", base + "avate"); + putIfMissing(namedArgs, "imperf3p", base + "avano"); + // Passato remoto + putIfMissing(namedArgs, "prem1s", base + "ai"); + putIfMissing(namedArgs, "prem2s", base + "asti"); + putIfMissing(namedArgs, "prem3s", base + "ò"); + putIfMissing(namedArgs, "prem1p", base + "ammo"); + putIfMissing(namedArgs, "prem2p", base + "aste"); + putIfMissing(namedArgs, "prem3p", base + "arono"); + // Future + putIfMissing(namedArgs, "fut1s", base + "erò"); + putIfMissing(namedArgs, "fut2s", base + "erai"); + putIfMissing(namedArgs, "fut3s", base + "erà"); + putIfMissing(namedArgs, "fut1p", base + "eremo"); + putIfMissing(namedArgs, "fut2p", base + "erete"); + putIfMissing(namedArgs, "fut3p", base + "eranno"); + // Conditional + putIfMissing(namedArgs, "cond1s", base + "erei"); + putIfMissing(namedArgs, "cond2s", base + "eresti"); + putIfMissing(namedArgs, "cond3s", base + "erebbe"); + putIfMissing(namedArgs, "cond1p", base + "eremmo"); + putIfMissing(namedArgs, "cond2p", base + "ereste"); + putIfMissing(namedArgs, "cond3p", base + "erebbero"); + // Subjunctive / congiuntivo + putIfMissing(namedArgs, "sub123s", base + "i"); + putIfMissing(namedArgs, "sub1p", base + "iamo"); + putIfMissing(namedArgs, "sub2p", base + "iate"); + putIfMissing(namedArgs, "sub3p", base + "ino"); + // Imperfect subjunctive + putIfMissing(namedArgs, "impsub12s", base + "assi"); + putIfMissing(namedArgs, "impsub3s", base + "asse"); + putIfMissing(namedArgs, "impsub1p", base + "assimo"); + putIfMissing(namedArgs, "impsub2p", base + "aste"); + putIfMissing(namedArgs, "impsub3p", base + "assero"); + // Imperative + putIfMissing(namedArgs, "imp2s", base + "a"); + putIfMissing(namedArgs, "imp3s", base + "i"); + putIfMissing(namedArgs, "imp1p", base + "iamo"); + putIfMissing(namedArgs, "imp2p", base + "ate"); + putIfMissing(namedArgs, "imp3p", base + "ino"); + + + itConj(args, namedArgs); } - // ---------------------------------------------------------------------- - - @Override - public void onComment(String text) { - } - @Override - public void onFormatBold(boolean boldOn) { + private void itConj(List args, Map namedArgs) { + // TODO Auto-generated method stub + } - @Override - public void onFormatItalic(boolean italicOn) { - } - @Override - public void onUnterminated(String start, String rest) { - throw new RuntimeException(start + rest); + private static void putIfMissing(final Map namedArgs, final String key, + final String value) { + final String oldValue = namedArgs.get(key); + if (oldValue == null || oldValue.length() == 0) { + namedArgs.put(key, value); + } } - @Override - public void onInvalidHeaderEnd(String rest) { - throw new RuntimeException(rest); + + // TODO: check how ='' and =| are manifested.... + // TODO: get this right in -are + private static void putOrNullify(final Map namedArgs, final String key, + final String value) { + final String oldValue = namedArgs.get(key); + if (oldValue == null/* || oldValue.length() == 0*/) { + namedArgs.put(key, value); + } else { + if (oldValue.equals("''")) { + namedArgs.put(key, ""); + } + } } + static final Pattern whitespace = Pattern.compile("\\s+"); + static String trim(final String s) { + return whitespace.matcher(s).replaceAll(" ").trim(); + } + + }