From: Thad Hughes Date: Fri, 10 Feb 2012 19:43:15 +0000 (-0800) Subject: Unit tests working again after refactoring!!! X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=7d5ada9329d101b59b55691dd2f63ce3e3860011 Unit tests working again after refactoring!!! --- diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 41a4492..10c52a5 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -31,11 +31,12 @@ import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import com.hughes.android.dictionary.parser.DictFileParser; -import com.hughes.android.dictionary.parser.wiktionary.EnWiktionaryXmlParser; +import com.hughes.android.dictionary.parser.Parser; +import com.hughes.android.dictionary.parser.wiktionary.EnForeignParser; +import com.hughes.android.dictionary.parser.wiktionary.EnToTranslationParser; import com.hughes.util.Args; import com.hughes.util.FileUtil; - public class DictionaryBuilder { public final Dictionary dictionary; @@ -126,6 +127,11 @@ public class DictionaryBuilder { if (inputName == null) { fatalError("Must specify human readable name for: " + prefix + "Name"); } + String pageLimitString = keyValueArgs.remove(prefix + "PageLimit"); + if (pageLimitString == null) { + pageLimitString = "-1"; + } + final int pageLimit = Integer.parseInt(pageLimitString); final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0); System.out.println(""); @@ -133,24 +139,31 @@ public class DictionaryBuilder { String inputFormat = keyValueArgs.remove(prefix + "Format"); if ("tab_separated".equals(inputFormat)) { final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); - new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file, entrySource); + new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit); } else if ("chemnitz".equals(inputFormat)) { final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); - new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file, entrySource); + new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit); } else if ("enwiktionary".equals(inputFormat)) { + final String type = keyValueArgs.remove(prefix + "WiktionaryType"); final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE); final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern")); final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1; - String pageLimit = keyValueArgs.remove(prefix + "PageLimit"); - if (pageLimit == null) { - pageLimit = "-1"; - } if (enIndex < 0 || enIndex >= 2) { fatalError("Must be 1 or 2: " + prefix + "EnIndex"); } - new EnWiktionaryXmlParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), - langPattern, langCodePattern, enIndex != 0).parse(file, entrySource, Integer.parseInt(pageLimit)); + final Parser parser; + if ("EnToTranslation".equals(type)) { + parser = new EnToTranslationParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), + langPattern, langCodePattern, enIndex != 0); + } else if ("EnForeign".equals(type)) { + parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), + langPattern, langCodePattern, enIndex != 0); + } else { + fatalError("Invalid WiktionaryType (use EnToTranslation or EnForeign): " + type); + return; + } + parser.parse(file, entrySource, pageLimit); } else { fatalError("Invalid or missing input format: " + inputFormat); } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 201ac59..0731ffb 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -93,14 +93,14 @@ public class DictionaryBuilderMain extends TestCase { String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)), String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.\n\n%s", foreignIso, isoToDedication.get(foreignIso)), - "--input2=" + INPUTS + "enWikiSplit/" + foreignIso + ".data", + "--input2=" + INPUTS + "eikiSplit/en/" + foreignIso + ".data", "--input2Name=enwiktionary." + foreignIso, "--input2Format=enwiktionary", "--input2LangPattern=" + isoToRegex.get(foreignIso), "--input2LangCodePattern=" + foreignIso.toLowerCase(), "--input2EnIndex=1", - "--input3=" + INPUTS + "enWikiSplit/EN.data", + "--input3=" + INPUTS + "wikiSplit/en/EN.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", "--input3LangPattern=" + isoToRegex.get(foreignIso), @@ -125,14 +125,14 @@ public class DictionaryBuilderMain extends TestCase { "--input4Charset=UTF8", "--input4Format=chemnitz", - "--input2=" + INPUTS + "enWikiSplit/DE.data", + "--input2=" + INPUTS + "wikiSplit/en/DE.data", "--input2Name=enwiktionary.DE", "--input2Format=enwiktionary", "--input2LangPattern=German", "--input2LangCodePattern=de", "--input2EnIndex=2", - "--input3=" + INPUTS + "enWikiSplit/EN.data", + "--input3=" + INPUTS + "wikiSplit/en/EN.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", "--input3LangPattern=German", diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 7f1beba..a73b1b1 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -85,6 +85,7 @@ public class DictionaryBuilderTest extends TestCase { final String langPattern, final String langCode) throws Exception { final File result = new File(TEST_OUTPUTS + name); System.out.println("Writing to: " + result); + final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign"; DictionaryBuilder.main(new String[] { "--dictOut=" + result.getAbsolutePath(), "--lang1=" + lang1, @@ -96,6 +97,7 @@ public class DictionaryBuilderTest extends TestCase { "--input4=" + WIKISPLIT + data, "--input4Name=" + dictName, "--input4Format=enwiktionary", + "--input4WiktionaryType=" + type, "--input4LangPattern=" + langPattern, "--input4LangCodePattern=" + langCode, "--input4EnIndex=2", diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index f4f68c8..b435b4a 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -38,7 +38,7 @@ import com.hughes.android.dictionary.engine.Language; import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; -public class DictFileParser { +public class DictFileParser implements Parser { static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); @@ -89,12 +89,16 @@ public class DictFileParser { this.bothIndexBuilder = bothIndexBuilder; } - public void parseFile(final File file, final EntrySource entrySouce) throws IOException { + @Override + public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { this.entrySource = entrySouce; final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); String line; int count = 0; while ((line = reader.readLine()) != null) { + if (pageLimit >= 0 && count >= pageLimit) { + return; + } if (count % 10000 == 0) { logger.info("count=" + count + ", line=" + line); } diff --git a/src/com/hughes/android/dictionary/parser/Parser.java b/src/com/hughes/android/dictionary/parser/Parser.java index 0ca30a6..b0f2e96 100644 --- a/src/com/hughes/android/dictionary/parser/Parser.java +++ b/src/com/hughes/android/dictionary/parser/Parser.java @@ -1,3 +1,17 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.parser; import java.io.File; diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java index c3bd2ff..4f5d362 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java @@ -53,6 +53,8 @@ public abstract class AbstractWiktionaryParser implements Parser { abstract void parseSection(final String heading, final String text); + abstract void removeUselessArgs(final Map namedArgs); + @Override public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { this.entrySource = entrySource; @@ -99,7 +101,7 @@ public abstract class AbstractWiktionaryParser implements Parser { public void incrementCount(final String string) { AtomicInteger counter = counters.get(string); - if (counter != null) { + if (counter == null) { counter = new AtomicInteger(); counters.put(string, counter); } @@ -109,7 +111,7 @@ public abstract class AbstractWiktionaryParser implements Parser { // ------------------------------------------------------------------------- - static final class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { + static class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { final T parser; StringBuilder builder; @@ -180,7 +182,7 @@ public abstract class AbstractWiktionaryParser implements Parser { FunctionCallback functionCallback = functionCallbacks.get(name); if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) { // Default function handling: -// namedArgs.keySet().removeAll(EnWiktionaryXmlParser.USELESS_WIKI_ARGS); + parser.removeUselessArgs(namedArgs); final boolean single = args.isEmpty() && namedArgs.isEmpty(); builder.append(single ? "{" : "{{"); @@ -192,7 +194,7 @@ public abstract class AbstractWiktionaryParser implements Parser { builder.append(single ? "}" : "}}"); } } - + @Override public void onHtml(WikiTokenizer wikiTokenizer) { // Unindexed for now. diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java new file mode 100644 index 0000000..ec0350e --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java @@ -0,0 +1,395 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.parser.wiktionary; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.PairEntry; +import com.hughes.android.dictionary.engine.PairEntry.Pair; +import com.hughes.android.dictionary.parser.WikiTokenizer; + +public final class EnForeignParser extends EnParser { + + public EnForeignParser(final IndexBuilder enIndexBuilder, + final IndexBuilder otherIndexBuilder, final Pattern langPattern, + final Pattern langCodePattern, final boolean swap) { + super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); + } + + @Override + void parseSection(String heading, String text) { + if (isIgnorableTitle(title)) { + return; + } + final String lang = heading.replaceAll("=", "").trim(); + if (!langPattern.matcher(lang).find()){ + return; + } + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + final String headingName = wikiTokenizer.headingWikiText(); + if (headingName.equals("Translations")) { + LOG.warning("Translations not in English section: " + title); + incrementCount("WARNING: Translations not in English section"); + } else if (headingName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } else if (partOfSpeechHeader.matcher(headingName).matches()) { + doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + } + } else { + // It's not a heading. + // TODO: optimization: skip to next heading. + } + } + } + + static final class ListSection { + final String firstPrefix; + final String firstLine; + final List nextPrefixes = new ArrayList(); + final List nextLines = new ArrayList(); + + public ListSection(String firstPrefix, String firstLine) { + this.firstPrefix = firstPrefix; + this.firstLine = firstLine; + } + + @Override + public String toString() { + return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; + } + } + + int foreignCount = 0; + private void doForeignPartOfSpeech(final String lang, String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { + if (++foreignCount % 1000 == 0) { + LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); + } + if (title.equals("6")) { + System.out.println(); + } + + final StringBuilder foreignBuilder = new StringBuilder(); + final List listSections = new ArrayList(); + + appendAndIndexWikiCallback.reset(foreignBuilder, null); + this.state = State.ENGLISH_DEF_OF_FOREIGN; // TODO: this is wrong, need new category.... + titleAppended = false; + wordForms.clear(); + + try { + + EnForeignParser.ListSection lastListSection = null; + + int currentHeadingDepth = posDepth; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + currentHeadingDepth = wikiTokenizer.headingDepth(); + + if (currentHeadingDepth <= posDepth) { + wikiTokenizer.returnToLineStart(); + return; + } + } // heading + + if (currentHeadingDepth > posDepth) { + // TODO: deal with other neat info sections inside POS + continue; + } + + if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + final List args = wikiTokenizer.functionPositionArgs(); + final Map namedArgs = wikiTokenizer.functionNamedArgs(); + // First line is generally a repeat of the title with some extra information. + // We need to build up the left side (foreign text, tokens) separately from the + // right side (English). The left-side may get paired with multiple right sides. + // The left side should get filed under every form of the word in question (singular, plural). + + // For verbs, the conjugation comes later on in a deeper section. + // Ideally, we'd want to file every English entry with the verb + // under every verb form coming from the conjugation. + // Ie. under "fa": see: "make :: fare" and "do :: fare" + // But then where should we put the conjugation table? + // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) + // for the conjugation table from "fa". + // Would like to be able to link to a lang#token. + + appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs); + + } else if (wikiTokenizer.isListItem()) { + final String prefix = wikiTokenizer.listItemPrefix(); + if (lastListSection != null && + prefix.startsWith(lastListSection.firstPrefix) && + prefix.length() > lastListSection.firstPrefix.length()) { + lastListSection.nextPrefixes.add(prefix); + lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); + } else { + lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); + listSections.add(lastListSection); + } + } else if (lastListSection != null) { + // Don't append anything after the lists, because there's crap. + } else if (wikiTokenizer.isWikiLink()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.wikiLinkText()); + + } else if (wikiTokenizer.isPlainText()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + + } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) { + // Do nothing. + } else { + LOG.warning("Unexpected token: " + wikiTokenizer.token()); + assert false; + } + } + + } finally { + // Here's where we exit. + // Should we make an entry even if there are no foreign list items? + String foreign = foreignBuilder.toString().trim(); + if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) { + foreign = String.format("%s %s", title, foreign); + } + if (!langPattern.matcher(lang).matches()) { + foreign = String.format("(%s) %s", lang, foreign); + } + for (final EnForeignParser.ListSection listSection : listSections) { + doForeignListSection(foreign, title, wordForms, listSection); + } + } + } + + private void doForeignListSection(final String foreignText, String title, final Collection forms, final EnForeignParser.ListSection listSection) { + state = State.ENGLISH_DEF_OF_FOREIGN; + final String prefix = listSection.firstPrefix; + if (prefix.length() > 1) { + // Could just get looser and say that any prefix longer than first is a sublist. + LOG.warning("Prefix too long: " + listSection); + incrementCount("WARNING: Prefix too long"); + return; + } + + final PairEntry pairEntry = new PairEntry(entrySource); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + + entryIsFormOfSomething = false; + final StringBuilder englishBuilder = new StringBuilder(); + final String mainLine = listSection.firstLine; + appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry); + appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); + + final String english = trim(englishBuilder.toString()); + if (english.length() > 0) { + final Pair pair = new Pair(english, trim(foreignText), this.swap); + pairEntry.pairs.add(pair); + foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI); + for (final String form : forms) { + foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); + } + } + + // Do examples. + String lastForeign = null; + for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { + final String nextPrefix = listSection.nextPrefixes.get(i); + final String nextLine = listSection.nextLines.get(i); + + // TODO: This splitting is not sensitive to wiki code. + int dash = nextLine.indexOf("—"); + int mdashLen = 7; + if (dash == -1) { + dash = nextLine.indexOf("—"); + mdashLen = 1; + } + if (dash == -1) { + dash = nextLine.indexOf(" - "); + mdashLen = 3; + } + + if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { + final String foreignEx = nextLine.substring(0, dash); + final String englishEx = nextLine.substring(dash + mdashLen); + final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")){ + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { + if (lastForeign != null && pairEntry.pairs.size() > 0) { + pairEntry.pairs.remove(pairEntry.pairs.size() - 1); + final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else { + LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } + } + } else if (nextPrefix.equals("#*")) { + // Can't really index these. + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } +// } else { +// assert false; + } + } + } + + private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) { + // TODO: +// if (wikiTokenizer.token().equals("'''")) { +// insideTripleQuotes = !insideTripleQuotes; +// } + final StringBuilder builder = new StringBuilder(); + appendAndIndexWikiCallback.reset(builder, indexedEntry); + appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE; + appendAndIndexWikiCallback.entryTypeNameSticks = true; + try { + // TODO: this is a hack needed because we don't safely split on the dash. + appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); + } catch (AssertionError e) { + return "--"; + } + final String result = trim(builder.toString()); + return result.length() > 0 ? result : "--"; + } + + + private void itConjAre(List args, Map namedArgs) { + final String base = args.get(0); + final String aux = args.get(1); + + putIfMissing(namedArgs, "inf", base + "are"); + putIfMissing(namedArgs, "aux", aux); + putIfMissing(namedArgs, "ger", base + "ando"); + putIfMissing(namedArgs, "presp", base + "ante"); + putIfMissing(namedArgs, "pastp", base + "ato"); + // Present + putIfMissing(namedArgs, "pres1s", base + "o"); + putIfMissing(namedArgs, "pres2s", base + "i"); + putIfMissing(namedArgs, "pres3s", base + "a"); + putIfMissing(namedArgs, "pres1p", base + "iamo"); + putIfMissing(namedArgs, "pres2p", base + "ate"); + putIfMissing(namedArgs, "pres3p", base + "ano"); + // Imperfect + putIfMissing(namedArgs, "imperf1s", base + "avo"); + putIfMissing(namedArgs, "imperf2s", base + "avi"); + putIfMissing(namedArgs, "imperf3s", base + "ava"); + putIfMissing(namedArgs, "imperf1p", base + "avamo"); + putIfMissing(namedArgs, "imperf2p", base + "avate"); + putIfMissing(namedArgs, "imperf3p", base + "avano"); + // Passato remoto + putIfMissing(namedArgs, "prem1s", base + "ai"); + putIfMissing(namedArgs, "prem2s", base + "asti"); + putIfMissing(namedArgs, "prem3s", base + "ò"); + putIfMissing(namedArgs, "prem1p", base + "ammo"); + putIfMissing(namedArgs, "prem2p", base + "aste"); + putIfMissing(namedArgs, "prem3p", base + "arono"); + // Future + putIfMissing(namedArgs, "fut1s", base + "erò"); + putIfMissing(namedArgs, "fut2s", base + "erai"); + putIfMissing(namedArgs, "fut3s", base + "erà"); + putIfMissing(namedArgs, "fut1p", base + "eremo"); + putIfMissing(namedArgs, "fut2p", base + "erete"); + putIfMissing(namedArgs, "fut3p", base + "eranno"); + // Conditional + putIfMissing(namedArgs, "cond1s", base + "erei"); + putIfMissing(namedArgs, "cond2s", base + "eresti"); + putIfMissing(namedArgs, "cond3s", base + "erebbe"); + putIfMissing(namedArgs, "cond1p", base + "eremmo"); + putIfMissing(namedArgs, "cond2p", base + "ereste"); + putIfMissing(namedArgs, "cond3p", base + "erebbero"); + // Subjunctive / congiuntivo + putIfMissing(namedArgs, "sub123s", base + "i"); + putIfMissing(namedArgs, "sub1p", base + "iamo"); + putIfMissing(namedArgs, "sub2p", base + "iate"); + putIfMissing(namedArgs, "sub3p", base + "ino"); + // Imperfect subjunctive + putIfMissing(namedArgs, "impsub12s", base + "assi"); + putIfMissing(namedArgs, "impsub3s", base + "asse"); + putIfMissing(namedArgs, "impsub1p", base + "assimo"); + putIfMissing(namedArgs, "impsub2p", base + "aste"); + putIfMissing(namedArgs, "impsub3p", base + "assero"); + // Imperative + putIfMissing(namedArgs, "imp2s", base + "a"); + putIfMissing(namedArgs, "imp3s", base + "i"); + putIfMissing(namedArgs, "imp1p", base + "iamo"); + putIfMissing(namedArgs, "imp2p", base + "ate"); + putIfMissing(namedArgs, "imp3p", base + "ino"); + + + itConj(args, namedArgs); + } + + + private void itConj(List args, Map namedArgs) { + // TODO Auto-generated method stub + + } + + + private static void putIfMissing(final Map namedArgs, final String key, + final String value) { + final String oldValue = namedArgs.get(key); + if (oldValue == null || oldValue.length() == 0) { + namedArgs.put(key, value); + } + } + + // TODO: check how ='' and =| are manifested.... + // TODO: get this right in -are + private static void putOrNullify(final Map namedArgs, final String key, + final String value) { + final String oldValue = namedArgs.get(key); + if (oldValue == null/* || oldValue.length() == 0*/) { + namedArgs.put(key, value); + } else { + if (oldValue.equals("''")) { + namedArgs.put(key, ""); + } + } + } + + } // ForeignParser \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java new file mode 100644 index 0000000..4d11824 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java @@ -0,0 +1,595 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.parser.wiktionary; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.AppendAndIndexWikiCallback; +import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser.NameAndArgs; +import com.hughes.util.ListUtil; + +class EnFunctionCallbacks { + + static final Map> DEFAULT = new LinkedHashMap>(); + + static { + FunctionCallback callback = new TranslationCallback(); + DEFAULT.put("t", callback); + DEFAULT.put("t+", callback); + DEFAULT.put("t-", callback); + DEFAULT.put("tø", callback); + DEFAULT.put("apdx-t", callback); + + callback = new EncodingCallback(); + Set encodings = new LinkedHashSet(Arrays.asList( + "zh-ts", "zh-tsp", + "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai", + "fa-Arab", "Khmr", "Cyrl", "IPAchar", "ug-Arab", "ko-inline", + "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs", + "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); + for (final String encoding : encodings) { + DEFAULT.put(encoding, callback); + } + + callback = new l_term(); + DEFAULT.put("l", callback); + DEFAULT.put("term", callback); + + callback = new Gender(); + DEFAULT.put("m", callback); + DEFAULT.put("f", callback); + DEFAULT.put("n", callback); + DEFAULT.put("p", callback); + DEFAULT.put("g", callback); + + callback = new AppendArg0(); + + callback = new Ignore(); + DEFAULT.put("trreq", callback); + DEFAULT.put("t-image", callback); + DEFAULT.put("defn", callback); + DEFAULT.put("rfdef", callback); + DEFAULT.put("rfdate", callback); + DEFAULT.put("rfex", callback); + DEFAULT.put("rfquote", callback); + DEFAULT.put("attention", callback); + DEFAULT.put("zh-attention", callback); + + + callback = new FormOf(); + DEFAULT.put("form of", callback); + DEFAULT.put("conjugation of", callback); + DEFAULT.put("participle of", callback); + DEFAULT.put("present participle of", callback); + DEFAULT.put("past participle of", callback); + DEFAULT.put("feminine past participle of", callback); + DEFAULT.put("gerund of", callback); + DEFAULT.put("feminine of", callback); + DEFAULT.put("plural of", callback); + DEFAULT.put("feminine plural of", callback); + DEFAULT.put("inflected form of", callback); + DEFAULT.put("alternative form of", callback); + DEFAULT.put("dated form of", callback); + DEFAULT.put("apocopic form of", callback); + + callback = new InflOrHead(); + DEFAULT.put("infl", callback); + DEFAULT.put("head", callback); + + callback = new AppendName(); + DEFAULT.put("...", callback); + + DEFAULT.put("qualifier", new QualifierCallback()); + DEFAULT.put("italbrac", new italbrac()); + DEFAULT.put("gloss", new gloss()); + DEFAULT.put("not used", new not_used()); + DEFAULT.put("wikipedia", new wikipedia()); + } + + static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + + // ------------------------------------------------------------------ + + static final class TranslationCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + final String transliteration = namedArgs.remove("tr"); + final String alt = namedArgs.remove("alt"); + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + if (args.size() < 2) { + EnParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title); + return false; + } + final String langCode = ListUtil.get(args, 0); + if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) { + appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger()); + } + appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet(); + final String word = ListUtil.get(args, 1); + appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + // Genders... + if (args.size() > 2) { + appendAndIndexWikiCallback.builder.append(" {"); + for (int i = 2; i < args.size(); ++i) { + if (i > 2) { + appendAndIndexWikiCallback.builder.append("|"); + } + appendAndIndexWikiCallback.builder.append(args.get(i)); + } + appendAndIndexWikiCallback.builder.append("}"); + } + + if (transliteration != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + appendAndIndexWikiCallback.builder.append(")"); + } + + if (alt != null) { + // If alt wasn't null, we appended alt instead of the actual word + // we're filing under.. + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI); + appendAndIndexWikiCallback.builder.append(")"); + } + + // Catch-all for anything else... + if (!namedArgs.isEmpty()) { + appendAndIndexWikiCallback.builder.append(" {"); + EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append("}"); + } + + return true; + } + } + + // ------------------------------------------------------------------ + + static final class QualifierCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + EnParser.LOG.warning("weird qualifier: "); + return false; + } + String qualifier = args.get(0); + appendAndIndexWikiCallback.builder.append("("); + appendAndIndexWikiCallback.dispatch(qualifier, null); + appendAndIndexWikiCallback.builder.append(")"); + return true; + } + } + + // ------------------------------------------------------------------ + + static final class EncodingCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!namedArgs.isEmpty()) { + EnParser.LOG.warning("weird encoding: " + wikiTokenizer.token()); + } + if (args.size() == 0) { + // Things like "{{Jpan}}" exist. + return true; + } + + for (int i = 0; i < args.size(); ++i) { + if (i > 0) { + appendAndIndexWikiCallback.builder.append(", "); + } + final String arg = args.get(i); +// if (arg.equals(parser.title)) { +// parser.titleAppended = true; +// } + appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName); + } + + return true; + } + } + + // ------------------------------------------------------------------ + + static final class Gender implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append("{"); + appendAndIndexWikiCallback.builder.append(name); + for (int i = 0; i < args.size(); ++i) { + appendAndIndexWikiCallback.builder.append("|").append(args.get(i)); + } + appendAndIndexWikiCallback.builder.append("}"); + return true; + } + } + + // ------------------------------------------------------------------ + + static final class l_term implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + // for {{l}}, lang is arg 0, but not for {{term}} + if (name.equals("term")) { + args.add(0, ""); + } + + final EntryTypeName entryTypeName; + switch (parser.state) { + case TRANSLATION_LINE: entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT; break; + case ENGLISH_DEF_OF_FOREIGN: entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; break; + default: throw new IllegalStateException("Invalid enum value: " + parser.state); + } + + final String langCode = args.get(0); + final IndexBuilder indexBuilder; + if ("".equals(langCode)) { + indexBuilder = parser.foreignIndexBuilder; + } else if ("en".equals(langCode)) { + indexBuilder = parser.enIndexBuilder; + } else { + indexBuilder = parser.foreignIndexBuilder; + } + + String displayText = ListUtil.get(args, 2, ""); + if (displayText.equals("")) { + displayText = ListUtil.get(args, 1, null); + } + + if (displayText != null) { + appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName); + } else { + EnParser.LOG.warning("no display text: " + wikiTokenizer.token()); + } + + final String tr = namedArgs.remove("tr"); + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION); + appendAndIndexWikiCallback.builder.append(")"); + } + + final String gloss = ListUtil.get(args, 3, ""); + if (!gloss.equals("")) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); + appendAndIndexWikiCallback.builder.append(")"); + } + + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + if (!namedArgs.isEmpty()) { + appendAndIndexWikiCallback.builder.append(" {").append(name); + EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append("}"); + } + + return true; + } + } + + // ------------------------------------------------------------------ + + static final class AppendArg0 implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + final String tr = namedArgs.remove("tr"); + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + appendAndIndexWikiCallback.builder.append(")"); + parser.wordForms.add(tr); + } + + return true; + } + } + + // ------------------------------------------------------------------ + + static final class italbrac implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append("("); + appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + appendAndIndexWikiCallback.builder.append(")"); + return true; + } + } + + // ------------------------------------------------------------------ + + static final class gloss implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append("("); + appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + appendAndIndexWikiCallback.builder.append(")"); + return true; + } + } + + // ------------------------------------------------------------------ + + static final class Ignore implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + return true; + } + } + + // ------------------------------------------------------------------ + + static final class not_used implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + appendAndIndexWikiCallback.builder.append("(not used)"); + return true; + } + } + + + // ------------------------------------------------------------------ + + static final class AppendName implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!args.isEmpty() || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append(name); + return true; + } + } + + // -------------------------------------------------------------------- + // -------------------------------------------------------------------- + + + static final class FormOf implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + parser.entryIsFormOfSomething = true; + String formName = name; + if (name.equals("form of")) { + formName = ListUtil.remove(args, 0, null); + } + if (formName == null) { + EnParser.LOG.warning("Missing form name: " + parser.title); + formName = "form of"; + } + String baseForm = ListUtil.get(args, 1, ""); + if ("".equals(baseForm)) { + baseForm = ListUtil.get(args, 0, null); + ListUtil.remove(args, 1, ""); + } else { + ListUtil.remove(args, 0, null); + } + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + + appendAndIndexWikiCallback.builder.append("{"); + NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append("}"); + if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) { + parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI); + } else { + // null baseForm happens in Danish. + EnParser.LOG.warning("Null baseform: " + parser.title); + } + return true; + } + } + + static final EnFunctionCallbacks.FormOf FORM_OF = new FormOf(); + + + // -------------------------------------------------------------------- + // -------------------------------------------------------------------- + + static final class wikipedia implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + return false; + } else if (args.size() == 1) { + return false; + } else { + return true; + } + } + } + + static final class InflOrHead implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + // See: http://en.wiktionary.org/wiki/Template:infl + final String langCode = ListUtil.get(args, 0); + String head = namedArgs.remove("head"); + if (head == null) { + head = namedArgs.remove("title"); // Bug + } + if (head == null) { + head = parser.title; + } + parser.titleAppended = true; + + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + + final String tr = namedArgs.remove("tr"); + String g = namedArgs.remove("g"); + if (g == null) { + g = namedArgs.remove("gender"); + } + final String g2 = namedArgs.remove("g2"); + final String g3 = namedArgs.remove("g3"); + + appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + if (g != null) { + appendAndIndexWikiCallback.builder.append(" {").append(g); + if (g2 != null) { + appendAndIndexWikiCallback.builder.append("|").append(g2); + } + if (g3 != null) { + appendAndIndexWikiCallback.builder.append("|").append(g3); + } + appendAndIndexWikiCallback.builder.append("}"); + } + + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI); + appendAndIndexWikiCallback.builder.append(")"); + parser.wordForms.add(tr); + } + + final String pos = ListUtil.get(args, 1); + if (pos != null) { + appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")"); + } + for (int i = 2; i < args.size(); i += 2) { + final String inflName = ListUtil.get(args, i); + final String inflValue = ListUtil.get(args, i + 1); + appendAndIndexWikiCallback.builder.append(", "); + appendAndIndexWikiCallback.dispatch(inflName, null, null); + if (inflValue != null && inflValue.length() > 0) { + appendAndIndexWikiCallback.builder.append(": "); + appendAndIndexWikiCallback.dispatch(inflValue, null, null); + parser.wordForms.add(inflValue); + } + } + for (final String key : namedArgs.keySet()) { + final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); + appendAndIndexWikiCallback.builder.append(" "); + appendAndIndexWikiCallback.dispatch(key, null, null); + appendAndIndexWikiCallback.builder.append("="); + appendAndIndexWikiCallback.dispatch(value, null, null); + parser.wordForms.add(value); + } + return true; + } + } + + + static { + DEFAULT.put("it-noun", new it_noun()); + } + static final class it_noun implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + parser.titleAppended = true; + final String base = ListUtil.get(args, 0); + final String gender = ListUtil.get(args, 1); + final String singular = base + ListUtil.get(args, 2, null); + final String plural = base + ListUtil.get(args, 3, null); + appendAndIndexWikiCallback.builder.append(" "); + appendAndIndexWikiCallback.dispatch(singular, null, null); + appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, "); + appendAndIndexWikiCallback.dispatch(plural, null, null); + appendAndIndexWikiCallback.builder.append(" {pl}"); + parser.wordForms.add(singular); + parser.wordForms.add(plural); + if (!namedArgs.isEmpty() || args.size() > 4) { + EnParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); + } + return true; + } + } + + static { + DEFAULT.put("it-proper noun", new it_proper_noun()); + } + static final class it_proper_noun implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + return false; + } + } + + } \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java index 6032505..7fdea86 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java @@ -17,21 +17,14 @@ package com.hughes.android.dictionary.parser.wiktionary; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.LinkedHashMap; import java.util.LinkedHashSet; -import java.util.List; import java.util.Map; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; -import com.hughes.android.dictionary.engine.IndexedEntry; -import com.hughes.android.dictionary.engine.PairEntry; -import com.hughes.android.dictionary.engine.PairEntry.Pair; import com.hughes.android.dictionary.parser.WikiTokenizer; -import com.hughes.util.ListUtil; public abstract class EnParser extends AbstractWiktionaryParser { @@ -95,9 +88,9 @@ public abstract class EnParser extends AbstractWiktionaryParser { boolean titleAppended = false; - final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexWikiCallback(this); + final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexCallback(this); { - appendAndIndexWikiCallback.functionCallbacks.putAll(FunctionCallbacks.DEFAULT); + appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT); } EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { @@ -107,1142 +100,50 @@ public abstract class EnParser extends AbstractWiktionaryParser { this.langCodePattern = langCodePattern; this.swap = swap; } - - // -------------------------------------------------------------------- - - static final class EnToTranslationParser extends EnParser { - - EnToTranslationParser(final IndexBuilder enIndexBuilder, - final IndexBuilder otherIndexBuilder, final Pattern langPattern, - final Pattern langCodePattern, final boolean swap) { - super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); - } - - @Override - void parseSection(String heading, String text) { - if (isIgnorableTitle(title)) { - return; - } - heading = heading.replaceAll("=", "").trim(); - if (!heading.contains("English")) { - return; - } - String pos = null; - int posDepth = -1; - - final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); - while (wikiTokenizer.nextToken() != null) { - - if (wikiTokenizer.isHeading()) { - final String headerName = wikiTokenizer.headingWikiText(); - - if (wikiTokenizer.headingDepth() <= posDepth) { - pos = null; - posDepth = -1; - } - - if (partOfSpeechHeader.matcher(headerName).matches()) { - posDepth = wikiTokenizer.headingDepth(); - pos = wikiTokenizer.headingWikiText(); - // TODO: if we're inside the POS section, we should handle the first title line... - - } else if (headerName.equals("Translations")) { - if (pos == null) { - LOG.info("Translations without POS (but using anyway): " + title); - } - doTranslations(wikiTokenizer, pos); - } else if (headerName.equals("Pronunciation")) { - //doPronunciation(wikiLineReader); - } - } else if (wikiTokenizer.isFunction()) { - final String name = wikiTokenizer.functionName(); - if (name.equals("head") && pos == null) { - LOG.warning("{{head}} without POS: " + title); - } - } - } - } - - private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) { - if (title.equals("absolutely")) { - //System.out.println(); - } - - String topLevelLang = null; - String sense = null; - boolean done = false; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - wikiTokenizer.returnToLineStart(); - return; - } - if (done) { - continue; - } - - // Check whether we care about this line: - - if (wikiTokenizer.isFunction()) { - final String functionName = wikiTokenizer.functionName(); - final List positionArgs = wikiTokenizer.functionPositionArgs(); - - if (functionName.equals("trans-top")) { - sense = null; - if (wikiTokenizer.functionPositionArgs().size() >= 1) { - sense = positionArgs.get(0); - // TODO: could emphasize words in [[brackets]] inside sense. - sense = WikiTokenizer.toPlainText(sense); - //LOG.info("Sense: " + sense); - } - } else if (functionName.equals("trans-bottom")) { - sense = null; - } else if (functionName.equals("trans-mid")) { - } else if (functionName.equals("trans-see")) { - incrementCount("WARNING:trans-see"); - } else if (functionName.startsWith("picdic")) { - } else if (functionName.startsWith("checktrans")) { - done = true; - } else if (functionName.startsWith("ttbc")) { - wikiTokenizer.nextLine(); - // TODO: would be great to handle ttbc - // TODO: Check this: done = true; - } else { - LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isListItem()) { - final String line = wikiTokenizer.listItemWikiText(); - // This line could produce an output... - -// if (line.contains("ich hoan dich gear")) { -// //System.out.println(); -// } - - // First strip the language and check whether it matches. - // And hold onto it for sub-lines. - final int colonIndex = line.indexOf(":"); - if (colonIndex == -1) { - continue; - } - - final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); - incrementCount("tCount:" + lang); - final boolean appendLang; - if (wikiTokenizer.listItemPrefix().length() == 1) { - topLevelLang = lang; - final boolean thisFind = langPattern.matcher(lang).find(); - if (!thisFind) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } else if (topLevelLang == null) { - continue; - } else { - // Two-level -- the only way we won't append is if this second level matches exactly. - if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } - - String rest = line.substring(colonIndex + 1).trim(); - if (rest.length() > 0) { - doTranslationLine(line, appendLang ? lang : null, pos, sense, rest); - } - - } else if (wikiTokenizer.remainderStartsWith("''See''")) { - wikiTokenizer.nextLine(); - incrementCount("WARNING: ''See''" ); - LOG.fine("Skipping See line: " + wikiTokenizer.token()); - } else if (wikiTokenizer.isWikiLink()) { - final String wikiLink = wikiTokenizer.wikiLinkText(); - if (wikiLink.contains(":") && wikiLink.contains(title)) { - } else if (wikiLink.contains("Category:")) { - } else { - incrementCount("WARNING: Unexpected wikiLink" ); - LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { - } else { - final String token = wikiTokenizer.token(); - if (token.equals("----")) { - } else { - LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); - incrementCount("WARNING: Unexpected translation token" ); - } - } - - } - } - - private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) { - state = State.TRANSLATION_LINE; - // Good chance we'll actually file this one... - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - final StringBuilder foreignText = new StringBuilder(); - appendAndIndexWikiCallback.reset(foreignText, indexedEntry); - appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - - if (foreignText.length() == 0) { - LOG.warning("Empty foreignText: " + line); - incrementCount("WARNING: Empty foreignText" ); - return; - } - - if (lang != null) { - foreignText.insert(0, String.format("(%s) ", lang)); - } - - StringBuilder englishText = new StringBuilder(); - - englishText.append(title); - if (sense != null) { - englishText.append(" (").append(sense).append(")"); - enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); - } - if (pos != null) { - englishText.append(" (").append(pos.toLowerCase()).append(")"); - } - enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); - pairEntry.pairs.add(pair); - if (!pairsAdded.add(pair.toString())) { - LOG.warning("Duplicate pair: " + pair.toString()); - incrementCount("WARNING: Duplicate pair" ); - } - } - } // EnToTranslationParser - - // ----------------------------------------------------------------------- - + @Override + void removeUselessArgs(Map namedArgs) { + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + } - static final class ForeignParser extends EnParser { + static class AppendAndIndexCallback extends AppendAndIndexWikiCallback { - ForeignParser(final IndexBuilder enIndexBuilder, - final IndexBuilder otherIndexBuilder, final Pattern langPattern, - final Pattern langCodePattern, final boolean swap) { - super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); + public AppendAndIndexCallback(EnParser parser) { + super(parser); } @Override - void parseSection(String heading, String text) { - if (isIgnorableTitle(title)) { - return; - } - final String lang = heading.replaceAll("=", "").trim(); - if (!langPattern.matcher(lang).find()){ - return; - } - - final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - final String headingName = wikiTokenizer.headingWikiText(); - if (headingName.equals("Translations")) { - LOG.warning("Translations not in English section: " + title); - } else if (headingName.equals("Pronunciation")) { - //doPronunciation(wikiLineReader); - } else if (partOfSpeechHeader.matcher(headingName).matches()) { - doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); - } - } else { - // It's not a heading. - // TODO: optimization: skip to next heading. - } - } - } - - static final class ListSection { - final String firstPrefix; - final String firstLine; - final List nextPrefixes = new ArrayList(); - final List nextLines = new ArrayList(); - - public ListSection(String firstPrefix, String firstLine) { - this.firstPrefix = firstPrefix; - this.firstLine = firstLine; - } - - @Override - public String toString() { - return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; - } - } - - int foreignCount = 0; - private void doForeignPartOfSpeech(final String lang, String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { - if (++foreignCount % 1000 == 0) { - LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); - } - if (title.equals("6")) { - System.out.println(); - } - - final StringBuilder foreignBuilder = new StringBuilder(); - final List listSections = new ArrayList(); - - appendAndIndexWikiCallback.reset(foreignBuilder, null); - this.state = State.ENGLISH_DEF_OF_FOREIGN; // TODO: this is wrong, need new category.... - titleAppended = false; - wordForms.clear(); - - try { - - ListSection lastListSection = null; - - int currentHeadingDepth = posDepth; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - currentHeadingDepth = wikiTokenizer.headingDepth(); - - if (currentHeadingDepth <= posDepth) { - wikiTokenizer.returnToLineStart(); - return; - } - } - - if (currentHeadingDepth > posDepth) { - // TODO: deal with other neat info sections - continue; - } - - if (wikiTokenizer.isFunction()) { - final String name = wikiTokenizer.functionName(); - final List args = wikiTokenizer.functionPositionArgs(); - final Map namedArgs = wikiTokenizer.functionNamedArgs(); - // First line is generally a repeat of the title with some extra information. - // We need to build up the left side (foreign text, tokens) separately from the - // right side (English). The left-side may get paired with multiple right sides. - // The left side should get filed under every form of the word in question (singular, plural). - - // For verbs, the conjugation comes later on in a deeper section. - // Ideally, we'd want to file every English entry with the verb - // under every verb form coming from the conjugation. - // Ie. under "fa": see: "make :: fare" and "do :: fare" - // But then where should we put the conjugation table? - // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) - // for the conjugation table from "fa". - // Would like to be able to link to a lang#token. - - appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs); - - } else if (wikiTokenizer.isListItem()) { - final String prefix = wikiTokenizer.listItemPrefix(); - if (lastListSection != null && - prefix.startsWith(lastListSection.firstPrefix) && - prefix.length() > lastListSection.firstPrefix.length()) { - lastListSection.nextPrefixes.add(prefix); - lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); - } else { - lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); - listSections.add(lastListSection); - } - } else if (lastListSection != null) { - // Don't append anything after the lists, because there's crap. - } else if (wikiTokenizer.isWikiLink()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.wikiLinkText()); - - } else if (wikiTokenizer.isPlainText()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.token()); - - } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) { - // Do nothing. + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String text = wikiTokenizer.wikiLinkText(); + final String link = wikiTokenizer.wikiLinkDest(); + if (link != null) { + if (link.contains("#English")) { + dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } else if (link.contains("#") && parser.langPattern.matcher(link).find()) { + dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else if (link.equals("plural")) { + builder.append(text); } else { - LOG.warning("Unexpected token: " + wikiTokenizer.token()); + //LOG.warning("Special link: " + englishTokenizer.token()); + dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); } - } - - } finally { - // Here's where we exit. - // Should we make an entry even if there are no foreign list items? - String foreign = foreignBuilder.toString().trim(); - if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) { - foreign = String.format("%s %s", title, foreign); - } - if (!langPattern.matcher(lang).matches()) { - foreign = String.format("(%s) %s", lang, foreign); - } - for (final ListSection listSection : listSections) { - doForeignListSection(foreign, title, wordForms, listSection); - } - } - } - - - - private void doForeignListSection(final String foreignText, String title, final Collection forms, final ListSection listSection) { - state = State.ENGLISH_DEF_OF_FOREIGN; - final String prefix = listSection.firstPrefix; - if (prefix.length() > 1) { - // Could just get looser and say that any prefix longer than first is a sublist. - LOG.warning("Prefix too long: " + listSection); - return; - } - - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - entryIsFormOfSomething = false; - final StringBuilder englishBuilder = new StringBuilder(); - final String mainLine = listSection.firstLine; - appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry); - appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); - - final String english = trim(englishBuilder.toString()); - if (english.length() > 0) { - final Pair pair = new Pair(english, trim(foreignText), this.swap); - pairEntry.pairs.add(pair); - foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI); - for (final String form : forms) { - foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); - } - } - - // Do examples. - String lastForeign = null; - for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { - final String nextPrefix = listSection.nextPrefixes.get(i); - final String nextLine = listSection.nextLines.get(i); - - // TODO: This splitting is not sensitive to wiki code. - int dash = nextLine.indexOf("—"); - int mdashLen = 7; - if (dash == -1) { - dash = nextLine.indexOf("—"); - mdashLen = 1; - } - if (dash == -1) { - dash = nextLine.indexOf(" - "); - mdashLen = 3; - } - - if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { - final String foreignEx = nextLine.substring(0, dash); - final String englishEx = nextLine.substring(dash + mdashLen); - final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap); - if (pair.lang1 != "--" && pair.lang1 != "--") { - pairEntry.pairs.add(pair); - } - lastForeign = null; - } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")){ - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - lastForeign = nextLine; - if (pair.lang1 != "--" && pair.lang1 != "--") { - pairEntry.pairs.add(pair); - } - } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { - if (lastForeign != null && pairEntry.pairs.size() > 0) { - pairEntry.pairs.remove(pairEntry.pairs.size() - 1); - final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap); - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); - } - lastForeign = null; - } else { - LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); - } - } - } else if (nextPrefix.equals("#*")) { - // Can't really index these. - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - lastForeign = nextLine; - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); - } - } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); - } -// } else { -// assert false; - } - } - } - - private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) { - // TODO: -// if (wikiTokenizer.token().equals("'''")) { -// insideTripleQuotes = !insideTripleQuotes; -// } - final StringBuilder builder = new StringBuilder(); - appendAndIndexWikiCallback.reset(builder, indexedEntry); - appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE; - appendAndIndexWikiCallback.entryTypeNameSticks = true; - try { - // TODO: this is a hack needed because we don't safely split on the dash. - appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); - } catch (AssertionError e) { - return "--"; - } - final String result = trim(builder.toString()); - return result.length() > 0 ? result : "--"; - } - - - private void itConjAre(List args, Map namedArgs) { - final String base = args.get(0); - final String aux = args.get(1); - - putIfMissing(namedArgs, "inf", base + "are"); - putIfMissing(namedArgs, "aux", aux); - putIfMissing(namedArgs, "ger", base + "ando"); - putIfMissing(namedArgs, "presp", base + "ante"); - putIfMissing(namedArgs, "pastp", base + "ato"); - // Present - putIfMissing(namedArgs, "pres1s", base + "o"); - putIfMissing(namedArgs, "pres2s", base + "i"); - putIfMissing(namedArgs, "pres3s", base + "a"); - putIfMissing(namedArgs, "pres1p", base + "iamo"); - putIfMissing(namedArgs, "pres2p", base + "ate"); - putIfMissing(namedArgs, "pres3p", base + "ano"); - // Imperfect - putIfMissing(namedArgs, "imperf1s", base + "avo"); - putIfMissing(namedArgs, "imperf2s", base + "avi"); - putIfMissing(namedArgs, "imperf3s", base + "ava"); - putIfMissing(namedArgs, "imperf1p", base + "avamo"); - putIfMissing(namedArgs, "imperf2p", base + "avate"); - putIfMissing(namedArgs, "imperf3p", base + "avano"); - // Passato remoto - putIfMissing(namedArgs, "prem1s", base + "ai"); - putIfMissing(namedArgs, "prem2s", base + "asti"); - putIfMissing(namedArgs, "prem3s", base + "ò"); - putIfMissing(namedArgs, "prem1p", base + "ammo"); - putIfMissing(namedArgs, "prem2p", base + "aste"); - putIfMissing(namedArgs, "prem3p", base + "arono"); - // Future - putIfMissing(namedArgs, "fut1s", base + "erò"); - putIfMissing(namedArgs, "fut2s", base + "erai"); - putIfMissing(namedArgs, "fut3s", base + "erà"); - putIfMissing(namedArgs, "fut1p", base + "eremo"); - putIfMissing(namedArgs, "fut2p", base + "erete"); - putIfMissing(namedArgs, "fut3p", base + "eranno"); - // Conditional - putIfMissing(namedArgs, "cond1s", base + "erei"); - putIfMissing(namedArgs, "cond2s", base + "eresti"); - putIfMissing(namedArgs, "cond3s", base + "erebbe"); - putIfMissing(namedArgs, "cond1p", base + "eremmo"); - putIfMissing(namedArgs, "cond2p", base + "ereste"); - putIfMissing(namedArgs, "cond3p", base + "erebbero"); - // Subjunctive / congiuntivo - putIfMissing(namedArgs, "sub123s", base + "i"); - putIfMissing(namedArgs, "sub1p", base + "iamo"); - putIfMissing(namedArgs, "sub2p", base + "iate"); - putIfMissing(namedArgs, "sub3p", base + "ino"); - // Imperfect subjunctive - putIfMissing(namedArgs, "impsub12s", base + "assi"); - putIfMissing(namedArgs, "impsub3s", base + "asse"); - putIfMissing(namedArgs, "impsub1p", base + "assimo"); - putIfMissing(namedArgs, "impsub2p", base + "aste"); - putIfMissing(namedArgs, "impsub3p", base + "assero"); - // Imperative - putIfMissing(namedArgs, "imp2s", base + "a"); - putIfMissing(namedArgs, "imp3s", base + "i"); - putIfMissing(namedArgs, "imp1p", base + "iamo"); - putIfMissing(namedArgs, "imp2p", base + "ate"); - putIfMissing(namedArgs, "imp3p", base + "ino"); - - - itConj(args, namedArgs); - } - - - private void itConj(List args, Map namedArgs) { - // TODO Auto-generated method stub - - } - - - private static void putIfMissing(final Map namedArgs, final String key, - final String value) { - final String oldValue = namedArgs.get(key); - if (oldValue == null || oldValue.length() == 0) { - namedArgs.put(key, value); - } - } - - // TODO: check how ='' and =| are manifested.... - // TODO: get this right in -are - private static void putOrNullify(final Map namedArgs, final String key, - final String value) { - final String oldValue = namedArgs.get(key); - if (oldValue == null/* || oldValue.length() == 0*/) { - namedArgs.put(key, value); } else { - if (oldValue.equals("''")) { - namedArgs.put(key, ""); + // link == null + final EntryTypeName entryTypeName; + switch (parser.state) { + case TRANSLATION_LINE: + entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT; + break; + case ENGLISH_DEF_OF_FOREIGN: + entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; + break; + default: + throw new IllegalStateException("Invalid enum value: " + parser.state); } + dispatch(text, entryTypeName); } } - - } // ForeignParser - - // ----------------------------------------------------------------------- - - static class FunctionCallbacks { - - static final Map> DEFAULT = new LinkedHashMap>(); - - static { - FunctionCallback callback = new TranslationCallback(); - DEFAULT.put("t", callback); - DEFAULT.put("t+", callback); - DEFAULT.put("t-", callback); - DEFAULT.put("tø", callback); - DEFAULT.put("apdx-t", callback); - - callback = new EncodingCallback(); - Set encodings = new LinkedHashSet(Arrays.asList( - "zh-ts", "zh-tsp", - "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai", - "fa-Arab", "Khmr", "Cyrl", "IPAchar", "ug-Arab", "ko-inline", - "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs", - "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); - for (final String encoding : encodings) { - DEFAULT.put(encoding, callback); - } - - callback = new l_term(); - DEFAULT.put("l", callback); - DEFAULT.put("term", callback); - - callback = new Gender(); - DEFAULT.put("m", callback); - DEFAULT.put("f", callback); - DEFAULT.put("n", callback); - DEFAULT.put("p", callback); - DEFAULT.put("g", callback); - callback = new AppendArg0(); - - callback = new Ignore(); - DEFAULT.put("trreq", callback); - DEFAULT.put("t-image", callback); - DEFAULT.put("defn", callback); - DEFAULT.put("rfdef", callback); - DEFAULT.put("rfdate", callback); - DEFAULT.put("rfex", callback); - DEFAULT.put("rfquote", callback); - DEFAULT.put("attention", callback); - DEFAULT.put("zh-attention", callback); - - - callback = new FormOf(); - DEFAULT.put("form of", callback); - DEFAULT.put("conjugation of", callback); - DEFAULT.put("participle of", callback); - DEFAULT.put("present participle of", callback); - DEFAULT.put("past participle of", callback); - DEFAULT.put("feminine past participle of", callback); - DEFAULT.put("gerund of", callback); - DEFAULT.put("feminine of", callback); - DEFAULT.put("plural of", callback); - DEFAULT.put("feminine plural of", callback); - DEFAULT.put("inflected form of", callback); - DEFAULT.put("alternative form of", callback); - DEFAULT.put("dated form of", callback); - DEFAULT.put("apocopic form of", callback); - - callback = new InflOrHead(); - DEFAULT.put("infl", callback); - DEFAULT.put("head", callback); - - callback = new AppendName(); - DEFAULT.put("...", callback); - - DEFAULT.put("qualifier", new QualifierCallback()); - DEFAULT.put("italbrac", new italbrac()); - DEFAULT.put("gloss", new gloss()); - DEFAULT.put("not used", new not_used()); - DEFAULT.put("wikipedia", new wikipedia()); - } - - static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); - - // ------------------------------------------------------------------ - - static final class TranslationCallback implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - final String transliteration = namedArgs.remove("tr"); - final String alt = namedArgs.remove("alt"); - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - if (args.size() < 2) { - LOG.warning("{{t...}} with wrong args: title=" + parser.title); - return false; - } - final String langCode = ListUtil.get(args, 0); - if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) { - appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger()); - } - appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet(); - final String word = ListUtil.get(args, 1); - appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - // Genders... - if (args.size() > 2) { - appendAndIndexWikiCallback.builder.append(" {"); - for (int i = 2; i < args.size(); ++i) { - if (i > 2) { - appendAndIndexWikiCallback.builder.append("|"); - } - appendAndIndexWikiCallback.builder.append(args.get(i)); - } - appendAndIndexWikiCallback.builder.append("}"); - } - - if (transliteration != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); - appendAndIndexWikiCallback.builder.append(")"); - } - - if (alt != null) { - // If alt wasn't null, we appended alt instead of the actual word - // we're filing under.. - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI); - appendAndIndexWikiCallback.builder.append(")"); - } - - // Catch-all for anything else... - if (!namedArgs.isEmpty()) { - appendAndIndexWikiCallback.builder.append(" {"); - appendNamedArgs(namedArgs, appendAndIndexWikiCallback); - appendAndIndexWikiCallback.builder.append("}"); - } - - return true; - } } - // ------------------------------------------------------------------ - - static final class QualifierCallback implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - LOG.warning("weird qualifier: "); - return false; - } - String qualifier = args.get(0); - appendAndIndexWikiCallback.builder.append("("); - appendAndIndexWikiCallback.dispatch(qualifier, null); - appendAndIndexWikiCallback.builder.append(")"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class EncodingCallback implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!namedArgs.isEmpty()) { - LOG.warning("weird encoding: " + wikiTokenizer.token()); - } - if (args.size() == 0) { - // Things like "{{Jpan}}" exist. - return true; - } - - for (int i = 0; i < args.size(); ++i) { - if (i > 0) { - appendAndIndexWikiCallback.builder.append(", "); - } - final String arg = args.get(i); -// if (arg.equals(parser.title)) { -// parser.titleAppended = true; -// } - appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName); - } - - return true; - } - } - - // ------------------------------------------------------------------ - - static final class Gender implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append("{"); - appendAndIndexWikiCallback.builder.append(name); - for (int i = 0; i < args.size(); ++i) { - appendAndIndexWikiCallback.builder.append("|").append(args.get(i)); - } - appendAndIndexWikiCallback.builder.append("}"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class l_term implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - // for {{l}}, lang is arg 0, but not for {{term}} - if (name.equals("term")) { - args.add(0, ""); - } - - final EntryTypeName entryTypeName; - switch (parser.state) { - case TRANSLATION_LINE: entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT; break; - case ENGLISH_DEF_OF_FOREIGN: entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; break; - default: throw new IllegalStateException("Invalid enum value: " + parser.state); - } - - final String langCode = args.get(0); - final IndexBuilder indexBuilder; - if ("".equals(langCode)) { - indexBuilder = parser.foreignIndexBuilder; - } else if ("en".equals(langCode)) { - indexBuilder = parser.enIndexBuilder; - } else { - indexBuilder = parser.foreignIndexBuilder; - } - - String displayText = ListUtil.get(args, 2, ""); - if (displayText.equals("")) { - displayText = ListUtil.get(args, 1, null); - } - - if (displayText != null) { - appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName); - } else { - LOG.warning("no display text: " + wikiTokenizer.token()); - } - - final String tr = namedArgs.remove("tr"); - if (tr != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION); - appendAndIndexWikiCallback.builder.append(")"); - } - - final String gloss = ListUtil.get(args, 3, ""); - if (!gloss.equals("")) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); - appendAndIndexWikiCallback.builder.append(")"); - } - - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - if (!namedArgs.isEmpty()) { - appendAndIndexWikiCallback.builder.append(" {").append(name); - appendNamedArgs(namedArgs, appendAndIndexWikiCallback); - appendAndIndexWikiCallback.builder.append("}"); - } - - return true; - } - } - - // ------------------------------------------------------------------ - - static final class AppendArg0 implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - // TODO: transliteration - return true; - } - } - - // ------------------------------------------------------------------ - - static final class italbrac implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append("("); - appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - appendAndIndexWikiCallback.builder.append(")"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class gloss implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append("("); - appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - appendAndIndexWikiCallback.builder.append(")"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class Ignore implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - return true; - } - } - - // ------------------------------------------------------------------ - - static final class not_used implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - appendAndIndexWikiCallback.builder.append("(not used)"); - return true; - } - } - - - // ------------------------------------------------------------------ - - static final class AppendName implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!args.isEmpty() || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append(name); - return true; - } - } - - // -------------------------------------------------------------------- - // -------------------------------------------------------------------- - - - static final class FormOf implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - parser.entryIsFormOfSomething = true; - String formName = name; - if (name.equals("form of")) { - formName = ListUtil.remove(args, 0, null); - } - if (formName == null) { - LOG.warning("Missing form name: " + parser.title); - formName = "form of"; - } - String baseForm = ListUtil.get(args, 1, ""); - if ("".equals(baseForm)) { - baseForm = ListUtil.get(args, 0, null); - ListUtil.remove(args, 1, ""); - } else { - ListUtil.remove(args, 0, null); - } - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - - appendAndIndexWikiCallback.builder.append("{"); - NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback); - appendAndIndexWikiCallback.builder.append("}"); - if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) { - parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI); - } else { - // null baseForm happens in Danish. - LOG.warning("Null baseform: " + parser.title); - } - return true; - } - } - - static final FormOf FORM_OF = new FormOf(); - - - // -------------------------------------------------------------------- - // -------------------------------------------------------------------- - - static final class wikipedia implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - namedArgs.remove("lang"); - if (args.size() > 1 || !namedArgs.isEmpty()) { - // Unindexed! - return false; - } else if (args.size() == 1) { - return false; - } else { - return true; - } - } - } - - static final class InflOrHead implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - // See: http://en.wiktionary.org/wiki/Template:infl - final String langCode = ListUtil.get(args, 0); - String head = namedArgs.remove("head"); - if (head == null) { - head = namedArgs.remove("title"); // Bug - } - if (head == null) { - head = parser.title; - } - parser.titleAppended = true; - - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - - final String tr = namedArgs.remove("tr"); - String g = namedArgs.remove("g"); - if (g == null) { - g = namedArgs.remove("gender"); - } - final String g2 = namedArgs.remove("g2"); - final String g3 = namedArgs.remove("g3"); - - appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - if (g != null) { - appendAndIndexWikiCallback.builder.append(" {").append(g); - if (g2 != null) { - appendAndIndexWikiCallback.builder.append("|").append(g2); - } - if (g3 != null) { - appendAndIndexWikiCallback.builder.append("|").append(g3); - } - appendAndIndexWikiCallback.builder.append("}"); - } - - if (tr != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI); - appendAndIndexWikiCallback.builder.append(")"); - parser.wordForms.add(tr); - } - - final String pos = ListUtil.get(args, 1); - if (pos != null) { - appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")"); - } - for (int i = 2; i < args.size(); i += 2) { - final String inflName = ListUtil.get(args, i); - final String inflValue = ListUtil.get(args, i + 1); - appendAndIndexWikiCallback.builder.append(", "); - appendAndIndexWikiCallback.dispatch(inflName, null, null); - if (inflValue != null && inflValue.length() > 0) { - appendAndIndexWikiCallback.builder.append(": "); - appendAndIndexWikiCallback.dispatch(inflValue, null, null); - parser.wordForms.add(inflValue); - } - } - for (final String key : namedArgs.keySet()) { - final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); - appendAndIndexWikiCallback.builder.append(" "); - appendAndIndexWikiCallback.dispatch(key, null, null); - appendAndIndexWikiCallback.builder.append("="); - appendAndIndexWikiCallback.dispatch(value, null, null); - parser.wordForms.add(value); - } - return true; - } - } - - - static { - DEFAULT.put("it-noun", new it_noun()); - } - static final class it_noun implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - parser.titleAppended = true; - final String base = ListUtil.get(args, 0); - final String gender = ListUtil.get(args, 1); - final String singular = base + ListUtil.get(args, 2, null); - final String plural = base + ListUtil.get(args, 3, null); - appendAndIndexWikiCallback.builder.append(" "); - appendAndIndexWikiCallback.dispatch(singular, null, null); - appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, "); - appendAndIndexWikiCallback.dispatch(plural, null, null); - appendAndIndexWikiCallback.builder.append(" {pl}"); - parser.wordForms.add(singular); - parser.wordForms.add(plural); - if (!namedArgs.isEmpty() || args.size() > 4) { - LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); - } - return true; - } - } - - static { - DEFAULT.put("it-proper noun", new it_proper_noun()); - } - static final class it_proper_noun implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - return false; - } - } - - } - - } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java new file mode 100644 index 0000000..6c1bbec --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java @@ -0,0 +1,230 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.hughes.android.dictionary.parser.wiktionary; + +import java.util.List; +import java.util.regex.Pattern; + +import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.PairEntry; +import com.hughes.android.dictionary.engine.PairEntry.Pair; +import com.hughes.android.dictionary.parser.WikiTokenizer; + +public final class EnToTranslationParser extends EnParser { + + public EnToTranslationParser(final IndexBuilder enIndexBuilder, + final IndexBuilder otherIndexBuilder, final Pattern langPattern, + final Pattern langCodePattern, final boolean swap) { + super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); + } + + @Override + void parseSection(String heading, String text) { + if (isIgnorableTitle(title)) { + return; + } + heading = heading.replaceAll("=", "").trim(); + if (!heading.contains("English")) { + return; + } + + String pos = null; + int posDepth = -1; + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + + if (wikiTokenizer.isHeading()) { + final String headerName = wikiTokenizer.headingWikiText(); + + if (wikiTokenizer.headingDepth() <= posDepth) { + pos = null; + posDepth = -1; + } + + if (partOfSpeechHeader.matcher(headerName).matches()) { + posDepth = wikiTokenizer.headingDepth(); + pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + + } else if (headerName.equals("Translations")) { + if (pos == null) { + LOG.info("Translations without POS (but using anyway): " + title); + } + doTranslations(wikiTokenizer, pos); + } else if (headerName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } + } else if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + if (name.equals("head") && pos == null) { + LOG.warning("{{head}} without POS: " + title); + } + } + } + } + + private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) { + if (title.equals("absolutely")) { + //System.out.println(); + } + + String topLevelLang = null; + String sense = null; + boolean done = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + wikiTokenizer.returnToLineStart(); + return; + } + if (done) { + continue; + } + + // Check whether we care about this line: + + if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List positionArgs = wikiTokenizer.functionPositionArgs(); + + if (functionName.equals("trans-top")) { + sense = null; + if (wikiTokenizer.functionPositionArgs().size() >= 1) { + sense = positionArgs.get(0); + sense = WikiTokenizer.toPlainText(sense); + //LOG.info("Sense: " + sense); + } + } else if (functionName.equals("trans-bottom")) { + sense = null; + } else if (functionName.equals("trans-mid")) { + } else if (functionName.equals("trans-see")) { + incrementCount("WARNING:trans-see"); + } else if (functionName.startsWith("picdic")) { + } else if (functionName.startsWith("checktrans")) { + done = true; + } else if (functionName.startsWith("ttbc")) { + wikiTokenizer.nextLine(); + // TODO: would be great to handle ttbc + // TODO: Check this: done = true; + } else { + LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isListItem()) { + final String line = wikiTokenizer.listItemWikiText(); + // This line could produce an output... + +// if (line.contains("ich hoan dich gear")) { +// //System.out.println(); +// } + + // First strip the language and check whether it matches. + // And hold onto it for sub-lines. + final int colonIndex = line.indexOf(":"); + if (colonIndex == -1) { + continue; + } + + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + incrementCount("tCount:" + lang); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { + continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } + + String rest = line.substring(colonIndex + 1).trim(); + if (rest.length() > 0) { + doTranslationLine(line, appendLang ? lang : null, pos, sense, rest); + } + + } else if (wikiTokenizer.remainderStartsWith("''See''")) { + wikiTokenizer.nextLine(); + incrementCount("WARNING: ''See''" ); + LOG.fine("Skipping See line: " + wikiTokenizer.token()); + } else if (wikiTokenizer.isWikiLink()) { + final String wikiLink = wikiTokenizer.wikiLinkText(); + if (wikiLink.contains(":") && wikiLink.contains(title)) { + } else if (wikiLink.contains("Category:")) { + } else { + incrementCount("WARNING: Unexpected wikiLink" ); + LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { + } else { + final String token = wikiTokenizer.token(); + if (token.equals("----")) { + } else { + LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); + incrementCount("WARNING: Unexpected translation token" ); + } + } + + } + } + + private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) { + state = State.TRANSLATION_LINE; + // Good chance we'll actually file this one... + final PairEntry pairEntry = new PairEntry(entrySource); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + + final StringBuilder foreignText = new StringBuilder(); + appendAndIndexWikiCallback.reset(foreignText, indexedEntry); + appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + if (foreignText.length() == 0) { + LOG.warning("Empty foreignText: " + line); + incrementCount("WARNING: Empty foreignText" ); + return; + } + + if (lang != null) { + foreignText.insert(0, String.format("(%s) ", lang)); + } + + StringBuilder englishText = new StringBuilder(); + + englishText.append(title); + if (sense != null) { + englishText.append(" (").append(sense).append(")"); + enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); + } + if (pos != null) { + englishText.append(" (").append(pos.toLowerCase()).append(")"); + } + enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); + pairEntry.pairs.add(pair); + if (!pairsAdded.add(pair.toString())) { + LOG.warning("Duplicate pair: " + pair.toString()); + incrementCount("WARNING: Duplicate pair" ); + } + } + } // EnToTranslationParser \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java index 7e2e3df..141d2c6 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java @@ -1,3 +1,17 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.parser.wiktionary; import java.util.LinkedHashMap; diff --git a/testdata/goldens/wiktionary.fr_fr.quickdic.text b/testdata/goldens/wiktionary.fr_fr.quickdic.text index 93aedeb..39f8914 100644 --- a/testdata/goldens/wiktionary.fr_fr.quickdic.text +++ b/testdata/goldens/wiktionary.fr_fr.quickdic.text @@ -1,5 +1,5 @@ dictInfo=SomeWikiData -EntrySource: enwiktionary.french 6703 +EntrySource: enwiktionary.french 6701 Index: FR FR->EN ===00=== @@ -1233,7 +1233,6 @@ Index: FR FR->EN cardinal {{fr-noun|m|pl=cardinaux}} :: {religion} cardinal cardinal {{fr-noun|m|pl=cardinaux}} :: cardinal number cardinal {m|inv} (noun) :: cardinal (color) - {{cardinalbox|fr|4|5|6|quatre|six|ord=cinquième|wplink=Cinq}}cinq {m|inv} (noun) cat2=cardinal numbers :: five ===care=== care {fr-verb-form} :: {conjugation of|carer|1|s|pres|ind} care {fr-verb-form} :: {conjugation of|carer|3|s|pres|ind} @@ -1460,7 +1459,7 @@ Index: FR FR->EN Je préfère ce gateau-ci à celui-là. :: I prefer this cake to that one. (Old French) ci (adverb) :: here (in this place) ***cinq*** - {{cardinalbox|fr|4|5|6|quatre|six|ord=cinquième|wplink=Cinq}}cinq {m|inv} (noun) cat2=cardinal numbers :: five + {{cardinalbox|fr|4|5|6|quatre|six|ord=cinquième|wplink=Cinq}}cinq {m|inv} (noun) :: five (Middle French) cinq (noun) {m|inv} :: five de {fr-prep} :: from (used to indicate the start of a time or range) De 9:00 à 11:00 je ne serai pas libre. :: From 9 to 11 I won't be free. @@ -4044,8 +4043,6 @@ Index: FR FR->EN ===nul=== c :: {text messaging} {Informal spelling|c'est} C nul ici sans George :: It's rubbish here without George -===numbers=== - {{cardinalbox|fr|4|5|6|quatre|six|ord=cinquième|wplink=Cinq}}cinq {m|inv} (noun) cat2=cardinal numbers :: five ***Nunavut*** Nunavut {{fr-proper noun|m}} :: Nunavut ***o*** @@ -8512,7 +8509,7 @@ Index: EN EN->FR ===fist=== pain {{fr-noun|m}} :: {informal} punch (a hit with the fist) ===five=== - {{cardinalbox|fr|4|5|6|quatre|six|ord=cinquième|wplink=Cinq}}cinq {m|inv} (noun) cat2=cardinal numbers :: five + {{cardinalbox|fr|4|5|6|quatre|six|ord=cinquième|wplink=Cinq}}cinq {m|inv} (noun) :: five (Middle French) cinq (noun) {m|inv} :: five de {fr-prep} :: from (used to indicate the start of a time or range) De 9:00 à 11:00 je ne serai pas libre. :: From 9 to 11 I won't be free.