X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FEnTranslationToTranslationParser.java;h=042f0fac3289280cb91db44ac86f5404d09fba6d;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=25bd7a6417e7ed33ad6fea2fc1fc8bac9d2d14c3;hpb=7c9c09f3f5513f13e8ee337c910b9f81704ce119;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java index 25bd7a6..042f0fa 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java @@ -14,166 +14,149 @@ package com.hughes.android.dictionary.parser.wiktionary; +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; -import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.PairEntry; -import com.hughes.android.dictionary.engine.PairEntry.Pair; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.TranslationCallback; +import com.hughes.util.ListUtil; public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser { - - final IndexBuilder[] indexBuilders; - final Pattern[] namePatterns; - - public EnTranslationToTranslationParser(final IndexBuilder[] indexBuilders, - final Pattern[] namePatterns) { - this.indexBuilders = indexBuilders; - this.namePatterns = namePatterns; + + final List indexBuilders; + final Pattern[] langCodePatterns; + + PairEntry pairEntry = null; + IndexedEntry indexedEntry = null; + StringBuilder[] builders = null; + final HashSet allPairs = new HashSet<>(); + + public static final String NAME = "EnTranslationToTranslation"; + + final Set Ts = new LinkedHashSet<>(Arrays.asList("t", "t+", + "t-", "tø", "apdx-t", "ttbc")); + + public EnTranslationToTranslationParser(final List indexBuilders, + final Pattern[] langCodePatterns) { + this.indexBuilders = indexBuilders; + this.langCodePatterns = langCodePatterns; } - + @Override void removeUselessArgs(Map namedArgs) { - namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); } @Override void parseSection(String heading, String text) { - if (EnParser.isIgnorableTitle(title)) { - return; - } - final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - final String headerName = wikiTokenizer.headingWikiText(); - if (headerName.equals("Translations")) { - doTranslations(wikiTokenizer); - } - } else { - // TODO: optimization: skip to next heading, or even skip to translations. + if (EnParser.isIgnorableTitle(title)) { + return; + } + final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() { + @Override + public void onFunction(WikiTokenizer wikiTokenizer, String name, + List functionPositionArgs, + Map functionNamedArgs) { + //System.out.println(wikiTokenizer.token()); + if (Ts.contains(name)) { + onT(wikiTokenizer); + } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) { + startEntry(title, wikiTokenizer.token()); + } else if (name.equals("trans-bottom")) { + finishEntry(title); + } + } + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this); + } + }; + WikiTokenizer.dispatch(text, true, callback); + + if (builders != null) { + LOG.warning("unended translations: " + title); + finishEntry(title); } - } } - private void doTranslations(final WikiTokenizer wikiTokenizer) { - String topLevelLang = null; - boolean done = false; - StringBuilder[] builders; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - wikiTokenizer.returnToLineStart(); - return; + final TranslationCallback translationCallback = new TranslationCallback<>(); + + final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<>( + this); + { + for (final String t : Ts) { + appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback); } - if (done) { - continue; + } + + private void onT(WikiTokenizer wikiTokenizer) { + if (builders == null) { + LOG.warning("{{t...}} section outside of {{trans-top}}: " + title); + startEntry(title, "QUICKDIC_OUTSIDE"); } - - // Check whether we care about this line: - if (wikiTokenizer.isFunction()) { - final String functionName = wikiTokenizer.functionName(); - final List positionArgs = wikiTokenizer.functionPositionArgs(); - - if (functionName.equals("trans-top")) { - if (wikiTokenizer.functionPositionArgs().size() >= 1) { - builders = new StringBuilder[] {new StringBuilder(), new StringBuilder()}; - } - } else if (functionName.equals("trans-bottom")) { - builders = null; - } else if (functionName.equals("trans-mid")) { - } else if (functionName.equals("trans-see")) { - } else if (functionName.startsWith("picdic")) { - } else if (functionName.startsWith("checktrans")) { - done = true; - } else if (functionName.startsWith("ttbc")) { - wikiTokenizer.nextLine(); - // TODO: would be great to handle ttbc - // TODO: Check this: done = true; - } else { - LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isListItem()) { - final String line = wikiTokenizer.listItemWikiText(); - // This line could produce an output... - - // First strip the language and check whether it matches. - // And hold onto it for sub-lines. - final int colonIndex = line.indexOf(":"); - if (colonIndex == -1) { - continue; - } - - final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); - incrementCount("tCount:" + lang); - - - final boolean appendLang; - if (wikiTokenizer.listItemPrefix().length() == 1) { - topLevelLang = lang; - final boolean thisFind = langPattern.matcher(lang).find(); - if (!thisFind) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } else if (topLevelLang == null) { - continue; - } else { - // Two-level -- the only way we won't append is if this second level matches exactly. - if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { - continue; + + final List args = wikiTokenizer.functionPositionArgs(); + final String langCode = ListUtil.get(args, 0); + if (langCode == null) { + LOG.warning("Missing langCode: " + wikiTokenizer.token()); + return; + } + for (int p = 0; p < 2; ++p) { + if (langCodePatterns[p].matcher(langCode).matches()) { + appendAndIndexWikiCallback.builder = builders[p]; + if (appendAndIndexWikiCallback.builder.length() > 0) { + appendAndIndexWikiCallback.builder.append(", "); + } + appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p); + appendAndIndexWikiCallback.onFunction(wikiTokenizer, + wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(), + wikiTokenizer.functionNamedArgs()); } - appendLang = !langPattern.matcher(lang).matches(); - } - - String rest = line.substring(colonIndex + 1).trim(); - if (rest.length() > 0) { - doTranslationLine(line, appendLang ? lang : null, rest); - } } - } } - - private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) { - state = State.TRANSLATION_LINE; - // Good chance we'll actually file this one... - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - final StringBuilder foreignText = new StringBuilder(); - appendAndIndexWikiCallback.reset(foreignText, indexedEntry); - appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - - if (foreignText.length() == 0) { - LOG.warning("Empty foreignText: " + line); - incrementCount("WARNING: Empty foreignText" ); - return; - } - - if (lang != null) { - foreignText.insert(0, String.format("(%s) ", lang)); - } - - StringBuilder englishText = new StringBuilder(); - - englishText.append(title); - if (sense != null) { - englishText.append(" (").append(sense).append(")"); - enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); - } - if (pos != null) { - englishText.append(" (").append(pos.toLowerCase()).append(")"); - } - enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); - pairEntry.pairs.add(pair); - if (!pairsAdded.add(pair.toString())) { - LOG.warning("Duplicate pair: " + pair.toString()); - incrementCount("WARNING: Duplicate pair" ); - } + + void startEntry(final String title, final String func) { + if (pairEntry != null) { + LOG.warning("startEntry() twice: " + title + ", " + func); + finishEntry(title); + } + + pairEntry = new PairEntry(entrySource); + indexedEntry = new IndexedEntry(pairEntry); + builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() }; + appendAndIndexWikiCallback.indexedEntry = indexedEntry; + } + + void finishEntry(final String title) { + if (pairEntry == null) { + LOG.warning("finalizeEntry() twice: " + title); + return; + } + final String lang1 = builders[0].toString(); + final String lang2 = builders[1].toString(); + if (lang1.length() > 0 && lang2.length() > 0) { + final PairEntry.Pair newPair = new PairEntry.Pair(lang1, lang2); + // brute-force approach to prevent adding duplicates + if (!allPairs.contains(newPair)) + { + allPairs.add(newPair); + pairEntry.pairs.add(new PairEntry.Pair(lang1, lang2)); + indexedEntry.isValid = true; + } + } + + pairEntry = null; + indexedEntry = null; + builders = null; } - } \ No newline at end of file +}