package com.hughes.android.dictionary.parser.wiktionary;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.regex.Pattern;
-import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.engine.PairEntry;
-import com.hughes.android.dictionary.engine.PairEntry.Pair;
import com.hughes.android.dictionary.parser.WikiTokenizer;
+import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.TranslationCallback;
+import com.hughes.util.ListUtil;
public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser {
-
- final IndexBuilder[] indexBuilders;
- final Pattern[] namePatterns;
-
- public EnTranslationToTranslationParser(final IndexBuilder[] indexBuilders,
- final Pattern[] namePatterns) {
- this.indexBuilders = indexBuilders;
- this.namePatterns = namePatterns;
+
+ // One IndexBuilder per target language; index p pairs with langCodePatterns[p].
+ final List<IndexBuilder> indexBuilders;
+ // Patterns matching the wiki language codes of the two languages being extracted.
+ final Pattern[] langCodePatterns;
+
+ // Per-entry mutable state: non-null only between startEntry() and finishEntry().
+ PairEntry pairEntry = null;
+ IndexedEntry indexedEntry = null;
+ // builders[p] accumulates the rendered translation text for language p.
+ StringBuilder[] builders = null;
+ // Every pair emitted so far, used to suppress exact duplicates across entries.
+ final HashSet<PairEntry.Pair> allPairs = new HashSet<>();
+
+ public static final String NAME = "EnTranslationToTranslation";
+
+ // Translation template names ({{t}}, {{t+}}, ...) routed to onT().
+ final Set<String> Ts = new LinkedHashSet<>(Arrays.asList("t", "t+",
+ "t-", "tø", "apdx-t", "ttbc"));
+
+ public EnTranslationToTranslationParser(final List<IndexBuilder> indexBuilders,
+ final Pattern[] langCodePatterns) {
+ this.indexBuilders = indexBuilders;
+ this.langCodePatterns = langCodePatterns;
 }
-
+
 @Override
 void removeUselessArgs(Map<String, String> namedArgs) {
- namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
+ // Strip boilerplate template arguments that carry no translation content.
+ namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
 }
 @Override
 void parseSection(String heading, String text) {
- if (EnParser.isIgnorableTitle(title)) {
- return;
- }
- final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
- while (wikiTokenizer.nextToken() != null) {
- if (wikiTokenizer.isHeading()) {
- final String headerName = wikiTokenizer.headingWikiText();
- if (headerName.equals("Translations")) {
- doTranslations(wikiTokenizer);
- }
- } else {
- // TODO: optimization: skip to next heading, or even skip to translations.
+ if (EnParser.isIgnorableTitle(title)) {
+ return;
+ }
+ // Event-driven parse: template functions drive entry start/finish and
+ // translation collection; everything else is ignored (DoNothingCallback).
+ final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() {
+ @Override
+ public void onFunction(WikiTokenizer wikiTokenizer, String name,
+ List<String> functionPositionArgs,
+ Map<String, String> functionNamedArgs) {
+ //System.out.println(wikiTokenizer.token());
+ if (Ts.contains(name)) {
+ onT(wikiTokenizer);
+ } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) {
+ startEntry(title, wikiTokenizer.token());
+ } else if (name.equals("trans-bottom")) {
+ finishEntry(title);
+ }
+ }
+
+ @Override
+ public void onListItem(WikiTokenizer wikiTokenizer) {
+ // Translation templates live inside list items; recurse into the item text.
+ WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this);
+ }
+ };
+ WikiTokenizer.dispatch(text, true, callback);
+
+ // A {{trans-top}} without a matching {{trans-bottom}} leaves builders set;
+ // flush it so the accumulated translations are not lost.
+ if (builders != null) {
+ LOG.warning("unended translations: " + title);
+ finishEntry(title);
 }
- }
 }
- private void doTranslations(final WikiTokenizer wikiTokenizer) {
- String topLevelLang = null;
- boolean done = false;
- StringBuilder[] builders;
- while (wikiTokenizer.nextToken() != null) {
- if (wikiTokenizer.isHeading()) {
- wikiTokenizer.returnToLineStart();
- return;
+ // Renders a single {{t...}} template into the current builder; shared by all
+ // template names in Ts (wired up in the instance initializer below).
+ final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<>();
+
+ final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<>(
+ this);
+ {
+ for (final String t : Ts) {
+ appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
 }
- if (done) {
- continue;
+ }
+
+ // Handles one {{t...}} translation template: if its language code matches
+ // langCodePatterns[p], the rendered text is appended (comma-separated) to
+ // builders[p] and indexed into indexBuilders.get(p).
+ private void onT(WikiTokenizer wikiTokenizer) {
+ if (builders == null) {
+ // Tolerate templates outside {{trans-top}}...{{trans-bottom}} by opening
+ // a synthetic entry rather than dropping the translation.
+ LOG.warning("{{t...}} section outside of {{trans-top}}: " + title);
+ startEntry(title, "QUICKDIC_OUTSIDE");
 }
-
- // Check whether we care about this line:
- if (wikiTokenizer.isFunction()) {
- final String functionName = wikiTokenizer.functionName();
- final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
-
- if (functionName.equals("trans-top")) {
- if (wikiTokenizer.functionPositionArgs().size() >= 1) {
- builders = new StringBuilder[] {new StringBuilder(), new StringBuilder()};
- }
- } else if (functionName.equals("trans-bottom")) {
- builders = null;
- } else if (functionName.equals("trans-mid")) {
- } else if (functionName.equals("trans-see")) {
- } else if (functionName.startsWith("picdic")) {
- } else if (functionName.startsWith("checktrans")) {
- done = true;
- } else if (functionName.startsWith("ttbc")) {
- wikiTokenizer.nextLine();
- // TODO: would be great to handle ttbc
- // TODO: Check this: done = true;
- } else {
- LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
- }
- } else if (wikiTokenizer.isListItem()) {
- final String line = wikiTokenizer.listItemWikiText();
- // This line could produce an output...
-
- // First strip the language and check whether it matches.
- // And hold onto it for sub-lines.
- final int colonIndex = line.indexOf(":");
- if (colonIndex == -1) {
- continue;
- }
-
- final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex)));
- incrementCount("tCount:" + lang);
-
-
- final boolean appendLang;
- if (wikiTokenizer.listItemPrefix().length() == 1) {
- topLevelLang = lang;
- final boolean thisFind = langPattern.matcher(lang).find();
- if (!thisFind) {
- continue;
- }
- appendLang = !langPattern.matcher(lang).matches();
- } else if (topLevelLang == null) {
- continue;
- } else {
- // Two-level -- the only way we won't append is if this second level matches exactly.
- if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) {
- continue;
+
+ // First positional arg of {{t|<langCode>|...}} selects the target language.
+ final List<String> args = wikiTokenizer.functionPositionArgs();
+ final String langCode = ListUtil.get(args, 0);
+ if (langCode == null) {
+ LOG.warning("Missing langCode: " + wikiTokenizer.token());
+ return;
+ }
+ // Try both output languages; a code could in principle match either pattern.
+ for (int p = 0; p < 2; ++p) {
+ if (langCodePatterns[p].matcher(langCode).matches()) {
+ appendAndIndexWikiCallback.builder = builders[p];
+ if (appendAndIndexWikiCallback.builder.length() > 0) {
+ appendAndIndexWikiCallback.builder.append(", ");
+ }
+ appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p);
+ appendAndIndexWikiCallback.onFunction(wikiTokenizer,
+ wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(),
+ wikiTokenizer.functionNamedArgs());
 }
- appendLang = !langPattern.matcher(lang).matches();
- }
-
- String rest = line.substring(colonIndex + 1).trim();
- if (rest.length() > 0) {
- doTranslationLine(line, appendLang ? lang : null, rest);
- }
 }
- }
 }
-
- private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) {
- state = State.TRANSLATION_LINE;
- // Good chance we'll actually file this one...
- final PairEntry pairEntry = new PairEntry(entrySource);
- final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-
- final StringBuilder foreignText = new StringBuilder();
- appendAndIndexWikiCallback.reset(foreignText, indexedEntry);
- appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-
- if (foreignText.length() == 0) {
- LOG.warning("Empty foreignText: " + line);
- incrementCount("WARNING: Empty foreignText" );
- return;
- }
-
- if (lang != null) {
- foreignText.insert(0, String.format("(%s) ", lang));
- }
-
- StringBuilder englishText = new StringBuilder();
-
- englishText.append(title);
- if (sense != null) {
- englishText.append(" (").append(sense).append(")");
- enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
- }
- if (pos != null) {
- englishText.append(" (").append(pos.toLowerCase()).append(")");
- }
- enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
-
- final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
- pairEntry.pairs.add(pair);
- if (!pairsAdded.add(pair.toString())) {
- LOG.warning("Duplicate pair: " + pair.toString());
- incrementCount("WARNING: Duplicate pair" );
- }
+
+ // Opens a new translation entry (on {{trans-top}} / {{checktrans...}}):
+ // allocates the pair/index entry and the two per-language text builders.
+ // A second startEntry() without finishEntry() flushes the previous entry first.
+ void startEntry(final String title, final String func) {
+ if (pairEntry != null) {
+ LOG.warning("startEntry() twice: " + title + ", " + func);
+ finishEntry(title);
+ }
+
+ pairEntry = new PairEntry(entrySource);
+ indexedEntry = new IndexedEntry(pairEntry);
+ builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() };
+ appendAndIndexWikiCallback.indexedEntry = indexedEntry;
+ }
+
+ // Closes the entry opened by startEntry(): emits the accumulated lang1/lang2
+ // pair when both sides are non-empty and not a global duplicate, then resets
+ // the per-entry state.
+ void finishEntry(final String title) {
+ if (pairEntry == null) {
+ // No matching startEntry() -- e.g. a stray {{trans-bottom}}, or a
+ // second trans-bottom after the entry was already flushed.
+ LOG.warning("finishEntry() without startEntry(): " + title);
+ return;
+ }
+ final String lang1 = builders[0].toString();
+ final String lang2 = builders[1].toString();
+ // Only emit when both languages produced text for this entry.
+ if (lang1.length() > 0 && lang2.length() > 0) {
+ final PairEntry.Pair newPair = new PairEntry.Pair(lang1, lang2);
+ // Set.add() returns false for duplicates, so this records and
+ // de-duplicates in one step (brute-force global duplicate check).
+ if (allPairs.add(newPair)) {
+ pairEntry.pairs.add(newPair);
+ indexedEntry.isValid = true;
+ }
+ }
+
+ pairEntry = null;
+ indexedEntry = null;
+ builders = null;
+ }
- }
\ No newline at end of file
+}