package com.hughes.android.dictionary.parser.wiktionary;
import java.util.Arrays;
+import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.engine.PairEntry;
-import com.hughes.android.dictionary.engine.PairEntry.Pair;
import com.hughes.android.dictionary.parser.WikiTokenizer;
import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.TranslationCallback;
import com.hughes.util.ListUtil;
+/**
+ * Wiktionary parser that walks the wiki text of a section and, for each
+ * translation table it encounters, gathers the translation templates whose
+ * language code matches one of two configured patterns into a single
+ * {@link PairEntry}.
+ *
+ * <p>An entry is opened by {@code trans-top}, {@code checktrans-top} or
+ * {@code checktrans} and closed by {@code trans-bottom}. The text collected
+ * for pattern 0 and pattern 1 becomes the two sides of the resulting pair;
+ * a pair is only emitted when both sides are non-empty and the same pair has
+ * not been emitted before by this instance.
+ *
+ * <p>The parser keeps the currently open entry in mutable fields.
+ */
public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser {
-
- final IndexBuilder[] indexBuilders;
+
+    /** Index builders; element {@code p} is used for text matched by {@code langCodePatterns[p]}. */
+    final List<IndexBuilder> indexBuilders;
+    /** Language-code patterns; this class consults elements 0 and 1 only. */
final Pattern[] langCodePatterns;
+    /** Entry currently being assembled, or {@code null} while no entry is open. */
PairEntry pairEntry = null;
+    /** Index wrapper around {@code pairEntry}; created and cleared together with it. */
IndexedEntry indexedEntry = null;
- StringBuilder[] builders = null;
-
- final Set<String> Ts = new LinkedHashSet<String>(Arrays.asList("t", "t+",
- "t-", "tø", "apdx-t", "ttbc"));
-
- public EnTranslationToTranslationParser(final IndexBuilder[] indexBuilders,
- final Pattern[] langCodePatterns) {
- this.indexBuilders = indexBuilders;
- this.langCodePatterns = langCodePatterns;
+    /** Text accumulators for the open entry, one per language pattern; {@code null} while no entry is open. */
+    StringBuilder[] builders = null;
+    /** Every pair emitted so far by this instance; used to suppress duplicates. */
+    final HashSet<PairEntry.Pair> allPairs = new HashSet<>();
+
+    /** Name constant for this parser (its consumers are outside this file). */
+    public static final String NAME = "EnTranslationToTranslation";
+
+    /** Names of the wiki functions that are treated as translation templates. */
+    final Set<String> Ts = new LinkedHashSet<>(Arrays.asList("t", "t+",
+            "t-", "tø", "apdx-t", "ttbc"));
+
+    /**
+     * Creates a parser bound to the given index builders and language patterns.
+     *
+     * @param indexBuilders    builders indexed in parallel with {@code langCodePatterns};
+     *                         positions 0 and 1 are read
+     * @param langCodePatterns patterns matched against a template's language code;
+     *                         positions 0 and 1 are read
+     */
+    public EnTranslationToTranslationParser(final List<IndexBuilder> indexBuilders,
+            final Pattern[] langCodePatterns) {
+        this.indexBuilders = indexBuilders;
+        this.langCodePatterns = langCodePatterns;
}
-
+
+    /** Removes, in place, every key listed in {@code EnParser.USELESS_WIKI_ARGS} from {@code namedArgs}. */
@Override
void removeUselessArgs(Map<String, String> namedArgs) {
- namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
+        namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS);
}
-
+
+    /**
+     * Scans one section's wiki text for translation tables.
+     *
+     * <p>Does nothing when {@code EnParser.isIgnorableTitle} rejects the current
+     * title. Otherwise the text is dispatched through a tokenizer callback that
+     * starts an entry on the table-opening templates, feeds translation templates
+     * to {@code onT}, and finishes the entry on {@code trans-bottom}. List items
+     * are re-dispatched through the same callback so templates nested in them are
+     * seen too. An entry still open when the text ends is finished with a warning.
+     *
+     * @param heading section heading; not used by this implementation
+     * @param text    wiki text of the section
+     */
@Override
void parseSection(String heading, String text) {
- if (EnParser.isIgnorableTitle(title)) {
- return;
- }
- final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
- while (wikiTokenizer.nextToken() != null) {
- if (wikiTokenizer.isFunction()) {
- final String name = wikiTokenizer.functionName();
- if (Ts.contains(name)) {
- onT(wikiTokenizer);
- } else if (name.equals("trans-top")) {
- startEntry(title, wikiTokenizer.token());
- } else if (name.equals("trans-bottom")) {
+        if (EnParser.isIgnorableTitle(title)) {
+            return;
+        }
+        final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() {
+            @Override
+            public void onFunction(WikiTokenizer wikiTokenizer, String name,
+                    List<String> functionPositionArgs,
+                    Map<String, String> functionNamedArgs) {
+                if (Ts.contains(name)) {
+                    onT(wikiTokenizer);
+                } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) {
+                    startEntry(title, wikiTokenizer.token());
+                } else if (name.equals("trans-bottom")) {
+                    finishEntry(title);
+                }
+            }
+
+            @Override
+            public void onListItem(WikiTokenizer wikiTokenizer) {
+                // Recurse into the list item's own wiki text with this same callback.
+                WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this);
+            }
+        };
+        WikiTokenizer.dispatch(text, true, callback);
+
+        // A table that was opened but never closed is flushed here.
+        if (builders != null) {
+            LOG.warning("unended translations: " + title);
finishEntry(title);
- }
}
- }
}
-
- final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<EnTranslationToTranslationParser>();
-
- final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<EnTranslationToTranslationParser>(
- this);
- {
- for (final String t : Ts) {
- appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
+
+    /** Callback registered for every template name in {@code Ts}. */
+    final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<>();
+
+    /** Callback that receives each matching translation template; its builder, index builder and indexed entry are retargeted by this class. */
+    final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<>(
+            this);
+    // Instance initializer: route every translation template name to translationCallback.
+    {
+        for (final String t : Ts) {
+            appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
+        }
}
- }
-
- private void onT(WikiTokenizer wikiTokenizer) {
- final List<String> args = wikiTokenizer.functionPositionArgs();
- final String langCode = ListUtil.get(args, 0);
- for (int p = 0; p < 2; ++p) {
- if (langCodePatterns[p].matcher(langCode).matches()) {
- appendAndIndexWikiCallback.builder = builders[p];
- appendAndIndexWikiCallback.indexBuilder = indexBuilders[p];
- appendAndIndexWikiCallback.onFunction(wikiTokenizer,
- wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(),
- wikiTokenizer.functionNamedArgs());
- }
+
+    /**
+     * Handles one translation template.
+     *
+     * <p>If no entry is open, one is started (with a warning) so the template is
+     * not lost. The first positional argument is taken as the language code; a
+     * template without one is logged and skipped. For each of the two patterns
+     * that matches the code, the template is forwarded to
+     * {@code appendAndIndexWikiCallback} with that language's builder and index
+     * builder selected; successive translations on one side are separated by
+     * {@code ", "}.
+     *
+     * @param wikiTokenizer tokenizer positioned on the translation template
+     */
+    private void onT(WikiTokenizer wikiTokenizer) {
+        if (builders == null) {
+            LOG.warning("{{t...}} section outside of {{trans-top}}: " + title);
+            startEntry(title, "QUICKDIC_OUTSIDE");
+        }
+
+        final List<String> args = wikiTokenizer.functionPositionArgs();
+        final String langCode = ListUtil.get(args, 0);
+        if (langCode == null) {
+            LOG.warning("Missing langCode: " + wikiTokenizer.token());
+            return;
+        }
+        for (int p = 0; p < 2; ++p) {
+            if (langCodePatterns[p].matcher(langCode).matches()) {
+                appendAndIndexWikiCallback.builder = builders[p];
+                if (appendAndIndexWikiCallback.builder.length() > 0) {
+                    appendAndIndexWikiCallback.builder.append(", ");
+                }
+                appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p);
+                appendAndIndexWikiCallback.onFunction(wikiTokenizer,
+                        wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(),
+                        wikiTokenizer.functionNamedArgs());
+            }
+        }
 }
- }
+    /**
+     * Opens a new entry, finishing the previous one first (with a warning) if it
+     * is still open.
+     *
+     * @param title page title, used for logging and passed on to {@code finishEntry}
+     * @param func  text identifying what triggered the start; used only in the warning
+     */
void startEntry(final String title, final String func) {
- if (pairEntry != null) {
- LOG.warning("startEntry() twice" + func);
- finishEntry(title);
- }
-
- pairEntry = new PairEntry(entrySource);
- indexedEntry = new IndexedEntry(pairEntry);
- builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() };
+        if (pairEntry != null) {
+            LOG.warning("startEntry() twice: " + title + ", " + func);
+            finishEntry(title);
+        }
+
+        pairEntry = new PairEntry(entrySource);
+        indexedEntry = new IndexedEntry(pairEntry);
+        builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() };
+        // Point the shared callback at the entry that was just opened.
+        appendAndIndexWikiCallback.indexedEntry = indexedEntry;
}
-
+
+    /**
+     * Closes the open entry. When both language sides collected text and the
+     * resulting pair has not been emitted before, the pair is added to the entry
+     * and the indexed entry is marked valid. The per-entry state is cleared in
+     * every case. Calling this with no open entry only logs a warning.
+     *
+     * @param title page title, used for logging
+     */
void finishEntry(final String title) {
- if (pairEntry == null) {
- LOG.warning("finalizeEntry() twice" + title);
- return;
- }
- final String lang1 = builders[0].toString();
- final String lang2 = builders[1].toString();
- if (lang1.length() > 0 && lang2.length() > 0) {
- pairEntry.pairs.add(new Pair(lang1, lang2));
- indexedEntry.isValid = true;
- }
-
- pairEntry = null;
- indexedEntry = null;
- builders = null;
+        if (pairEntry == null) {
+            LOG.warning("finalizeEntry() twice: " + title);
+            return;
+        }
+        final String lang1 = builders[0].toString();
+        final String lang2 = builders[1].toString();
+        if (lang1.length() > 0 && lang2.length() > 0) {
+            final PairEntry.Pair newPair = new PairEntry.Pair(lang1, lang2);
+            // Set.add returns false for an element already present, so a single
+            // call both detects and records duplicates. The same instance is
+            // stored in the entry instead of allocating an identical copy.
+            // NOTE(review): assumes PairEntry.Pair is not mutated after construction - confirm.
+            if (allPairs.add(newPair)) {
+                pairEntry.pairs.add(newPair);
+                indexedEntry.isValid = true;
+            }
+        }
+
+        pairEntry = null;
+        indexedEntry = null;
+        builders = null;
}
- }
\ No newline at end of file
+}