}
public void build() {
- final Set<IndexedEntry> tokenEntryDatas = new HashSet<IndexedEntry>();
+ final Set<IndexedEntry> tokenIndexedEntries = new HashSet<IndexedEntry>();
final List<RowBase> rows = index.rows;
index.mainTokenCount = 0;
for (final TokenData tokenData : tokenToData.values()) {
- tokenEntryDatas.clear();
+ tokenIndexedEntries.clear();
final int indexIndex = index.sortedIndexEntries.size();
final int startRow = rows.size();
- final TokenRow tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry);
- rows.add(tokenRow);
- if (tokenRow.hasMainEntry) {
- index.mainTokenCount++;
- }
-// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
+ TokenRow tokenRow = null;
+
int numRows = 0; // off by one--doesn't count the token row!
// System.out.println("TOKEN: " + tokenData.token);
- for (final Map.Entry<EntryTypeName, List<IndexedEntry>> typeToEntry : tokenData.typeToEntries.entrySet()) {
- for (final IndexedEntry entryData : typeToEntry.getValue()) {
- if (entryData.index() == -1) {
- entryData.addToDictionary(dictionaryBuilder.dictionary);
- assert entryData.index() >= 0;
+ for (final Map.Entry<EntryTypeName, List<IndexedEntry>> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) {
+ for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) {
+
+ if (!indexedEntry.isValid) {
+ continue;
+ }
+
+ if (tokenRow == null) {
+// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
+ tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry);
+ rows.add(tokenRow);
+ if (tokenRow.hasMainEntry) {
+ index.mainTokenCount++;
+ }
+ }
+
+ if (indexedEntry.index() == -1) {
+ indexedEntry.addToDictionary(dictionaryBuilder.dictionary);
+ assert indexedEntry.index() >= 0;
}
- if (tokenEntryDatas.add(entryData)) {
- rows.add(new PairEntry.Row(entryData.index(), rows.size(), index));
- ++entryData.entry.entrySource.numEntries;
+ if (tokenIndexedEntries.add(indexedEntry)) {
+ rows.add(new PairEntry.Row(indexedEntry.index(), rows.size(), index));
+ ++indexedEntry.entry.entrySource.numEntries;
++numRows;
// System.out.print(" " + typeToEntry.getKey() + ": ");
package com.hughes.android.dictionary.parser.wiktionary;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.regex.Pattern;
-import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.engine.PairEntry;
import com.hughes.android.dictionary.engine.PairEntry.Pair;
import com.hughes.android.dictionary.parser.WikiTokenizer;
+import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.TranslationCallback;
+import com.hughes.util.ListUtil;
public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser {
final IndexBuilder[] indexBuilders;
- final Pattern[] namePatterns;
+ final Pattern[] langCodePatterns; // Indexed in parallel with indexBuilders and builders (onT loops p over 0..1).
+ PairEntry pairEntry = null; // Entry being assembled; non-null only between startEntry() and finishEntry().
+ IndexedEntry indexedEntry = null; // Wrapper around pairEntry; created and cleared together with it.
+ StringBuilder[] builders = null; // Two text accumulators (one per pair side); allocated in startEntry().
+
+ final Set<String> Ts = new LinkedHashSet<String>(Arrays.asList("t", "t+",
+ "t-", "tø", "apdx-t", "ttbc")); // Template names that parseSection hands to onT().
+
public EnTranslationToTranslationParser(final IndexBuilder[] indexBuilders,
- final Pattern[] namePatterns) {
+ final Pattern[] langCodePatterns) { // langCodePatterns[p] is matched against a template's first positional arg; a match routes it to indexBuilders[p].
this.indexBuilders = indexBuilders; // Both arrays are stored by reference, not copied.
- this.namePatterns = namePatterns;
+ this.langCodePatterns = langCodePatterns;
}
@Override
void removeUselessArgs(Map<String, String> namedArgs) { // Removes, in place, every key listed in EnParser.USELESS_WIKI_ARGS.
namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); // keySet() is a live view, so this mutates namedArgs itself.
}
-
+
+ /**
+ * Scans one section of wiki text and feeds its translation templates into pair entries.
+ *
+ * Only function (template) tokens are acted on: names contained in Ts are passed to
+ * onT(), "trans-top" opens an entry via startEntry() and "trans-bottom" closes it via
+ * finishEntry(). Nothing is parsed when EnParser.isIgnorableTitle() accepts the
+ * current page title.
+ *
+ * @param heading section heading; not read by this method
+ * @param text wiki markup of the section to tokenize
+ */
@Override
void parseSection(String heading, String text) {
if (EnParser.isIgnorableTitle(title)) {
+ // This branch used to be empty, so ignorable titles were parsed like any other page.
+ return;
}
final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
while (wikiTokenizer.nextToken() != null) {
- if (wikiTokenizer.isHeading()) {
- final String headerName = wikiTokenizer.headingWikiText();
- if (headerName.equals("Translations")) {
- //doTranslations(wikiTokenizer);
- }
- } else {
- // TODO: optimization: skip to next heading, or even skip to translations.
- }
- }
- }
-/*
- private void doTranslations(final WikiTokenizer wikiTokenizer) {
- String topLevelLang = null;
- boolean done = false;
- StringBuilder[] builders;
- while (wikiTokenizer.nextToken() != null) {
- if (wikiTokenizer.isHeading()) {
- wikiTokenizer.returnToLineStart();
- return;
- }
- if (done) {
- continue;
- }
-
- // Check whether we care about this line:
if (wikiTokenizer.isFunction()) {
- final String functionName = wikiTokenizer.functionName();
- final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
-
- if (functionName.equals("trans-top")) {
- if (wikiTokenizer.functionPositionArgs().size() >= 1) {
- builders = new StringBuilder[] {new StringBuilder(), new StringBuilder()};
- }
- } else if (functionName.equals("trans-bottom")) {
- builders = null;
- } else if (functionName.equals("trans-mid")) {
- } else if (functionName.equals("trans-see")) {
- } else if (functionName.startsWith("picdic")) {
- } else if (functionName.startsWith("checktrans")) {
- done = true;
- } else if (functionName.startsWith("ttbc")) {
- wikiTokenizer.nextLine();
- // TODO: would be great to handle ttbc
- // TODO: Check this: done = true;
- } else {
- LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
- }
- } else if (wikiTokenizer.isListItem()) {
- final String line = wikiTokenizer.listItemWikiText();
- // This line could produce an output...
-
- // First strip the language and check whether it matches.
- // And hold onto it for sub-lines.
- final int colonIndex = line.indexOf(":");
- if (colonIndex == -1) {
- continue;
- }
-
- final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex)));
- incrementCount("tCount:" + lang);
-
-
- final boolean appendLang;
- if (wikiTokenizer.listItemPrefix().length() == 1) {
- topLevelLang = lang;
- final boolean thisFind = langPattern.matcher(lang).find();
- if (!thisFind) {
- continue;
- }
- appendLang = !langPattern.matcher(lang).matches();
- } else if (topLevelLang == null) {
- continue;
- } else {
- // Two-level -- the only way we won't append is if this second level matches exactly.
- if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) {
- continue;
- }
- appendLang = !langPattern.matcher(lang).matches();
- }
-
- String rest = line.substring(colonIndex + 1).trim();
- if (rest.length() > 0) {
- doTranslationLine(line, appendLang ? lang : null, rest);
+ final String name = wikiTokenizer.functionName();
+ if (Ts.contains(name)) {
+ // Translation template: append its text to the open entry, if any.
+ onT(wikiTokenizer);
+ } else if (name.equals("trans-top")) {
+ // Start of a translation table; the raw token is passed along for log messages.
+ startEntry(title, wikiTokenizer.token());
+ } else if (name.equals("trans-bottom")) {
+ // End of a translation table: commit whatever was collected.
+ finishEntry(title);
}
}
}
}
- private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) {
- state = State.TRANSLATION_LINE;
- // Good chance we'll actually file this one...
- final PairEntry pairEntry = new PairEntry(entrySource);
- final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-
- final StringBuilder foreignText = new StringBuilder();
- appendAndIndexWikiCallback.reset(foreignText, indexedEntry);
- appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
-
- if (foreignText.length() == 0) {
- LOG.warning("Empty foreignText: " + line);
- incrementCount("WARNING: Empty foreignText" );
- return;
+ final TranslationCallback<EnTranslationToTranslationParser> translationCallback = new TranslationCallback<EnTranslationToTranslationParser>(); // Single handler instance shared by every template name in Ts.
+
+ final AppendAndIndexWikiCallback<EnTranslationToTranslationParser> appendAndIndexWikiCallback = new AppendAndIndexWikiCallback<EnTranslationToTranslationParser>(
+ this);
+ { // Instance initializer: registers translationCallback under each name in Ts.
+ for (final String t : Ts) {
+ appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback);
+ }
+ }
+
+ /**
+ * Handles one translation template (a function whose name is in Ts).
+ *
+ * The template's first positional argument is treated as a language code. For each
+ * side p (0 and 1) whose langCodePatterns[p] fully matches that code, the template is
+ * forwarded to appendAndIndexWikiCallback with builders[p] and indexBuilders[p]
+ * selected as its output targets.
+ *
+ * Templates with no language code, or seen while no entry is open (builders is only
+ * allocated by startEntry()), are skipped instead of raising NullPointerException.
+ *
+ * @param wikiTokenizer tokenizer positioned on the template token
+ */
+ private void onT(WikiTokenizer wikiTokenizer) {
+ final List<String> args = wikiTokenizer.functionPositionArgs();
+ final String langCode = ListUtil.get(args, 0);
+ if (langCode == null) {
+ // Assumes ListUtil.get yields null for a missing argument -- TODO confirm.
+ // Pattern.matcher(null) would throw, so there is nothing to match against.
+ return;
+ }
+ for (int p = 0; p < 2; ++p) {
+ if (langCodePatterns[p].matcher(langCode).matches()) {
+ if (builders == null) {
+ // The template sits outside trans-top/trans-bottom, so no entry is open to receive text.
+ LOG.warning("Translation template outside trans-top/trans-bottom: " + wikiTokenizer.token() + ", title=" + title);
+ return;
+ }
+ // NOTE(review): only builder and indexBuilder are retargeted here; confirm the
+ // callback does not also need the current indexedEntry.
+ appendAndIndexWikiCallback.builder = builders[p];
+ appendAndIndexWikiCallback.indexBuilder = indexBuilders[p];
+ appendAndIndexWikiCallback.onFunction(wikiTokenizer,
+ wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(),
+ wikiTokenizer.functionNamedArgs());
}
-
- if (lang != null) {
- foreignText.insert(0, String.format("(%s) ", lang));
+ }
+ }
+
+ /**
+ * Opens a fresh pair entry for a translation table.
+ *
+ * If an entry is still open (finishEntry() was not called since the last
+ * startEntry()), a warning is logged and that entry is finished first. Afterwards
+ * pairEntry, indexedEntry and the two builders are newly allocated.
+ *
+ * @param title page title, passed on to finishEntry() when an open entry is flushed
+ * @param func raw text of the template that triggered the call; used only in the warning
+ */
+ void startEntry(final String title, final String func) {
+ if (pairEntry != null) {
+ // The previous table was never closed: flush it rather than discarding its text.
+ LOG.warning("startEntry() twice: " + func);
+ finishEntry(title);
}
- StringBuilder englishText = new StringBuilder();
-
- englishText.append(title);
- if (sense != null) {
- englishText.append(" (").append(sense).append(")");
- enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
+ pairEntry = new PairEntry(entrySource);
+ indexedEntry = new IndexedEntry(pairEntry);
+ builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() };
+ }
+
+ /**
+ * Closes the entry opened by startEntry().
+ *
+ * When both builders collected text, the two strings are added to pairEntry as one
+ * Pair and indexedEntry is flagged valid; otherwise the entry is left without a pair
+ * and not flagged. In either case the per-entry state (pairEntry, indexedEntry,
+ * builders) is reset to null. Calling this with no open entry only logs a warning.
+ *
+ * @param title page title; used only in the warning message
+ */
+ void finishEntry(final String title) {
+ if (pairEntry == null) {
+ // No entry is open, e.g. a trans-bottom without a preceding trans-top.
+ LOG.warning("finishEntry() twice: " + title);
+ return;
}
- if (pos != null) {
- englishText.append(" (").append(pos.toLowerCase()).append(")");
+ final String lang1 = builders[0].toString();
+ final String lang2 = builders[1].toString();
+ if (lang1.length() > 0 && lang2.length() > 0) {
+ // Only a table with text on both sides forms a usable pair.
+ pairEntry.pairs.add(new Pair(lang1, lang2));
+ indexedEntry.isValid = true;
}
- enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
- final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap);
- pairEntry.pairs.add(pair);
- if (!pairsAdded.add(pair.toString())) {
- LOG.warning("Duplicate pair: " + pair.toString());
- incrementCount("WARNING: Duplicate pair" );
- }
+ pairEntry = null;
+ indexedEntry = null;
+ builders = null;
}
- */
}
\ No newline at end of file