From: Thad Hughes Date: Wed, 9 May 2012 22:09:04 +0000 (-0700) Subject: Unit tests working, looks like I'd been revamping the parsers. X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=8bd105dedc47aaa0957601d59808d44333dd7e9c Unit tests working, looks like I'd been revamping the parsers. --- diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 0d11191..0424d0b 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -44,31 +44,41 @@ public class IndexBuilder { } public void build() { - final Set tokenEntryDatas = new HashSet(); + final Set tokenIndexedEntries = new HashSet(); final List rows = index.rows; index.mainTokenCount = 0; for (final TokenData tokenData : tokenToData.values()) { - tokenEntryDatas.clear(); + tokenIndexedEntries.clear(); final int indexIndex = index.sortedIndexEntries.size(); final int startRow = rows.size(); - final TokenRow tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); - rows.add(tokenRow); - if (tokenRow.hasMainEntry) { - index.mainTokenCount++; - } -// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); + TokenRow tokenRow = null; + int numRows = 0; // off by one--doesn't count the token row! // System.out.println("TOKEN: " + tokenData.token); - for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { - for (final IndexedEntry entryData : typeToEntry.getValue()) { - if (entryData.index() == -1) { - entryData.addToDictionary(dictionaryBuilder.dictionary); - assert entryData.index() >= 0; + for (final Map.Entry> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) { + for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) { + + if (!indexedEntry.isValid) { + continue; + } + + if (tokenRow == null) { +// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); + if (tokenRow.hasMainEntry) { + index.mainTokenCount++; + } + } + + if (indexedEntry.index() == -1) { + indexedEntry.addToDictionary(dictionaryBuilder.dictionary); + assert indexedEntry.index() >= 0; } - if (tokenEntryDatas.add(entryData)) { - rows.add(new PairEntry.Row(entryData.index(), rows.size(), index)); - ++entryData.entry.entrySource.numEntries; + if (tokenIndexedEntries.add(indexedEntry)) { + rows.add(new PairEntry.Row(indexedEntry.index(), rows.size(), index)); + ++indexedEntry.entry.entrySource.numEntries; ++numRows; // System.out.print(" " + typeToEntry.getKey() + ": "); diff --git a/src/com/hughes/android/dictionary/engine/IndexedEntry.java b/src/com/hughes/android/dictionary/engine/IndexedEntry.java index 6f1b8da..3c0b168 100644 --- a/src/com/hughes/android/dictionary/engine/IndexedEntry.java +++ b/src/com/hughes/android/dictionary/engine/IndexedEntry.java @@ -17,11 +17,13 @@ package com.hughes.android.dictionary.engine; import com.hughes.util.IndexedObject; public class IndexedEntry extends IndexedObject { + AbstractEntry entry; + public boolean isValid = false; + public IndexedEntry(final AbstractEntry entry) { super(-1); this.entry = entry; } - AbstractEntry entry; public void addToDictionary(Dictionary dictionary) { assert index == -1; diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index edc0ce0..c0ea59d 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -155,6 +155,7 @@ public class DictFileParser implements Parser { pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); } final IndexedEntry entryData = new IndexedEntry(pairEntry); + entryData.isValid = true; for (int l = 0; l < 2; ++l) { // alreadyDone.clear(); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java index 39dfea0..f89b710 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java @@ -214,6 +214,7 @@ public final class EnForeignParser extends EnParser { final PairEntry pairEntry = new PairEntry(entrySource); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + indexedEntry.isValid = true; entryIsFormOfSomething = false; final StringBuilder englishBuilder = new StringBuilder(); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java index eae2a80..3ace3b1 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java @@ -34,7 +34,7 @@ class EnFunctionCallbacks { static final Map> DEFAULT = new LinkedHashMap>(); static { - FunctionCallback callback = new TranslationCallback(); + FunctionCallback callback = new TranslationCallback(); DEFAULT.put("t", callback); DEFAULT.put("t+", callback); DEFAULT.put("t-", callback); @@ -111,11 +111,11 @@ class EnFunctionCallbacks { // ------------------------------------------------------------------ - static final class TranslationCallback implements FunctionCallback { + static final class TranslationCallback implements FunctionCallback { @Override public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final Map namedArgs, final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { final String transliteration = namedArgs.remove("tr"); final String alt = namedArgs.remove("alt"); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java index 6c1bbec..c128055 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java @@ -193,6 +193,7 @@ public final class EnToTranslationParser extends EnParser { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(entrySource); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + indexedEntry.isValid = true; final StringBuilder foreignText = new StringBuilder(); appendAndIndexWikiCallback.reset(foreignText, indexedEntry); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java index 08306b0..d088266 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java @@ -14,33 +14,44 @@ package com.hughes.android.dictionary.parser.wiktionary; +import java.util.Arrays; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; -import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.TranslationCallback; +import com.hughes.util.ListUtil; public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser { final IndexBuilder[] indexBuilders; - final Pattern[] namePatterns; + final Pattern[] langCodePatterns; + PairEntry pairEntry = null; + IndexedEntry indexedEntry = null; + StringBuilder[] builders = null; + + final Set Ts = new LinkedHashSet(Arrays.asList("t", "t+", + "t-", "tø", "apdx-t", "ttbc")); + public EnTranslationToTranslationParser(final IndexBuilder[] indexBuilders, - final Pattern[] namePatterns) { + final Pattern[] langCodePatterns) { this.indexBuilders = indexBuilders; - this.namePatterns = namePatterns; + this.langCodePatterns = langCodePatterns; } @Override void removeUselessArgs(Map namedArgs) { namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); } - + @Override void parseSection(String heading, String text) { if (EnParser.isIgnorableTitle(title)) { @@ -48,133 +59,69 @@ public final class EnTranslationToTranslationParser extends AbstractWiktionaryPa } final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - final String headerName = wikiTokenizer.headingWikiText(); - if (headerName.equals("Translations")) { - //doTranslations(wikiTokenizer); - } - } else { - // TODO: optimization: skip to next heading, or even skip to translations. - } - } - } -/* - private void doTranslations(final WikiTokenizer wikiTokenizer) { - String topLevelLang = null; - boolean done = false; - StringBuilder[] builders; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - wikiTokenizer.returnToLineStart(); - return; - } - if (done) { - continue; - } - - // Check whether we care about this line: if (wikiTokenizer.isFunction()) { - final String functionName = wikiTokenizer.functionName(); - final List positionArgs = wikiTokenizer.functionPositionArgs(); - - if (functionName.equals("trans-top")) { - if (wikiTokenizer.functionPositionArgs().size() >= 1) { - builders = new StringBuilder[] {new StringBuilder(), new StringBuilder()}; - } - } else if (functionName.equals("trans-bottom")) { - builders = null; - } else if (functionName.equals("trans-mid")) { - } else if (functionName.equals("trans-see")) { - } else if (functionName.startsWith("picdic")) { - } else if (functionName.startsWith("checktrans")) { - done = true; - } else if (functionName.startsWith("ttbc")) { - wikiTokenizer.nextLine(); - // TODO: would be great to handle ttbc - // TODO: Check this: done = true; - } else { - LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isListItem()) { - final String line = wikiTokenizer.listItemWikiText(); - // This line could produce an output... - - // First strip the language and check whether it matches. - // And hold onto it for sub-lines. - final int colonIndex = line.indexOf(":"); - if (colonIndex == -1) { - continue; - } - - final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); - incrementCount("tCount:" + lang); - - - final boolean appendLang; - if (wikiTokenizer.listItemPrefix().length() == 1) { - topLevelLang = lang; - final boolean thisFind = langPattern.matcher(lang).find(); - if (!thisFind) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } else if (topLevelLang == null) { - continue; - } else { - // Two-level -- the only way we won't append is if this second level matches exactly. - if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } - - String rest = line.substring(colonIndex + 1).trim(); - if (rest.length() > 0) { - doTranslationLine(line, appendLang ? lang : null, rest); + final String name = wikiTokenizer.functionName(); + if (Ts.contains(name)) { + onT(wikiTokenizer); + } else if (name.equals("trans-top")) { + startEntry(title, wikiTokenizer.token()); + } else if (name.equals("trans-bottom")) { + finishEntry(title); } } } } - private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) { - state = State.TRANSLATION_LINE; - // Good chance we'll actually file this one... - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - final StringBuilder foreignText = new StringBuilder(); - appendAndIndexWikiCallback.reset(foreignText, indexedEntry); - appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - - if (foreignText.length() == 0) { - LOG.warning("Empty foreignText: " + line); - incrementCount("WARNING: Empty foreignText" ); - return; + final TranslationCallback translationCallback = new TranslationCallback(); + + final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexWikiCallback( + this); + { + for (final String t : Ts) { + appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback); + } + } + + private void onT(WikiTokenizer wikiTokenizer) { + final List args = wikiTokenizer.functionPositionArgs(); + final String langCode = ListUtil.get(args, 0); + for (int p = 0; p < 2; ++p) { + if (langCodePatterns[p].matcher(langCode).matches()) { + appendAndIndexWikiCallback.builder = builders[p]; + appendAndIndexWikiCallback.indexBuilder = indexBuilders[p]; + appendAndIndexWikiCallback.onFunction(wikiTokenizer, + wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(), + wikiTokenizer.functionNamedArgs()); } - - if (lang != null) { - foreignText.insert(0, String.format("(%s) ", lang)); + } + } + + void startEntry(final String title, final String func) { + if (pairEntry != null) { + LOG.warning("startEntry() twice" + func); + finishEntry(title); } - StringBuilder englishText = new StringBuilder(); - - englishText.append(title); - if (sense != null) { - englishText.append(" (").append(sense).append(")"); - enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); + pairEntry = new PairEntry(entrySource); + indexedEntry = new IndexedEntry(pairEntry); + builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() }; + } + + void finishEntry(final String title) { + if (pairEntry == null) { + LOG.warning("finalizeEntry() twice" + title); + return; } - if (pos != null) { - englishText.append(" (").append(pos.toLowerCase()).append(")"); + final String lang1 = builders[0].toString(); + final String lang2 = builders[1].toString(); + if (lang1.length() > 0 && lang2.length() > 0) { + pairEntry.pairs.add(new Pair(lang1, lang2)); + indexedEntry.isValid = true; } - enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); - final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); - pairEntry.pairs.add(pair); - if (!pairsAdded.add(pair.toString())) { - LOG.warning("Duplicate pair: " + pair.toString()); - incrementCount("WARNING: Duplicate pair" ); - } + pairEntry = null; + indexedEntry = null; + builders = null; } - */ } \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/SimpleSingleWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/SimpleSingleWiktionaryParser.java new file mode 100644 index 0000000..d82b276 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/wiktionary/SimpleSingleWiktionaryParser.java @@ -0,0 +1,7 @@ +package com.hughes.android.dictionary.parser.wiktionary; + +public class SimpleSingleWiktionaryParser { + + // Just does everything about a word, minus translations. + +}