From: Thad Hughes Date: Mon, 2 Jan 2012 18:00:35 +0000 (-0800) Subject: Major refactor in the way wikiText is parsed. X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=794c2989d4ff4c456c9aa1066150c6d51a5aae84 Major refactor in the way wikiText is parsed. --- diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 7cdb313..c0921c3 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -32,6 +32,27 @@ public class DictionaryBuilderTest extends TestCase { public static final String GOLDENS = "testdata/goldens/"; public static final String TEST_OUTPUTS = "testdata/outputs/"; + + public void testWiktionary_IT_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt", + "EN.data", "enwiktionary.english", "Italian", "it"); + } + + public void testWiktionary_ZH_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt", + // These missing "e" prevents a complete match, forcing the name to be printed + "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantones", "zh"); + } + + public void testWiktionary_DE_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt", + "EN.data", "enwiktionary.english", "German", "it"); + } + + public void testWiktionary_IT_IT() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt", + "IT.data", "enwiktionary.italian", "Italian", "it"); + } // French public void testWiktionary_FR_FR() throws Exception { @@ -53,34 +74,12 @@ public class DictionaryBuilderTest extends TestCase { "ZH.data", "enwiktionary.chinese", "Chinese|Mandarin|Cantones", "zh"); } - public void testWiktionary_ZH_EN() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt", - // These missing "e" prevents a complete match, forcing the name to be printed - "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantones", "zh"); - } - // German public void testWiktionary_DE_DE() throws Exception { wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt", "DE.data", "enwiktionary.german", "German", "it"); } - public void testWiktionary_DE_EN() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt", - "EN.data", "enwiktionary.english", "German", "it"); - } - - // Italian - public void testWiktionary_IT_IT() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt", - "IT.data", "enwiktionary.italian", "Italian", "it"); - } - - public void testWiktionary_IT_EN() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt", - "EN.data", "enwiktionary.english", "Italian", "it"); - } - public void wiktionaryTestWithLangToEn(final String name, final String lang1, final String stoplist, final String data, final String dictName, final String langPattern, final String langCode) throws Exception { diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 9e6b6c0..81de5a2 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -126,13 +126,14 @@ public class IndexBuilder { } public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, - final EntryTypeName singleTokenEntryTypeName, final EntryTypeName multiTokenEntryTypeName) { + final EntryTypeName entryTypeName) { final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); - addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? singleTokenEntryTypeName : multiTokenEntryTypeName); + addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); } - public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, final EntryTypeName entryTypeName) { - addEntryWithString(indexedEntry, untokenizedString, entryTypeName, entryTypeName); + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, entryTypeName); } } diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 6c81749..47aac94 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -24,11 +24,11 @@ import java.util.regex.Pattern; public final class WikiTokenizer { public static interface Callback { - void onPlainText(WikiTokenizer wikiTokenizer); + void onPlainText(final String text); void onMarkup(WikiTokenizer wikiTokenizer); void onWikiLink(WikiTokenizer wikiTokenizer); void onNewline(WikiTokenizer wikiTokenizer); - void onFunction(String functionName, List functionPositionArgs, + void onFunction(final WikiTokenizer tokenizer, String functionName, List functionPositionArgs, Map functionNamedArgs); void onHeading(WikiTokenizer wikiTokenizer); void onListItem(WikiTokenizer wikiTokenizer); @@ -104,27 +104,41 @@ public final class WikiTokenizer { positionArgs.clear(); namedArgs.clear(); } - - public void dispatch(final Callback callback) { - while (nextToken() != null) { - if (isPlainText()) { - callback.onPlainText(this); - } else if (isMarkup()) { - callback.onMarkup(this); - } else if (isWikiLink) { - callback.onWikiLink(this); - } else if (isNewline()) { - callback.onNewline(this); - } else if (isFunction()) { - callback.onFunction(functionName(), functionPositionArgs(), functionNamedArgs()); - } else if (isHeading()) { - callback.onHeading(this); - } else if (isListItem()) { - callback.onListItem(this); - } else if (isComment()) { - callback.onComment(this); - } else { - throw new IllegalStateException("Unknown wiki state."); + + private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile( + "\\{\\{|" + + "\\[\\[|" + + "