From bfc1f7877ce0ffe5111e92fc60b143b686ea8f4c Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Thu, 29 Dec 2011 18:25:29 -0800 Subject: [PATCH] More languages, simpler splitter. --- .../dictionary/engine/WiktionarySplitter.java | 52 ++----------------- .../parser/EnWiktionaryXmlParser.java | 34 +++++++++--- .../parser/WikiFunctionCallback.java | 5 ++ todo.txt | 7 +++ 4 files changed, 42 insertions(+), 56 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index ee9b630..2628700 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -80,54 +81,9 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } if (selectors.isEmpty()) { - selectors.addAll(Arrays.asList( - new Selector("data/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"), - new Selector("data/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"), - new Selector("data/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"), - new Selector("data/inputs/enWikiSplit/HR.data", ".*[Cc]roatian.*"), - new Selector("data/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"), - new Selector("data/inputs/enWikiSplit/ZH.data", ".*[Cc]hinese.*|.*[Mm]andarin.*|.*Cantonese.*"), - new Selector("data/inputs/enWikiSplit/DA.data", ".*[Dd]anish.*"), - new Selector("data/inputs/enWikiSplit/NL.data", ".*[Dd]utch.*"), - new Selector("data/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"), - new Selector("data/inputs/enWikiSplit/FI.data", ".*[Ff]innish.*"), - new Selector("data/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"), - new Selector("data/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"), - new 
Selector("data/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"), - new Selector("data/inputs/enWikiSplit/haw.data", ".*[Hh]awaiian.*"), - new Selector("data/inputs/enWikiSplit/HE.data", ".*[Hh]ebrew.*"), - new Selector("data/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"), - new Selector("data/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"), - new Selector("data/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"), - new Selector("data/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"), - new Selector("data/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"), - new Selector("data/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"), - new Selector("data/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"), - new Selector("data/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"), - new Selector("data/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"), - new Selector("data/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"), - new Selector("data/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"), - new Selector("data/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"), - new Selector("data/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"), - new Selector("data/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"), - new Selector("data/inputs/enWikiSplit/PL.data", ".*[Pp]olish.*"), - new Selector("data/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"), - new Selector("data/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"), - new Selector("data/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"), - new Selector("data/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"), - new Selector("data/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"), - new Selector("data/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"), - new Selector("data/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"), - new Selector("data/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"), - new Selector("data/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"), - new Selector("data/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"), - new Selector("data/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"), - new 
Selector("data/inputs/enWikiSplit/UK.data", ".*[Uu]krainian.*"), - new Selector("data/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"), - new Selector("data/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"), - new Selector("data/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"), - new Selector("data/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*") - )); + for (final Map.Entry entry : Language.isoCodeToWikiName.entrySet()) { + selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue())); + } } // Do it. diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index e3c7d2a..b9dbc7d 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -254,7 +254,7 @@ public class EnWiktionaryXmlParser { } else if (wikiTokenizer.remainderStartsWith("''See''")) { wikiTokenizer.nextLine(); - LOG.fine("Skipping line: " + wikiTokenizer.token()); + LOG.fine("Skipping See line: " + wikiTokenizer.token()); } else if (wikiTokenizer.isWikiLink()) { final String wikiLink = wikiTokenizer.wikiLinkText(); if (wikiLink.contains(":") && wikiLink.contains(title)) { @@ -287,24 +287,28 @@ public class EnWiktionaryXmlParser { } - static final class Callback implements WikiTokenizer.Callback { - public Callback(IndexedEntry indexedEntry, IndexBuilder defaultIndexBuilder, - StringBuilder builder, Map functionCallbacks) { + static final class AppendAndIndexCallback implements WikiTokenizer.Callback { + public AppendAndIndexCallback( + final StringBuilder builder, + final IndexedEntry indexedEntry, + final IndexBuilder defaultIndexBuilder, + final Map functionCallbacks) { this.indexedEntry = indexedEntry; this.defaultIndexBuilder = defaultIndexBuilder; this.builder = builder; this.functionCallbacks = functionCallbacks; } - final IndexedEntry indexedEntry; - final 
IndexBuilder defaultIndexBuilder; final StringBuilder builder; + final IndexedEntry indexedEntry; + IndexBuilder defaultIndexBuilder; final Map functionCallbacks; // TODO: the classes of text are wrong.... @Override public void onPlainText(WikiTokenizer wikiTokenizer) { + // The only non-recursive callback. Just appends to the builder, and adds the plain text to the default index. final String plainText = wikiTokenizer.token(); builder.append(plainText); defaultIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); @@ -319,8 +323,22 @@ public class EnWiktionaryXmlParser { } @Override - public void onFunction(String functionName, - List functionPositionArgs, Map functionNamedArgs) { + public void onFunction(final String name, + final List args, final Map namedArgs) { + final WikiFunctionCallback functionCallback = functionCallbacks.get(name); + if (functionCallback != null) { + // Dispatch the handling elsewhere. + functionCallback.onWikiFunction(name, args, namedArgs); + } else { + // Default function handling: + for (int i = 0; i < args.size(); ++i) { + args.set(i, WikiTokenizer.toPlainText(args.get(i))); + } + for (final Map.Entry entry : namedArgs.entrySet()) { + entry.setValue(WikiTokenizer.toPlainText(entry.getValue())); + } + WikiTokenizer.appendFunction(builder, name, args, namedArgs); + } } @Override diff --git a/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java b/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java index 6043323..5ae62d4 100644 --- a/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java +++ b/src/com/hughes/android/dictionary/parser/WikiFunctionCallback.java @@ -1,5 +1,10 @@ package com.hughes.android.dictionary.parser; +import java.util.List; +import java.util.Map; + public interface WikiFunctionCallback { + + void onWikiFunction(final String name, final List args, final Map namedArgs); } diff --git a/todo.txt b/todo.txt index 465de4f..7adfe5c 100644 --- a/todo.txt +++ b/todo.txt @@ 
-1,4 +1,11 @@ +For next release: +refactor wiki parsing. random word jump +multiword find. +dictionary update. +{{Arab}} +???italian verbs + pronunciation synonyms move dict to top of list when downloaded -- 2.43.0