From e479ba38bbcb261951399326623c20ffacc147d4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Reimar=20D=C3=B6ffinger?= Date: Tue, 8 Nov 2016 23:28:19 +0100 Subject: [PATCH] Apply astyle code formatting. --- .../android/dictionary/DateFormatTest.java | 12 +- .../dictionary/SerializeCollatorTest.java | 18 +- .../engine/CheckDictionariesMain.java | 124 +- .../dictionary/engine/DictionaryBuilder.java | 368 +-- .../engine/DictionaryBuilderMain.java | 602 ++--- .../engine/DictionaryBuilderTest.java | 662 ++--- .../dictionary/engine/DictionaryTest.java | 634 ++--- .../dictionary/engine/IndexBuilder.java | 263 +- .../dictionary/engine/IndexedEntry.java | 12 +- .../dictionary/engine/LanguageTest.java | 322 +-- .../dictionary/engine/WiktionarySplitter.java | 493 ++-- .../dictionary/parser/DictFileParser.java | 470 ++-- .../android/dictionary/parser/Parser.java | 4 +- .../dictionary/parser/WikiTokenizer.java | 1216 ++++----- .../dictionary/parser/WikiTokenizerTest.java | 622 ++--- .../wiktionary/AbstractWiktionaryParser.java | 428 ++-- .../wiktionary/DeFunctionCallbacks.java | 90 +- .../parser/wiktionary/EnForeignParser.java | 552 ++-- .../wiktionary/EnFunctionCallbacks.java | 2239 +++++++++-------- .../parser/wiktionary/EnParser.java | 228 +- .../wiktionary/EnToTranslationParser.java | 366 +-- .../EnTranslationToTranslationParser.java | 196 +- .../wiktionary/FrFunctionCallbacks.java | 76 +- .../parser/wiktionary/FunctionCallback.java | 12 +- .../wiktionary/ItFunctionCallbacks.java | 24 +- .../wiktionary/WholeSectionToHtmlParser.java | 64 +- .../parser/wiktionary/WiktionaryLangs.java | 380 +-- 27 files changed, 5241 insertions(+), 5236 deletions(-) diff --git a/src/com/hughes/android/dictionary/DateFormatTest.java b/src/com/hughes/android/dictionary/DateFormatTest.java index fd2c910..fce2095 100644 --- a/src/com/hughes/android/dictionary/DateFormatTest.java +++ b/src/com/hughes/android/dictionary/DateFormatTest.java @@ -19,11 +19,11 @@ import java.util.Date; public class DateFormatTest { - /** - * @param args - */ - public static void main(String[] args) { - System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date())); - } + /** + * @param args + */ + public static void main(String[] args) { + System.out.println(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss").format(new Date())); + } } diff --git a/src/com/hughes/android/dictionary/SerializeCollatorTest.java b/src/com/hughes/android/dictionary/SerializeCollatorTest.java index bfc531e..7a1e42e 100644 --- a/src/com/hughes/android/dictionary/SerializeCollatorTest.java +++ b/src/com/hughes/android/dictionary/SerializeCollatorTest.java @@ -23,14 +23,14 @@ import java.text.Collator; public class SerializeCollatorTest { - /** - * @param args - * @throws IOException - */ - public static void main(String[] args) throws IOException { - File temp = File.createTempFile("temp", null); - final Comparator c = Language.de.getCollator(); - //FileUtil.writeObject(c, temp); - } + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + File temp = File.createTempFile("temp", null); + final Comparator c = Language.de.getCollator(); + //FileUtil.writeObject(c, temp); + } } diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java index 6ad8fb2..8be96fc 100644 --- a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java +++ b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java @@ -14,45 +14,45 @@ import java.util.Collections; import java.util.List; public class CheckDictionariesMain { - - static final String BASE_URL = "http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/"; - static final String VERSION_CODE_OLD = "v006"; - static final String VERSION_CODE = "v007"; - - public static void main(String[] args) throws IOException { - final File dictDir = new File(DictionaryBuilderMain.OUTPUTS); - - final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt")); + + static final String BASE_URL = "http://github.com/rdoeffinger/Dictionary/releases/download/v0.2-dictionaries/"; + static final String VERSION_CODE_OLD = "v006"; + static final String VERSION_CODE = "v007"; + + public static void main(String[] args) throws IOException { + final File dictDir = new File(DictionaryBuilderMain.OUTPUTS); + + final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt")); // dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tVERSION_CODE\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2"); - final File[] files = dictDir.listFiles(); - final List dictNames = new ArrayList(); - Arrays.sort(files); - for (final File dictFile : files) { - if (!dictFile.getName().endsWith("quickdic")) { - continue; - } - System.out.println(dictFile.getPath()); - - - final RandomAccessFile raf = new RandomAccessFile(dictFile, "r"); - final Dictionary dict = new Dictionary(raf); - - final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo(); - - String version_code = VERSION_CODE; - File zipFile = new File(dictFile.getPath() + "." + version_code + ".zip"); - if (!zipFile.canRead()) { - version_code = VERSION_CODE_OLD; - zipFile = new File(dictFile.getPath() + "." + version_code + ".zip"); - } - dictionaryInfo.uncompressedFilename = dictFile.getName(); - dictionaryInfo.downloadUrl = BASE_URL + dictFile.getName() + "." + version_code + ".zip"; - // TODO: zip it right here.... - dictionaryInfo.uncompressedBytes = dictFile.length(); - dictionaryInfo.zipBytes = zipFile.canRead() ? zipFile.length() : -1; - - // Print it. + final File[] files = dictDir.listFiles(); + final List dictNames = new ArrayList(); + Arrays.sort(files); + for (final File dictFile : files) { + if (!dictFile.getName().endsWith("quickdic")) { + continue; + } + System.out.println(dictFile.getPath()); + + + final RandomAccessFile raf = new RandomAccessFile(dictFile, "r"); + final Dictionary dict = new Dictionary(raf); + + final DictionaryInfo dictionaryInfo = dict.getDictionaryInfo(); + + String version_code = VERSION_CODE; + File zipFile = new File(dictFile.getPath() + "." + version_code + ".zip"); + if (!zipFile.canRead()) { + version_code = VERSION_CODE_OLD; + zipFile = new File(dictFile.getPath() + "." + version_code + ".zip"); + } + dictionaryInfo.uncompressedFilename = dictFile.getName(); + dictionaryInfo.downloadUrl = BASE_URL + dictFile.getName() + "." + version_code + ".zip"; + // TODO: zip it right here.... + dictionaryInfo.uncompressedBytes = dictFile.length(); + dictionaryInfo.zipBytes = zipFile.canRead() ? zipFile.length() : -1; + + // Print it. // final PrintWriter textOut = new PrintWriter(new BufferedWriter(new FileWriter(dictFile + ".text"))); // final List sorted = new ArrayList(dict.pairEntries); // Collections.sort(sorted); @@ -60,31 +60,31 @@ public class CheckDictionariesMain { // textOut.println(pairEntry.getRawText(false)); // } // textOut.close(); - - // Find the stats. - System.out.println("Stats..."); - final List indexNames = new ArrayList(); - for (final IndexInfo indexInfo : dictionaryInfo.indexInfos) { - indexNames.add(indexInfo.shortName); - } - dictNames.add(CollectionUtil.join(indexNames, "-") + "\n"); - final String row = dictionaryInfo.append(new StringBuilder()).toString(); - if (!zipFile.canRead()) { - System.err.println("Couldn't read zipfile: " + zipFile); - } - System.out.println(row + "\n"); - - - dictionaryInfoOut.println(row); - dictionaryInfoOut.flush(); - - raf.close(); + + // Find the stats. + System.out.println("Stats..."); + final List indexNames = new ArrayList(); + for (final IndexInfo indexInfo : dictionaryInfo.indexInfos) { + indexNames.add(indexInfo.shortName); + } + dictNames.add(CollectionUtil.join(indexNames, "-") + "\n"); + final String row = dictionaryInfo.append(new StringBuilder()).toString(); + if (!zipFile.canRead()) { + System.err.println("Couldn't read zipfile: " + zipFile); + } + System.out.println(row + "\n"); + + + dictionaryInfoOut.println(row); + dictionaryInfoOut.flush(); + + raf.close(); + } + + Collections.sort(dictNames); + System.out.println(dictNames.toString().replace(",", " *")); + + dictionaryInfoOut.close(); } - - Collections.sort(dictNames); - System.out.println(dictNames.toString().replace(",", " *")); - - dictionaryInfoOut.close(); - } } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 624ade9..d105af2 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -40,198 +40,198 @@ import com.hughes.util.Args; import com.hughes.util.FileUtil; public class DictionaryBuilder { - - public final Dictionary dictionary; - public final List indexBuilders = new ArrayList(); - - public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set lang1Stoplist, final Set lang2Stoplist) { - dictionary = new Dictionary(dictInfoString); - if (lang1 != null) { - indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false)); - indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true)); - } else { - indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false)); - } - } - - void build() { - for (final IndexBuilder indexBuilder : indexBuilders) { - indexBuilder.build(); - dictionary.indices.add(indexBuilder.index); - } - } - - public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException { - System.out.println("Running with arguments:"); - for (final String arg : args) { - System.out.println(arg); - } - - final Map keyValueArgs = Args.keyValueArgs(args); - - if (!keyValueArgs.containsKey("lang1")) { - fatalError("--lang1= must be specified."); - } - final Language lang1 = Language.lookup(keyValueArgs.remove("lang1")); - final Language lang2; - if (keyValueArgs.containsKey("lang2")) { - lang2 = Language.lookup(keyValueArgs.remove("lang2")); - } else { - lang2 = null; - } - final Set lang1Stoplist = new LinkedHashSet(); - final Set lang2Stoplist = new LinkedHashSet(); - final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist"); - final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist"); - if (lang1StoplistFile != null) { - lang1Stoplist.addAll(FileUtil.readLines(new File(lang1StoplistFile))); - } - if (lang2StoplistFile != null) { - lang2Stoplist.addAll(FileUtil.readLines(new File(lang2StoplistFile))); - } + public final Dictionary dictionary; + public final List indexBuilders = new ArrayList(); - String normalizerRules1 = keyValueArgs.remove("normalizerRules1"); - String normalizerRules2 = keyValueArgs.remove("normalizerRules2"); - if (normalizerRules1 == null) { - normalizerRules1 = lang1.getDefaultNormalizerRules(); - } - if (normalizerRules2 == null) { - normalizerRules2 = lang2 == null ? null : lang2.getDefaultNormalizerRules(); - } - - final String dictOutFilename = keyValueArgs.remove("dictOut"); - if (dictOutFilename == null) { - fatalError("--dictOut= must be specified."); - } - - String dictInfo = keyValueArgs.remove("dictInfo"); - if (dictInfo == null) { - fatalError("--dictInfo= must be specified."); - } - if (dictInfo.startsWith("@")) { - dictInfo = FileUtil.readToString(new File(dictInfo.substring(1))); + public DictionaryBuilder(final String dictInfoString, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set lang1Stoplist, final Set lang2Stoplist) { + dictionary = new Dictionary(dictInfoString); + if (lang1 != null) { + indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false)); + indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true)); + } else { + indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false)); + } } - - final String printFile = keyValueArgs.remove("print"); - - System.out.println("lang1=" + lang1); - System.out.println("lang2=" + lang2); - System.out.println("normalizerRules1=" + normalizerRules1); - System.out.println("normalizerRules2=" + normalizerRules2); - System.out.println("dictInfo=" + dictInfo); - System.out.println("dictOut=" + dictOutFilename); - - final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2, lang1Stoplist, lang2Stoplist); - - for (int i = 0; i < 100; ++i) { - final String prefix = "input" + i; - if (keyValueArgs.containsKey(prefix)) { - final File file = new File(keyValueArgs.remove(prefix)); - System.out.println("Processing: " + file); - String charsetName = keyValueArgs.remove(prefix + "Charset"); - if (charsetName == null) { - charsetName = "UTF8"; + + void build() { + for (final IndexBuilder indexBuilder : indexBuilders) { + indexBuilder.build(); + dictionary.indices.add(indexBuilder.index); } - final Charset charset = Charset.forName(charsetName); - String inputName = keyValueArgs.remove(prefix + "Name"); - if (inputName == null) { - fatalError("Must specify human readable name for: " + prefix + "Name"); + } + + public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException { + System.out.println("Running with arguments:"); + for (final String arg : args) { + System.out.println(arg); } - String pageLimitString = keyValueArgs.remove(prefix + "PageLimit"); - if (pageLimitString == null) { - pageLimitString = "-1"; + + final Map keyValueArgs = Args.keyValueArgs(args); + + if (!keyValueArgs.containsKey("lang1")) { + fatalError("--lang1= must be specified."); } - final int pageLimit = Integer.parseInt(pageLimitString); - - final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0); - System.out.println(""); - - String inputFormat = keyValueArgs.remove(prefix + "Format"); - if ("tab_separated".equals(inputFormat)) { - final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); - new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit); - } else if ("chemnitz".equals(inputFormat)) { - final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); - new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit); - } else if ("enwiktionary".equals(inputFormat)) { - final String type = keyValueArgs.remove(prefix + "WiktionaryType"); - final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE); - final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern")); - final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1; - - if (enIndex < 0 || enIndex >= 2) { - fatalError("Must be 1 or 2: " + prefix + "EnIndex"); - } - final Parser parser; - if ("EnToTranslation".equals(type)) { - parser = new EnToTranslationParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), - langPattern, langCodePattern, enIndex != 0); - } else if ("EnForeign".equals(type)) { - parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), - langPattern, langCodePattern, enIndex != 0); - } else if ("EnEnglish".equals(type)) { - parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(enIndex), - langPattern, langCodePattern, true); - } else { - fatalError("Invalid WiktionaryType (use EnToTranslation or EnForeign or EnEnglish): " + type); - return; - } - parser.parse(file, entrySource, pageLimit); - } else if (EnTranslationToTranslationParser.NAME.equals(inputFormat)) { - final String code1 = keyValueArgs.remove(prefix + "LangPattern1"); - final String code2 = keyValueArgs.remove(prefix + "LangPattern2"); - if (code1 == null || code2 == null) { - fatalError("Must specify LangPattern1 and LangPattern2."); - return; - } - final Pattern codePattern1 = Pattern.compile(code1, Pattern.CASE_INSENSITIVE); - final Pattern codePattern2 = Pattern.compile(code2, Pattern.CASE_INSENSITIVE); - new EnTranslationToTranslationParser(dictionaryBuilder.indexBuilders, new Pattern[] {codePattern1, codePattern2}).parse(file, entrySource, pageLimit); - } else if (WholeSectionToHtmlParser.NAME.equals(inputFormat)) { - final int titleIndex = Integer.parseInt(keyValueArgs.remove(prefix + "TitleIndex")) - 1; - final String wiktionaryLang = keyValueArgs.remove(prefix + "WiktionaryLang"); - final String webUrlTemplate = keyValueArgs.remove(prefix + "WebUrlTemplate"); - String skipLang = keyValueArgs.remove(prefix + "SkipLang"); - if (skipLang == null) skipLang = ""; - new WholeSectionToHtmlParser(dictionaryBuilder.indexBuilders.get(titleIndex), null, wiktionaryLang, skipLang, webUrlTemplate).parse(file, entrySource, pageLimit); + final Language lang1 = Language.lookup(keyValueArgs.remove("lang1")); + final Language lang2; + if (keyValueArgs.containsKey("lang2")) { + lang2 = Language.lookup(keyValueArgs.remove("lang2")); } else { - fatalError("Invalid or missing input format: " + inputFormat); + lang2 = null; } - - dictionaryBuilder.dictionary.sources.add(entrySource); - System.out.println("Done: " + file + "\n\n"); - } - } - - dictionaryBuilder.build(); - // Drop indexBuilders to free RAM - dictionaryBuilder.indexBuilders.clear(); - - if (printFile != null) { - final PrintStream out = new PrintStream(new File(printFile)); - dictionaryBuilder.dictionary.print(out); - out.close(); + + final Set lang1Stoplist = new LinkedHashSet(); + final Set lang2Stoplist = new LinkedHashSet(); + final String lang1StoplistFile = keyValueArgs.remove("lang1Stoplist"); + final String lang2StoplistFile = keyValueArgs.remove("lang2Stoplist"); + if (lang1StoplistFile != null) { + lang1Stoplist.addAll(FileUtil.readLines(new File(lang1StoplistFile))); + } + if (lang2StoplistFile != null) { + lang2Stoplist.addAll(FileUtil.readLines(new File(lang2StoplistFile))); + } + + String normalizerRules1 = keyValueArgs.remove("normalizerRules1"); + String normalizerRules2 = keyValueArgs.remove("normalizerRules2"); + if (normalizerRules1 == null) { + normalizerRules1 = lang1.getDefaultNormalizerRules(); + } + if (normalizerRules2 == null) { + normalizerRules2 = lang2 == null ? null : lang2.getDefaultNormalizerRules(); + } + + final String dictOutFilename = keyValueArgs.remove("dictOut"); + if (dictOutFilename == null) { + fatalError("--dictOut= must be specified."); + } + + String dictInfo = keyValueArgs.remove("dictInfo"); + if (dictInfo == null) { + fatalError("--dictInfo= must be specified."); + } + if (dictInfo.startsWith("@")) { + dictInfo = FileUtil.readToString(new File(dictInfo.substring(1))); + } + + final String printFile = keyValueArgs.remove("print"); + + System.out.println("lang1=" + lang1); + System.out.println("lang2=" + lang2); + System.out.println("normalizerRules1=" + normalizerRules1); + System.out.println("normalizerRules2=" + normalizerRules2); + System.out.println("dictInfo=" + dictInfo); + System.out.println("dictOut=" + dictOutFilename); + + final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2, lang1Stoplist, lang2Stoplist); + + for (int i = 0; i < 100; ++i) { + final String prefix = "input" + i; + if (keyValueArgs.containsKey(prefix)) { + final File file = new File(keyValueArgs.remove(prefix)); + System.out.println("Processing: " + file); + String charsetName = keyValueArgs.remove(prefix + "Charset"); + if (charsetName == null) { + charsetName = "UTF8"; + } + final Charset charset = Charset.forName(charsetName); + String inputName = keyValueArgs.remove(prefix + "Name"); + if (inputName == null) { + fatalError("Must specify human readable name for: " + prefix + "Name"); + } + String pageLimitString = keyValueArgs.remove(prefix + "PageLimit"); + if (pageLimitString == null) { + pageLimitString = "-1"; + } + final int pageLimit = Integer.parseInt(pageLimitString); + + final EntrySource entrySource = new EntrySource(dictionaryBuilder.dictionary.sources.size(), inputName, 0); + System.out.println(""); + + String inputFormat = keyValueArgs.remove(prefix + "Format"); + if ("tab_separated".equals(inputFormat)) { + final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); + new DictFileParser(charset, flipColumns, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit); + } else if ("chemnitz".equals(inputFormat)) { + final boolean flipColumns = "true".equals(keyValueArgs.remove(prefix + "FlipColumns")); + new DictFileParser(charset, flipColumns, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parse(file, entrySource, pageLimit); + } else if ("enwiktionary".equals(inputFormat)) { + final String type = keyValueArgs.remove(prefix + "WiktionaryType"); + final Pattern langPattern = Pattern.compile(keyValueArgs.remove(prefix + "LangPattern"), Pattern.CASE_INSENSITIVE); + final Pattern langCodePattern = Pattern.compile(keyValueArgs.remove(prefix + "LangCodePattern")); + final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1; + + if (enIndex < 0 || enIndex >= 2) { + fatalError("Must be 1 or 2: " + prefix + "EnIndex"); + } + final Parser parser; + if ("EnToTranslation".equals(type)) { + parser = new EnToTranslationParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), + langPattern, langCodePattern, enIndex != 0); + } else if ("EnForeign".equals(type)) { + parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), + langPattern, langCodePattern, enIndex != 0); + } else if ("EnEnglish".equals(type)) { + parser = new EnForeignParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(enIndex), + langPattern, langCodePattern, true); + } else { + fatalError("Invalid WiktionaryType (use EnToTranslation or EnForeign or EnEnglish): " + type); + return; + } + parser.parse(file, entrySource, pageLimit); + } else if (EnTranslationToTranslationParser.NAME.equals(inputFormat)) { + final String code1 = keyValueArgs.remove(prefix + "LangPattern1"); + final String code2 = keyValueArgs.remove(prefix + "LangPattern2"); + if (code1 == null || code2 == null) { + fatalError("Must specify LangPattern1 and LangPattern2."); + return; + } + final Pattern codePattern1 = Pattern.compile(code1, Pattern.CASE_INSENSITIVE); + final Pattern codePattern2 = Pattern.compile(code2, Pattern.CASE_INSENSITIVE); + new EnTranslationToTranslationParser(dictionaryBuilder.indexBuilders, new Pattern[] {codePattern1, codePattern2}).parse(file, entrySource, pageLimit); + } else if (WholeSectionToHtmlParser.NAME.equals(inputFormat)) { + final int titleIndex = Integer.parseInt(keyValueArgs.remove(prefix + "TitleIndex")) - 1; + final String wiktionaryLang = keyValueArgs.remove(prefix + "WiktionaryLang"); + final String webUrlTemplate = keyValueArgs.remove(prefix + "WebUrlTemplate"); + String skipLang = keyValueArgs.remove(prefix + "SkipLang"); + if (skipLang == null) skipLang = ""; + new WholeSectionToHtmlParser(dictionaryBuilder.indexBuilders.get(titleIndex), null, wiktionaryLang, skipLang, webUrlTemplate).parse(file, entrySource, pageLimit); + } else { + fatalError("Invalid or missing input format: " + inputFormat); + } + + dictionaryBuilder.dictionary.sources.add(entrySource); + System.out.println("Done: " + file + "\n\n"); + } + } + + dictionaryBuilder.build(); + // Drop indexBuilders to free RAM + dictionaryBuilder.indexBuilders.clear(); + + if (printFile != null) { + final PrintStream out = new PrintStream(new File(printFile)); + dictionaryBuilder.dictionary.print(out); + out.close(); + } + + System.out.println("Writing dictionary to: " + dictOutFilename); + final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw"); + dictOut.setLength(0); + dictionaryBuilder.dictionary.write(dictOut); + dictOut.close(); + + if (!keyValueArgs.isEmpty()) { + System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs); + System.exit(1); + } + } - - System.out.println("Writing dictionary to: " + dictOutFilename); - final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw"); - dictOut.setLength(0); - dictionaryBuilder.dictionary.write(dictOut); - dictOut.close(); - - if (!keyValueArgs.isEmpty()) { - System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs); - System.exit(1); + + private static void fatalError(String string) { + System.err.println(string); + + + System.exit(1); } - - } - - private static void fatalError(String string) { - System.err.println(string); - - - System.exit(1); - } - + } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 57e76cc..cf5fa96 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -30,320 +30,320 @@ import java.util.Map; import java.util.Set; public class DictionaryBuilderMain extends TestCase { - - static final String INPUTS = "data/inputs/"; - static final String STOPLISTS = "data/inputs/stoplists/"; - static final String OUTPUTS = "data/outputs/"; - - // Build the non EN ones. - static final String[][] nonEnPairs = new String[][] { - {"EN"}, - {"DE"}, - {"IT"}, - // This one takes a really long time, and the result is too big for code.google.com - //{"FR"}, - - // The 3 I use most: - {"IT", "EN" }, - {"DE", "EN" }, - {"DE", "IT" }, - - {"AR", "DE" }, - {"AR", "ES" }, - {"AR", "FR" }, - {"AR", "HE" }, - {"AR", "IT" }, - {"AR", "JA" }, - {"AR", "RU" }, - {"AR", "TR" }, // Turkish - {"AR", "cmn" }, - - {"DE", "AR" }, - {"DE", "FR" }, - {"DE", "CA" }, // Catalan - {"DE", "CS" }, // Czech - {"DE", "EO" }, // Esperanto - {"DE", "ES" }, - {"DE", "FR" }, - {"DE", "HE" }, - {"DE", "HU" }, // Hungarian - {"DE", "IT" }, - {"DE", "JA" }, - {"DE", "LA" }, // Latin - {"DE", "NL" }, // Dutch - {"DE", "PL" }, // Polish - {"DE", "RU" }, - {"DE", "SV" }, // Swedish - {"DE", "TR" }, // Turkish - {"DE", "cmn" }, - {"DE", "TA" }, // Tamil - - {"ES", "RU" }, // Spanish-Russian - - {"FR", "BG" }, // Bulgarian - {"FR", "CS" }, // Czech - {"FR", "DE" }, - {"FR", "ES" }, - {"FR", "IT" }, - {"FR", "JA" }, - {"FR", "LA" }, - {"FR", "NL" }, // Dutch - {"FR", "RU" }, - {"FR", "TR" }, // Turkish - {"FR", "cmn" }, - {"FR", "EL" }, - - {"IT", "DE" }, - {"IT", "EL" }, // Greek - {"IT", "ES" }, - {"IT", "FR" }, - {"IT", "HU" }, - {"IT", "JA" }, - {"IT", "LA" }, // Latin - {"IT", "LV" }, // Latvian - {"IT", "NL" }, - {"IT", "PL" }, - {"IT", "RU" }, - {"IT", "SV" }, - {"IT", "TR" }, // Turkish - {"IT", "cmn" }, - - {"JA", "cmn" }, - {"JA", "AR" }, - {"JA", "KO" }, - - {"cmn", "AR" }, - {"cmn", "DE" }, - {"cmn", "ES" }, - {"cmn", "FR" }, - {"cmn", "IT" }, - {"cmn", "KO" }, - - {"NO", "SV" }, - {"NO", "FI" }, - {"FI", "SV" }, - - {"PL", "FR" }, // Polish - {"PL", "RU" }, // Polish - {"PL", "HU" }, // Polish - {"PL", "ES" }, // Polish - - {"TR", "EL" }, // Turkish, Greek - - {"FA", "HY" }, // Persian, Armenian, by request. - {"FA", "SV" }, // Persian, Swedish, by request. - {"NL", "PL" }, // Dutch, Polish, by request. - - }; - - - - static final Map isoToDedication = new LinkedHashMap(); - static { - isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); - isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja."); - isoToDedication.put("NL", "Wiktionary-based Dutch dictionary dedicated to Mike LeBeau."); - isoToDedication.put("DE", "@data/inputs/de-en_dedication.txt"); - isoToDedication.put("EL", "Wiktionary-based Greek dictionary dedicated to Noah Egge."); - isoToDedication.put("IT", "Wiktionary-based Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!"); - isoToDedication.put("KO", "Wiktionary-based Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!"); - isoToDedication.put("PT", "Wiktionary-based Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder."); - isoToDedication.put("RO", "Wiktionary-based Romanian dictionary dedicated to Radu Teodorescu."); - isoToDedication.put("RU", "Wiktionary-based Russian dictionary dedicated to Maxim Aronin--best friend always!."); - isoToDedication.put("SR", "Wiktionary-based Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey!"); - isoToDedication.put("ES", "Wiktionary-based Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!"); - isoToDedication.put("SV", "Wiktionary-based Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!"); - } - private static String getEnDictionaryInfo(String iso) { - return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso); - } - - static final Map isoToStoplist = new LinkedHashMap(); - static { - isoToStoplist.put("DE", "de.txt"); - isoToStoplist.put("EN", "en.txt"); - isoToStoplist.put("ES", "es.txt"); - isoToStoplist.put("IT", "it.txt"); - isoToStoplist.put("FR", "fr.txt"); - } - private static String getStoplist(String iso) { - return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt"; - } - - static String getOtherLang(final String[] pair, final String first) { - assert Arrays.asList(pair).contains(first); - assert pair.length == 2; - return pair[0].equals(first) ? pair[1] : pair[0]; - } - - static List getMainArgs(final String[] pair) { - final List result = new ArrayList(); - - int i = 1; - - if (pair.length == 1) { + + static final String INPUTS = "data/inputs/"; + static final String STOPLISTS = "data/inputs/stoplists/"; + static final String OUTPUTS = "data/outputs/"; + + // Build the non EN ones. + static final String[][] nonEnPairs = new String[][] { + {"EN"}, + {"DE"}, + {"IT"}, + // This one takes a really long time, and the result is too big for code.google.com + //{"FR"}, + + // The 3 I use most: + {"IT", "EN" }, + {"DE", "EN" }, + {"DE", "IT" }, + + {"AR", "DE" }, + {"AR", "ES" }, + {"AR", "FR" }, + {"AR", "HE" }, + {"AR", "IT" }, + {"AR", "JA" }, + {"AR", "RU" }, + {"AR", "TR" }, // Turkish + {"AR", "cmn" }, + + {"DE", "AR" }, + {"DE", "FR" }, + {"DE", "CA" }, // Catalan + {"DE", "CS" }, // Czech + {"DE", "EO" }, // Esperanto + {"DE", "ES" }, + {"DE", "FR" }, + {"DE", "HE" }, + {"DE", "HU" }, // Hungarian + {"DE", "IT" }, + {"DE", "JA" }, + {"DE", "LA" }, // Latin + {"DE", "NL" }, // Dutch + {"DE", "PL" }, // Polish + {"DE", "RU" }, + {"DE", "SV" }, // Swedish + {"DE", "TR" }, // Turkish + {"DE", "cmn" }, + {"DE", "TA" }, // Tamil + + {"ES", "RU" }, // Spanish-Russian + + {"FR", "BG" }, // Bulgarian + {"FR", "CS" }, // Czech + {"FR", "DE" }, + {"FR", "ES" }, + {"FR", "IT" }, + {"FR", "JA" }, + {"FR", "LA" }, + {"FR", "NL" }, // Dutch + {"FR", "RU" }, + {"FR", "TR" }, // Turkish + {"FR", "cmn" }, + {"FR", "EL" }, + + {"IT", "DE" }, + {"IT", "EL" }, // Greek + {"IT", "ES" }, + {"IT", "FR" }, + {"IT", "HU" }, + {"IT", "JA" }, + {"IT", "LA" }, // Latin + {"IT", "LV" }, // Latvian + {"IT", "NL" }, + {"IT", "PL" }, + {"IT", "RU" }, + {"IT", "SV" }, + {"IT", "TR" }, // Turkish + {"IT", "cmn" }, + + {"JA", "cmn" }, + {"JA", "AR" }, + {"JA", "KO" }, + + {"cmn", "AR" }, + {"cmn", "DE" }, + {"cmn", "ES" }, + {"cmn", "FR" }, + {"cmn", "IT" }, + {"cmn", "KO" }, + + {"NO", "SV" }, + {"NO", "FI" }, + {"FI", "SV" }, + + {"PL", "FR" }, // Polish + {"PL", "RU" }, // Polish + {"PL", "HU" }, // Polish + {"PL", "ES" }, // Polish + + {"TR", "EL" }, // Turkish, Greek + + {"FA", "HY" }, // Persian, Armenian, by request. + {"FA", "SV" }, // Persian, Swedish, by request. + {"NL", "PL" }, // Dutch, Polish, by request. + + }; + + + + static final Map isoToDedication = new LinkedHashMap(); + static { + isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); + isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja."); + isoToDedication.put("NL", "Wiktionary-based Dutch dictionary dedicated to Mike LeBeau."); + isoToDedication.put("DE", "@data/inputs/de-en_dedication.txt"); + isoToDedication.put("EL", "Wiktionary-based Greek dictionary dedicated to Noah Egge."); + isoToDedication.put("IT", "Wiktionary-based Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!"); + isoToDedication.put("KO", "Wiktionary-based Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!"); + isoToDedication.put("PT", "Wiktionary-based Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder."); + isoToDedication.put("RO", "Wiktionary-based Romanian dictionary dedicated to Radu Teodorescu."); + isoToDedication.put("RU", "Wiktionary-based Russian dictionary dedicated to Maxim Aronin--best friend always!."); + isoToDedication.put("SR", "Wiktionary-based Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey!"); + isoToDedication.put("ES", "Wiktionary-based Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!"); + isoToDedication.put("SV", "Wiktionary-based Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!"); + } + private static String getEnDictionaryInfo(String iso) { + return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso); + } + + static final Map isoToStoplist = new LinkedHashMap(); + static { + isoToStoplist.put("DE", "de.txt"); + isoToStoplist.put("EN", "en.txt"); + isoToStoplist.put("ES", "es.txt"); + isoToStoplist.put("IT", "it.txt"); + isoToStoplist.put("FR", "fr.txt"); + } + private static String getStoplist(String iso) { + return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt"; + } + + static String getOtherLang(final String[] pair, final String first) { + assert Arrays.asList(pair).contains(first); + assert pair.length == 2; + return pair[0].equals(first) ? pair[1] : pair[0]; + } + + static List getMainArgs(final String[] pair) { + final List result = new ArrayList(); + + int i = 1; + + if (pair.length == 1) { + final String lang1 = pair[0]; + final String dictFile = String.format("%s/%s.quickdic", OUTPUTS, lang1); + result.add(String.format("--dictOut=%s", dictFile)); + result.add(String.format("--lang1=%s", lang1)); + result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1))); + result.add(String.format("--dictInfo=Wikitionary-based %s dictionary.", lang1)); + + + final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, lang1.toLowerCase(), lang1); + if (new File(wikiSplitFile).canRead()) { + result.add(String.format("--input%d=%s", i, wikiSplitFile)); + result.add(String.format("--input%dName=%s.wiktionary.org", i, lang1.toLowerCase())); + result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME)); + result.add(String.format("--input%dTitleIndex=%d", i, 1)); + result.add(String.format("--input%dWiktionaryLang=%s", i, lang1)); + result.add(String.format("--input%dSkipLang=%s", i, lang1)); + result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, lang1.toLowerCase())); + //result.add(String.format("--input%dPageLimit=100", i)); + ++i; + } else { + System.err.println("Can't read file: " + wikiSplitFile); + } + + if (lang1.equals("EN") && !lang1.equals("EN")) { + // Add a parser that tries to use the definitions. This is + // not very pretty yet. + result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, lang1)); + result.add(String.format("--input%dName=ENWiktionary.%s", i, lang1)) ; + result.add(String.format("--input%dFormat=enwiktionary", i)); + result.add(String.format("--input%dWiktionaryType=EnEnglish", i)); + result.add(String.format("--input%dLangPattern=%s", i, "English")); + result.add(String.format("--input%dLangCodePattern=%s", i, lang1.toLowerCase())); + result.add(String.format("--input%dEnIndex=%d", i, 1)); + //result.add(String.format("--input%dPageLimit=100", i)); + ++i; + } + + return result; + } // Single-lang dictionaries. + final String lang1 = pair[0]; - final String dictFile = String.format("%s/%s.quickdic", OUTPUTS, lang1); + final String lang2 = pair[1]; + + final String dictFile = String.format("%s/%s-%s.quickdic", + OUTPUTS, lang1, lang2); + result.add(String.format("--dictOut=%s", dictFile)); - result.add(String.format("--lang1=%s", lang1)); result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1))); - result.add(String.format("--dictInfo=Wikitionary-based %s dictionary.", lang1)); + result.add(String.format("--lang2Stoplist=%s", STOPLISTS + getStoplist(lang2))); - - final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, lang1.toLowerCase(), lang1); - if (new File(wikiSplitFile).canRead()) { + // For a few langs, put the defs of the other language in DE/IT/FR using WholeSection. + for (final String wikitionaryLang : Arrays.asList("EN", "DE", "IT", "FR")) { + if (!Arrays.asList(pair).contains(wikitionaryLang)) { + continue; + } + final String foreignIso = getOtherLang(pair, wikitionaryLang); + final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, wikitionaryLang.toLowerCase(), foreignIso); + if (!new File(wikiSplitFile).canRead()) { + System.err.println("WARNING: Can't read file: " + wikiSplitFile); + continue; + } result.add(String.format("--input%d=%s", i, wikiSplitFile)); - result.add(String.format("--input%dName=%s.wiktionary.org", i, lang1.toLowerCase())); + result.add(String.format("--input%dName=%s.wiktionary.org", i, wikitionaryLang.toLowerCase())); result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME)); - result.add(String.format("--input%dTitleIndex=%d", i, 1)); - result.add(String.format("--input%dWiktionaryLang=%s", i, lang1)); - result.add(String.format("--input%dSkipLang=%s", i, lang1)); - result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, lang1.toLowerCase())); - //result.add(String.format("--input%dPageLimit=100", i)); + result.add(String.format("--input%dTitleIndex=%d", i, Arrays.asList(pair).indexOf(foreignIso) + 1)); + result.add(String.format("--input%dWiktionaryLang=%s", i, wikitionaryLang)); + result.add(String.format("--input%dSkipLang=%s", i, foreignIso)); + result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, wikitionaryLang.toLowerCase())); ++i; - } else { - System.err.println("Can't read file: " + wikiSplitFile); } - - if (lang1.equals("EN") && !lang1.equals("EN")) { - // Add a parser that tries to use the definitions. This is - // not very pretty yet. - result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, lang1)); - result.add(String.format("--input%dName=ENWiktionary.%s", i, lang1)) ; + + // Deal with the pairs where one is English. + if (Arrays.asList(pair).contains("EN")) { + final String foreignIso = getOtherLang(pair, "EN"); + String foreignRegex = WiktionaryLangs.isoCodeToEnWikiName.get(foreignIso); + + result.add(String.format("--lang1=%s", lang1)); + result.add(String.format("--lang2=%s", lang2)); + result.add(String.format("--dictInfo=%s", getEnDictionaryInfo(foreignIso))); + + // Foreign section. + result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso)); + result.add(String.format("--input%dName=ENWiktionary.%s", i, foreignIso)) ; result.add(String.format("--input%dFormat=enwiktionary", i)); - result.add(String.format("--input%dWiktionaryType=EnEnglish", i)); - result.add(String.format("--input%dLangPattern=%s", i, "English")); - result.add(String.format("--input%dLangCodePattern=%s", i, lang1.toLowerCase())); - result.add(String.format("--input%dEnIndex=%d", i, 1)); - //result.add(String.format("--input%dPageLimit=100", i)); + result.add(String.format("--input%dWiktionaryType=EnForeign", i)); + result.add(String.format("--input%dLangPattern=%s", i, foreignRegex)); + result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase())); + result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1)); + ++i; + + // Translation section. + result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS)); + result.add(String.format("--input%dName=enwiktionary.english", i)); + result.add(String.format("--input%dFormat=enwiktionary", i)); + result.add(String.format("--input%dWiktionaryType=EnToTranslation", i)); + result.add(String.format("--input%dLangPattern=%s", i, foreignRegex)); + result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase())); + result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1)); + ++i; + + if (foreignIso.equals("DE")) { + result.add(String.format("--input%d=%sde-en_chemnitz.txt", i, INPUTS)); + result.add(String.format("--input%dName=chemnitz", i)); + result.add(String.format("--input%dCharset=UTF8", i)); + result.add(String.format("--input%dFormat=chemnitz", i)); + ++i; + } + + } else { + // Pairs without English. + result.add(String.format("--lang1=%s", lang1)); + result.add(String.format("--lang2=%s", lang2)); + result.add(String.format("--dictInfo=Wikitionary-based %s-%s dictionary.", lang1, lang2)); + + result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS)); + result.add(String.format("--input%dName=BETA!enwiktionary.%s-%s", i, lang1, lang2)); + result.add(String.format("--input%dFormat=%s", i, EnTranslationToTranslationParser.NAME)); + result.add(String.format("--input%dLangPattern1=%s", i, lang1)); + result.add(String.format("--input%dLangPattern2=%s", i, lang2)); ++i; + + // TODO: Could use FR translation section here too. } - + return result; - } // Single-lang dictionaries. - - final String lang1 = pair[0]; - final String lang2 = pair[1]; - - final String dictFile = String.format("%s/%s-%s.quickdic", - OUTPUTS, lang1, lang2); - - result.add(String.format("--dictOut=%s", dictFile)); - result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1))); - result.add(String.format("--lang2Stoplist=%s", STOPLISTS + getStoplist(lang2))); - - // For a few langs, put the defs of the other language in DE/IT/FR using WholeSection. - for (final String wikitionaryLang : Arrays.asList("EN", "DE", "IT", "FR")) { - if (!Arrays.asList(pair).contains(wikitionaryLang)) { - continue; + } + + public static void main(final String[] args) throws Exception { + + final List allPairs = new ArrayList(); + + allPairs.addAll(Arrays.asList(nonEnPairs)); + // Add all the EN-XX pairs. + for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { + if (!isoCode.equals("EN")) { + allPairs.add(new String[] {"EN", isoCode}); + } } - final String foreignIso = getOtherLang(pair, wikitionaryLang); - final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, wikitionaryLang.toLowerCase(), foreignIso); - if (!new File(wikiSplitFile).canRead()) { - System.err.println("WARNING: Can't read file: " + wikiSplitFile); - continue; + + + final Set> done = new LinkedHashSet>(); + boolean go = true; + for (final String[] pair : allPairs) { + Arrays.sort(pair); + final List pairList = Arrays.asList(pair); + if (done.contains(pairList)) { + continue; + } + done.add(pairList); + + if (pairList.contains("EN") && pairList.contains("DE")) { + go = true; + } else { + go = false; + } + + if (!go) { + continue; + } + + DictionaryBuilder.main(getMainArgs(pair).toArray(new String[0])); } - result.add(String.format("--input%d=%s", i, wikiSplitFile)); - result.add(String.format("--input%dName=%s.wiktionary.org", i, wikitionaryLang.toLowerCase())); - result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME)); - result.add(String.format("--input%dTitleIndex=%d", i, Arrays.asList(pair).indexOf(foreignIso) + 1)); - result.add(String.format("--input%dWiktionaryLang=%s", i, wikitionaryLang)); - result.add(String.format("--input%dSkipLang=%s", i, foreignIso)); - result.add(String.format("--input%dWebUrlTemplate=http://%s.wiktionary.org/wiki/%%s", i, wikitionaryLang.toLowerCase())); - ++i; - } - - // Deal with the pairs where one is English. - if (Arrays.asList(pair).contains("EN")) { - final String foreignIso = getOtherLang(pair, "EN"); - String foreignRegex = WiktionaryLangs.isoCodeToEnWikiName.get(foreignIso); - - result.add(String.format("--lang1=%s", lang1)); - result.add(String.format("--lang2=%s", lang2)); - result.add(String.format("--dictInfo=%s", getEnDictionaryInfo(foreignIso))); - - // Foreign section. - result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso)); - result.add(String.format("--input%dName=ENWiktionary.%s", i, foreignIso)) ; - result.add(String.format("--input%dFormat=enwiktionary", i)); - result.add(String.format("--input%dWiktionaryType=EnForeign", i)); - result.add(String.format("--input%dLangPattern=%s", i, foreignRegex)); - result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase())); - result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1)); - ++i; - - // Translation section. - result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS)); - result.add(String.format("--input%dName=enwiktionary.english", i)); - result.add(String.format("--input%dFormat=enwiktionary", i)); - result.add(String.format("--input%dWiktionaryType=EnToTranslation", i)); - result.add(String.format("--input%dLangPattern=%s", i, foreignRegex)); - result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase())); - result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1)); - ++i; - - if (foreignIso.equals("DE")) { - result.add(String.format("--input%d=%sde-en_chemnitz.txt", i, INPUTS)); - result.add(String.format("--input%dName=chemnitz", i)); - result.add(String.format("--input%dCharset=UTF8", i)); - result.add(String.format("--input%dFormat=chemnitz", i)); - ++i; - } - - } else { - // Pairs without English. - result.add(String.format("--lang1=%s", lang1)); - result.add(String.format("--lang2=%s", lang2)); - result.add(String.format("--dictInfo=Wikitionary-based %s-%s dictionary.", lang1, lang2)); - - result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS)); - result.add(String.format("--input%dName=BETA!enwiktionary.%s-%s", i, lang1, lang2)); - result.add(String.format("--input%dFormat=%s", i, EnTranslationToTranslationParser.NAME)); - result.add(String.format("--input%dLangPattern1=%s", i, lang1)); - result.add(String.format("--input%dLangPattern2=%s", i, lang2)); - ++i; - - // TODO: Could use FR translation section here too. - } - - return result; - } - - public static void main(final String[] args) throws Exception { - - final List allPairs = new ArrayList(); - - allPairs.addAll(Arrays.asList(nonEnPairs)); - // Add all the EN-XX pairs. - for (final String isoCode : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { - if (!isoCode.equals("EN")) { - allPairs.add(new String[] {"EN", isoCode}); - } - } - - - final Set> done = new LinkedHashSet>(); - boolean go = true; - for (final String[] pair : allPairs) { - Arrays.sort(pair); - final List pairList = Arrays.asList(pair); - if (done.contains(pairList)) { - continue; - } - done.add(pairList); - - if (pairList.contains("EN") && pairList.contains("DE")) { - go = true; - } else { - go = false; - } - - if (!go) { - continue; - } - - DictionaryBuilder.main(getMainArgs(pair).toArray(new String[0])); + } - - } } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 417df82..7669414 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -28,343 +28,343 @@ import com.hughes.util.FileUtil; import junit.framework.TestCase; public class DictionaryBuilderTest extends TestCase { - - public static final String TEST_INPUTS = "testdata/inputs/"; - public static final String WIKISPLIT = "data/inputs/wikiSplit/"; - public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/"; - public static final String STOPLISTS = "data/inputs/stoplists/"; - public static final String GOLDENS = "testdata/goldens/"; - - public static final String TEST_OUTPUTS = "testdata/outputs/"; - - public void testItConj() throws Exception { - final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" + - "{{it-conj-are|accus|avere}}\n" + - "{{it-conj-care|pag|avere or essere}}\n" + - "{{it-conj-iare|studi|avere}}\n" + - "{{it-conj-iare-b|avvi|avere}}\n" + - "{{it-conj-ciare|pronunc|avere}}\n" + - "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" + - "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" + - "{{it-conj-ere|abbatt|avere}}\n" + - "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" + - "{{it-conj-ire-b|prefer|avere}}\n" + - "{{it-conj-urre|prod|avere}}\n" + - "{{it-conj-arsi|lav}}\n" + - "{{it-conj-ersi|abbatt}}\n" + - "{{it-conj-iarsi|annoi}}\n" + - "{{it-conj-carsi|coniug}}\n" + - "{{it-conj-ciarsi|affacc}}\n" + - "{{it-conj-irsi|vest}}\n" + - "{{it-conj-irsi-b|fer}}\n" + - "{{it-conj-ursi|rid|essere}}\n" + - "{{it-conj-cire|ricuc|avere}}\n" + - "{{it-conj-iarsi-b|riavvi|essere}}" + - "{{it-conj-fare|putre|avere}}\n" + - "{{it-conj-cirsi|cuc|essere}}\n" + - "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" + - "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" + - "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" + - "{{term|verbo|verbō|for the word}}\n" - ; - final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it, "", "", Collections.singleton("X"), Collections.singleton("X")); - WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s"); - parser.title = "dummyTitle"; - parser.entrySource = new EntrySource(0, "dummySource", 0); - parser.parseSection("dummyHeading", toParse); - db.build(); - - final String dictName = "testItConj.html"; - final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName)); - db.dictionary.print(out); - out.close(); - - assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName); - } - - public void doTestCustomDict(final String name, final String lang1, - final String lang2, final String inputFile) throws Exception { - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=" + lang1, - "--lang2=" + lang2, - "--lang1Stoplist=" + STOPLISTS + "empty.txt", - "--lang2Stoplist=" + STOPLISTS + "empty.txt", - "--dictInfo=bleh.", - - "--input1=testdata/inputs/" + inputFile, - "--input1Name=my_input_" + name, - "--input1Charset=ISO-8859-1", - "--input1Format=tab_separated", - - "--print=" + result.getPath() + ".text", - }); - - checkGolden(name, result); - } - - public void test_FR_NL() throws Exception { - doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt"); - } - - public void testWiktionary_en_de2fr() throws Exception { - wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR"); - } - - public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1, - final String lang2) throws Exception { - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=" + lang1, - "--lang2=" + lang2, - "--lang1Stoplist=" + STOPLISTS + "empty.txt", - "--lang2Stoplist=" + STOPLISTS + "empty.txt", - "--dictInfo=SomeWikiDataTrans2Trans", - - "--input4=" + WIKISPLIT_EN + "EN.data", - "--input4Name=" + name, - "--input4Format=" + EnTranslationToTranslationParser.NAME, - "--input4LangPattern1=" + lang1, - "--input4LangPattern2=" + lang2, - "--input4PageLimit=1000", - - "--print=" + result.getPath() + ".text", - }); - - checkGolden(name, result); - } - - public void testWiktionary_WholeSection_DE() throws Exception { - enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100); - } - - public void testWiktionary_WholeSection_EN() throws Exception { - enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100); - } - - public void testWiktionary_WholeSection_IT() throws Exception { - // Have to run to 800 to get a few verb conjugations (including essere!) - enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800); - } - - public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception { - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=" + langCode, - "--lang2=" + "EN", - "--lang1Stoplist=" + STOPLISTS + "empty.txt", - "--lang2Stoplist=" + STOPLISTS + "empty.txt", - "--dictInfo=SomeWikiDataWholeSection", - - "--input4=" + WIKISPLIT_EN + langCode + ".data", - "--input4Name=" + name, - "--input4Format=" + WholeSectionToHtmlParser.NAME, - "--input4WiktionaryLang=EN", - "--input4SkipLang=" + langCode, - "--input4TitleIndex=" + "1", - "--input4PageLimit=" + pageLimit, - - "--print=" + result.getPath() + ".text", - }); - checkGolden(name, result); - } - - //----------------------------------------------------------------- - - public void testSingleLang_EN() throws Exception { - wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100); - } - - public void testSingleLang_DE() throws Exception { - wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100); - } - - public void testSingleLang_IT() throws Exception { - wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100); - } - - public void testSingleLang_FR() throws Exception { - wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100); - } - - public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception { - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=" + langCode, - "--lang1Stoplist=" + STOPLISTS + "empty.txt", - "--dictInfo=SomeWikiDataWholeSection", - "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data", - "--input4Name=" + name, - "--input4Format=" + WholeSectionToHtmlParser.NAME, - "--input4WiktionaryLang=" + langCode, - "--input4SkipLang=" + langCode, - "--input4TitleIndex=" + "1", - "--input4PageLimit=" + pageLimit, - "--print=" + result.getPath() + ".text", - }); - checkGolden(name, result); + + public static final String TEST_INPUTS = "testdata/inputs/"; + public static final String WIKISPLIT = "data/inputs/wikiSplit/"; + public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/"; + public static final String STOPLISTS = "data/inputs/stoplists/"; + public static final String GOLDENS = "testdata/goldens/"; + + public static final String TEST_OUTPUTS = "testdata/outputs/"; + + public void testItConj() throws Exception { + final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" + + "{{it-conj-are|accus|avere}}\n" + + "{{it-conj-care|pag|avere or essere}}\n" + + "{{it-conj-iare|studi|avere}}\n" + + "{{it-conj-iare-b|avvi|avere}}\n" + + "{{it-conj-ciare|pronunc|avere}}\n" + + "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" + + "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" + + "{{it-conj-ere|abbatt|avere}}\n" + + "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" + + "{{it-conj-ire-b|prefer|avere}}\n" + + "{{it-conj-urre|prod|avere}}\n" + + "{{it-conj-arsi|lav}}\n" + + "{{it-conj-ersi|abbatt}}\n" + + "{{it-conj-iarsi|annoi}}\n" + + "{{it-conj-carsi|coniug}}\n" + + "{{it-conj-ciarsi|affacc}}\n" + + "{{it-conj-irsi|vest}}\n" + + "{{it-conj-irsi-b|fer}}\n" + + "{{it-conj-ursi|rid|essere}}\n" + + "{{it-conj-cire|ricuc|avere}}\n" + + "{{it-conj-iarsi-b|riavvi|essere}}" + + "{{it-conj-fare|putre|avere}}\n" + + "{{it-conj-cirsi|cuc|essere}}\n" + + "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" + + "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" + + "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" + + "{{term|verbo|verbō|for the word}}\n" + ; + final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it, "", "", Collections.singleton("X"), Collections.singleton("X")); + WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s"); + parser.title = "dummyTitle"; + parser.entrySource = new EntrySource(0, "dummySource", 0); + parser.parseSection("dummyHeading", toParse); + db.build(); + + final String dictName = "testItConj.html"; + final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName)); + db.dictionary.print(out); + out.close(); + + assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName); + } + + public void doTestCustomDict(final String name, final String lang1, + final String lang2, final String inputFile) throws Exception { + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=" + lang1, + "--lang2=" + lang2, + "--lang1Stoplist=" + STOPLISTS + "empty.txt", + "--lang2Stoplist=" + STOPLISTS + "empty.txt", + "--dictInfo=bleh.", + + "--input1=testdata/inputs/" + inputFile, + "--input1Name=my_input_" + name, + "--input1Charset=ISO-8859-1", + "--input1Format=tab_separated", + + "--print=" + result.getPath() + ".text", + }); + + checkGolden(name, result); + } + + public void test_FR_NL() throws Exception { + doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt"); + } + + public void testWiktionary_en_de2fr() throws Exception { + wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR"); + } + + public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1, + final String lang2) throws Exception { + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=" + lang1, + "--lang2=" + lang2, + "--lang1Stoplist=" + STOPLISTS + "empty.txt", + "--lang2Stoplist=" + STOPLISTS + "empty.txt", + "--dictInfo=SomeWikiDataTrans2Trans", + + "--input4=" + WIKISPLIT_EN + "EN.data", + "--input4Name=" + name, + "--input4Format=" + EnTranslationToTranslationParser.NAME, + "--input4LangPattern1=" + lang1, + "--input4LangPattern2=" + lang2, + "--input4PageLimit=1000", + + "--print=" + result.getPath() + ".text", + }); + + checkGolden(name, result); + } + + public void testWiktionary_WholeSection_DE() throws Exception { + enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100); + } + + public void testWiktionary_WholeSection_EN() throws Exception { + enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100); + } + + public void testWiktionary_WholeSection_IT() throws Exception { + // Have to run to 800 to get a few verb conjugations (including essere!) + enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800); + } + + public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception { + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=" + langCode, + "--lang2=" + "EN", + "--lang1Stoplist=" + STOPLISTS + "empty.txt", + "--lang2Stoplist=" + STOPLISTS + "empty.txt", + "--dictInfo=SomeWikiDataWholeSection", + + "--input4=" + WIKISPLIT_EN + langCode + ".data", + "--input4Name=" + name, + "--input4Format=" + WholeSectionToHtmlParser.NAME, + "--input4WiktionaryLang=EN", + "--input4SkipLang=" + langCode, + "--input4TitleIndex=" + "1", + "--input4PageLimit=" + pageLimit, + + "--print=" + result.getPath() + ".text", + }); + checkGolden(name, result); + } + + //----------------------------------------------------------------- + + public void testSingleLang_EN() throws Exception { + wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100); + } + + public void testSingleLang_DE() throws Exception { + wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100); + } + + public void testSingleLang_IT() throws Exception { + wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100); + } + + public void testSingleLang_FR() throws Exception { + wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100); + } + + public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception { + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=" + langCode, + "--lang1Stoplist=" + STOPLISTS + "empty.txt", + "--dictInfo=SomeWikiDataWholeSection", + "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data", + "--input4Name=" + name, + "--input4Format=" + WholeSectionToHtmlParser.NAME, + "--input4WiktionaryLang=" + langCode, + "--input4SkipLang=" + langCode, + "--input4TitleIndex=" + "1", + "--input4PageLimit=" + pageLimit, + "--print=" + result.getPath() + ".text", + }); + checkGolden(name, result); + } + + //----------------------------------------------------------------- + + public void testWiktionary_IT_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt", + "EN.data", "enwiktionary.english", "Italian", "it", 1000); + } + + public void testWiktionary_cmn_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt", + // These missing "e" prevents a complete match, forcing the name to be printed + "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000); + } + + public void testWiktionary_DE_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt", + "EN.data", "enwiktionary.english", "German", "de", 1000); + } + + public void testWiktionary_IT_IT() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt", + "IT.data", "enwiktionary.italian", "Italian", "it", 1000); } - //----------------------------------------------------------------- - - public void testWiktionary_IT_EN() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt", - "EN.data", "enwiktionary.english", "Italian", "it", 1000); - } - - public void testWiktionary_cmn_EN() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt", - // These missing "e" prevents a complete match, forcing the name to be printed - "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000); - } - - public void testWiktionary_DE_EN() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt", - "EN.data", "enwiktionary.english", "German", "de", 1000); - } - - public void testWiktionary_IT_IT() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt", - "IT.data", "enwiktionary.italian", "Italian", "it", 1000); - } - - // French - public void testWiktionary_FR_FR() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt", - "FR.data", "enwiktionary.french", "French", "fr", 1000); - } - - - // Arabic - public void testWiktionary_AR_AR() throws Exception { - // Arabic is really big for some reason, use fewer pages. - wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt", - "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200); - } - - // Chinese - public void testWiktionary_cmn_cmn() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt", - // These missing "e" prevents a complete match, forcing the name to be printed. - "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000); - } - - // German - public void testWiktionary_DE_DE() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt", - "DE.data", "enwiktionary.german", "German", "de", 1000); - } - - // Thai - public void testWiktionary_TH_TH() throws Exception { - wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt", - // These missing "e" prevents a complete match, forcing the name to be printed. - "TH.data", "enwiktionary.thai", "Thai", "th", 1000); - } - - public void wiktionaryTestWithLangToEn(final String name, final String lang1, - final String stoplist, final String data, final String dictName, - final String langPattern, final String langCode, int pageLimit) throws Exception { - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign"; - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=" + lang1, - "--lang2=EN", - "--lang1Stoplist=" + STOPLISTS + stoplist, - "--lang2Stoplist=" + STOPLISTS + "en.txt", - "--dictInfo=SomeWikiData", - - "--input4=" + WIKISPLIT_EN + data, - "--input4Name=" + dictName, - "--input4Format=enwiktionary", - "--input4WiktionaryType=" + type, - "--input4LangPattern=" + langPattern, - "--input4LangCodePattern=" + langCode, - "--input4EnIndex=2", - "--input4PageLimit=" + pageLimit, - - "--print=" + result.getPath() + ".text", - }); - - checkGolden(name, result); - } - - public void testGermanCombined() throws Exception { - final String name = "de-en.quickdic"; - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=DE", - "--lang2=EN", - "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt", - - "--input1=" + TEST_INPUTS + "de-en_chemnitz_100", - "--input1Name=chemnitz", - "--input1Charset=UTF8", - "--input1Format=chemnitz", - - "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated", - "--input2Name=dictcc", - "--input2Charset=UTF8", - "--input2Format=tab_separated", - - "--print=" + result.getPath() + ".text", - }); - - checkGolden(name, result); - } - - public void testItalianTurkish() throws Exception { - final String name = "it-tr_dictcc.quickdic"; - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=IT", - "--lang2=TR", - "--dictInfo=it-tr_dictcc_simulated", - - "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt", - "--input1Name=dictcc", - "--input1Charset=UTF8", - "--input1Format=tab_separated", - - "--print=" + result.getPath() + ".text", - }); - - checkGolden(name, result); + // French + public void testWiktionary_FR_FR() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt", + "FR.data", "enwiktionary.french", "French", "fr", 1000); } - private void checkGolden(final String dictName, final File dictFile) - throws IOException, FileNotFoundException { - // Check it once: - assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); - // Check it again. - final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r")); - final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text")); - dict.print(out); - out.close(); - assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); - } + // Arabic + public void testWiktionary_AR_AR() throws Exception { + // Arabic is really big for some reason, use fewer pages. + wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt", + "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200); + } + // Chinese + public void testWiktionary_cmn_cmn() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt", + // These missing "e" prevents a complete match, forcing the name to be printed. + "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000); + } + + // German + public void testWiktionary_DE_DE() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt", + "DE.data", "enwiktionary.german", "German", "de", 1000); + } + + // Thai + public void testWiktionary_TH_TH() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt", + // These missing "e" prevents a complete match, forcing the name to be printed. + "TH.data", "enwiktionary.thai", "Thai", "th", 1000); + } + + public void wiktionaryTestWithLangToEn(final String name, final String lang1, + final String stoplist, final String data, final String dictName, + final String langPattern, final String langCode, int pageLimit) throws Exception { + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign"; + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=" + lang1, + "--lang2=EN", + "--lang1Stoplist=" + STOPLISTS + stoplist, + "--lang2Stoplist=" + STOPLISTS + "en.txt", + "--dictInfo=SomeWikiData", + + "--input4=" + WIKISPLIT_EN + data, + "--input4Name=" + dictName, + "--input4Format=enwiktionary", + "--input4WiktionaryType=" + type, + "--input4LangPattern=" + langPattern, + "--input4LangCodePattern=" + langCode, + "--input4EnIndex=2", + "--input4PageLimit=" + pageLimit, + + "--print=" + result.getPath() + ".text", + }); + + checkGolden(name, result); + } + + public void testGermanCombined() throws Exception { + final String name = "de-en.quickdic"; + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=DE", + "--lang2=EN", + "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt", + + "--input1=" + TEST_INPUTS + "de-en_chemnitz_100", + "--input1Name=chemnitz", + "--input1Charset=UTF8", + "--input1Format=chemnitz", + + "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated", + "--input2Name=dictcc", + "--input2Charset=UTF8", + "--input2Format=tab_separated", + + "--print=" + result.getPath() + ".text", + }); + + checkGolden(name, result); + } + + public void testItalianTurkish() throws Exception { + final String name = "it-tr_dictcc.quickdic"; + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=IT", + "--lang2=TR", + "--dictInfo=it-tr_dictcc_simulated", + + "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt", + "--input1Name=dictcc", + "--input1Charset=UTF8", + "--input1Format=tab_separated", + + "--print=" + result.getPath() + ".text", + }); + + checkGolden(name, result); + } + + private void checkGolden(final String dictName, final File dictFile) + throws IOException, FileNotFoundException { + // Check it once: + assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); + + // Check it again. + final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r")); + final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text")); + dict.print(out); + out.close(); + assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); + } + + + void assertFilesEqual(final String expected, final String actual) throws IOException { + final String expectedString = FileUtil.readToString(new File(expected)); + final String actualString = FileUtil.readToString(new File(actual)); + assertEquals(expectedString, actualString); + } - void assertFilesEqual(final String expected, final String actual) throws IOException { - final String expectedString = FileUtil.readToString(new File(expected)); - final String actualString = FileUtil.readToString(new File(actual)); - assertEquals(expectedString, actualString); - } - } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index 23747e1..16db723 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -28,66 +28,66 @@ import com.hughes.util.CollectionUtil; public class DictionaryTest extends TestCase { - - static final String TEST_OUTPUTS = com.hughes.android.dictionary.engine.DictionaryBuilderTest.TEST_OUTPUTS; - public static final String OUTPUTS = "data/outputs/"; - - @Override - protected void setUp() { - while (!TransliteratorManager.init(null)) { - try { - Thread.sleep(10); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - - public void testURLFormatting() { - } - - public void testEnItWiktionary() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-IT.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index enIndex = dict.indices.get(0); - - final RowBase row = enIndex.rows.get(4); - assertEquals("-ical", row.getRawText(false)); - - final Index itIndex = dict.indices.get(1); - { - final List rows = itIndex.multiWordSearch("come mai", Arrays.asList("come", "mai"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.get(0).toString().startsWith("come mai@")); - assertTrue(rows.get(0) instanceof TokenRow); - assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); - } - { - final List rows = itIndex.multiWordSearch("buon g", Arrays.asList("buon", "g"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.get(0).toString().startsWith("buon giorno@")); - assertTrue(rows.get(0) instanceof TokenRow); - assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); + static final String TEST_OUTPUTS = com.hughes.android.dictionary.engine.DictionaryBuilderTest.TEST_OUTPUTS; + public static final String OUTPUTS = "data/outputs/"; + + @Override + protected void setUp() { + while (!TransliteratorManager.init(null)) { + try { + Thread.sleep(10); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } } - { - final IndexEntry searchResult = itIndex.findInsertionPoint("azzurro", new AtomicBoolean( - false)); - HtmlEntry htmlEntry = searchResult.htmlEntries.get(0); - System.out.println("azzurro:\n" + htmlEntry.getHtml()); + public void testURLFormatting() { } - raf.close(); - } + public void testEnItWiktionary() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-IT.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index enIndex = dict.indices.get(0); + + final RowBase row = enIndex.rows.get(4); + assertEquals("-ical", row.getRawText(false)); + + final Index itIndex = dict.indices.get(1); + { + final List rows = itIndex.multiWordSearch("come mai", Arrays.asList("come", "mai"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.get(0).toString().startsWith("come mai@")); + assertTrue(rows.get(0) instanceof TokenRow); + assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); + } + + { + final List rows = itIndex.multiWordSearch("buon g", Arrays.asList("buon", "g"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.get(0).toString().startsWith("buon giorno@")); + assertTrue(rows.get(0) instanceof TokenRow); + assertTrue(!((TokenRow)rows.get(0)).getIndexEntry().htmlEntries.isEmpty()); + } + + { + final IndexEntry searchResult = itIndex.findInsertionPoint("azzurro", new AtomicBoolean( + false)); + HtmlEntry htmlEntry = searchResult.htmlEntries.get(0); + System.out.println("azzurro:\n" + htmlEntry.getHtml()); + } + + raf.close(); + } // public void testFr() throws IOException { // final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "FR.quickdic", "r"); // final Dictionary dict = new Dictionary(raf); // final Index frIndex = dict.indices.get(0); -// +// // // Now they're all cached, we shouldn't have to search. // for (final IndexEntry indexEntry : frIndex.sortedIndexEntries) { // System.out.println(indexEntry.token); @@ -96,302 +96,302 @@ public class DictionaryTest extends TestCase { // raf.close(); // } - - public void testDeEnWiktionary() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - - final Index deIndex = dict.indices.get(0); - - { - final IndexEntry searchResult = deIndex.findInsertionPoint("rot", new AtomicBoolean( - false)); - HtmlEntry htmlEntry = searchResult.htmlEntries.get(0); - System.out.println("rot:\n" + htmlEntry.getHtml()); - } - - raf.close(); - } - public void testGermanMetadata() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index deIndex = dict.indices.get(0); - - assertEquals("DE", deIndex.shortName); - assertEquals("DE->EN", deIndex.longName); - - assertEquals(2, dict.sources.size()); - assertEquals("chemnitz", dict.sources.get(0).name); - assertEquals("dictcc", dict.sources.get(1).name); - - assertEquals("dictcc", dict.pairEntries.get(0).entrySource.name); - assertEquals("chemnitz", dict.pairEntries.get(1).entrySource.name); - - raf.close(); - } - - public void testGermanIndex() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index deIndex = dict.indices.get(0); - - for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) { - System.out.println("testing: " + indexEntry.token); - final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean( - false)); - assertEquals("Looked up: " + indexEntry.token, indexEntry.token.toLowerCase(), searchResult.token.toLowerCase()); - } + public void testDeEnWiktionary() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); - // TODO: maybe if user types capitalization, use it. - assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false))); - assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false))); - assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false))); - assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false))); - - // Before the beginning. - assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false))); - assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false))); - - // After the end. - assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false))); - - assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false))); - assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false))); - assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false))); - - - assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false))); - assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false))); - - assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false))); - assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false))); - - assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new AtomicBoolean(false))); - - // Check that search in lowercase works. - assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false))); - System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString()); - - raf.close(); - } - - private void assertSearchResult(final String insertionPoint, final String longestPrefix, - final IndexEntry actual) { - assertEquals(insertionPoint, actual.token); - } - - public void testGermanTokenRows() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index deIndex = dict.indices.get(0); - - // Pre-cache a few of these, just to make sure that's working. - for (int i = 0; i < deIndex.rows.size(); i += 7) { - deIndex.rows.get(i).getTokenRow(true); - } - - // Do the exhaustive searching. - TokenRow lastTokenRow = null; - for (final RowBase row : deIndex.rows) { - if (row instanceof TokenRow) { - lastTokenRow = (TokenRow) row; - } - assertEquals(lastTokenRow, row.getTokenRow(true)); - } + final Index deIndex = dict.indices.get(0); - // Now they're all cached, we shouldn't have to search. - for (final RowBase row : deIndex.rows) { - if (row instanceof TokenRow) { - lastTokenRow = (TokenRow) row; - } - // This will break if the Row cache isn't big enough. - assertEquals(lastTokenRow, row.getTokenRow(false)); + { + final IndexEntry searchResult = deIndex.findInsertionPoint("rot", new AtomicBoolean( + false)); + HtmlEntry htmlEntry = searchResult.htmlEntries.get(0); + System.out.println("rot:\n" + htmlEntry.getHtml()); + } + + raf.close(); } - - raf.close(); - } - - public void testChemnitz() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index deIndex = dict.indices.get(0); - - assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false))); - assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false))); - - raf.close(); - } - - public void testMultiSearch() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index deIndex = dict.indices.get(0); - - { - final List rows = deIndex.multiWordSearch("aaa aaab", Arrays.asList("aaa", "aaab"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); + + public void testGermanMetadata() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + + assertEquals("DE", deIndex.shortName); + assertEquals("DE->EN", deIndex.longName); + + assertEquals(2, dict.sources.size()); + assertEquals("chemnitz", dict.sources.get(0).name); + assertEquals("dictcc", dict.sources.get(1).name); + + assertEquals("dictcc", dict.pairEntries.get(0).entrySource.name); + assertEquals("chemnitz", dict.pairEntries.get(1).entrySource.name); + + raf.close(); } - - raf.close(); - } - - public void testMultiSearchIt() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "IT.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index index = dict.indices.get(0); - - { - final List rows = index.multiWordSearch("fare centro", - Arrays.asList("fare", "centro"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.get(0).toString().startsWith("fare centro@")); - } - } - - public void testMultiSearchDeBig() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index enIndex = dict.indices.get(1); - - { - final List rows = enIndex.multiWordSearch("train station", Arrays.asList("train", "station"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.get(0).toString().startsWith("train station@")); + + public void testGermanIndex() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + + for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) { + System.out.println("testing: " + indexEntry.token); + final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean( + false)); + assertEquals("Looked up: " + indexEntry.token, indexEntry.token.toLowerCase(), searchResult.token.toLowerCase()); + } + + // TODO: maybe if user types capitalization, use it. + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false))); + + // Before the beginning. + assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false))); + assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false))); + + // After the end. + assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false))); + + assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false))); + assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false))); + assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false))); + + + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false))); + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false))); + + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false))); + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false))); + + assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new AtomicBoolean(false))); + + // Check that search in lowercase works. + assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false))); + System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString()); + + raf.close(); } - { - final List rows = enIndex.multiWordSearch("a train station", Arrays.asList("a", "train", "station"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertEquals("Bahnhofsuhr {{de-noun|g=f|plural=Bahnhofsuhren}}\tstation clock (at a train station)", rows.get(0).toString()); + private void assertSearchResult(final String insertionPoint, final String longestPrefix, + final IndexEntry actual) { + assertEquals(insertionPoint, actual.token); } - { - final List rows = enIndex.multiWordSearch("a station", Arrays.asList("a", "station"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertEquals("Abfahrthalle {en-noun}\tDeparture room of a station.", rows.get(0).toString()); + public void testGermanTokenRows() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + + // Pre-cache a few of these, just to make sure that's working. + for (int i = 0; i < deIndex.rows.size(); i += 7) { + deIndex.rows.get(i).getTokenRow(true); + } + + // Do the exhaustive searching. + TokenRow lastTokenRow = null; + for (final RowBase row : deIndex.rows) { + if (row instanceof TokenRow) { + lastTokenRow = (TokenRow) row; + } + assertEquals(lastTokenRow, row.getTokenRow(true)); + } + + // Now they're all cached, we shouldn't have to search. + for (final RowBase row : deIndex.rows) { + if (row instanceof TokenRow) { + lastTokenRow = (TokenRow) row; + } + // This will break if the Row cache isn't big enough. + assertEquals(lastTokenRow, row.getTokenRow(false)); + } + + raf.close(); } - { - // Should print: Giving up, too many words with prefix: p - final List rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.toString().contains("verschlingen; verputzen\tto dispatch (eat)")); + public void testChemnitz() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + + assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false))); + assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false))); + + raf.close(); } - { - // Should print: Giving up, too many words with prefix: p - final List rows = enIndex.multiWordSearch("p p", Arrays.asList("p", "p"), new AtomicBoolean(false)); - assertTrue(rows.size() >= 1000); + public void testMultiSearch() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(TEST_OUTPUTS + "de-en.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index deIndex = dict.indices.get(0); + + { + final List rows = deIndex.multiWordSearch("aaa aaab", Arrays.asList("aaa", "aaab"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + } + + raf.close(); } - { - // Should print: Giving up, too many words with prefix: a - final List rows = enIndex.multiWordSearch("a a", Arrays.asList("a", "a"), new AtomicBoolean(false)); - assertTrue(rows.size() >= 1000); + public void testMultiSearchIt() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "IT.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index index = dict.indices.get(0); + + { + final List rows = index.multiWordSearch("fare centro", + Arrays.asList("fare", "centro"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.get(0).toString().startsWith("fare centro@")); + } } - { - // Should print: Giving up, too many words with prefix: a - final List rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false)); - assertTrue(rows.size() >= 1000); + public void testMultiSearchDeBig() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "DE-EN.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index enIndex = dict.indices.get(1); + + { + final List rows = enIndex.multiWordSearch("train station", Arrays.asList("train", "station"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.get(0).toString().startsWith("train station@")); + } + + { + final List rows = enIndex.multiWordSearch("a train station", Arrays.asList("a", "train", "station"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("Bahnhofsuhr {{de-noun|g=f|plural=Bahnhofsuhren}}\tstation clock (at a train station)", rows.get(0).toString()); + } + + { + final List rows = enIndex.multiWordSearch("a station", Arrays.asList("a", "station"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("Abfahrthalle {en-noun}\tDeparture room of a station.", rows.get(0).toString()); + } + + { + // Should print: Giving up, too many words with prefix: p + final List rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.toString().contains("verschlingen; verputzen\tto dispatch (eat)")); + } + + { + // Should print: Giving up, too many words with prefix: p + final List rows = enIndex.multiWordSearch("p p", Arrays.asList("p", "p"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + { + // Should print: Giving up, too many words with prefix: a + final List rows = enIndex.multiWordSearch("a a", Arrays.asList("a", "a"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + { + // Should print: Giving up, too many words with prefix: a + final List rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + { + // Should print: Giving up, too many words with prefix: a + final List rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false)); + assertTrue(rows.size() >= 1000); + } + + raf.close(); } - { - // Should print: Giving up, too many words with prefix: a - final List rows = enIndex.multiWordSearch("b ba", Arrays.asList("b", "ba"), new AtomicBoolean(false)); - assertTrue(rows.size() >= 1000); + public void testMultiSearchBigAF() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "AF-EN.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index enIndex = dict.indices.get(1); + + { + final List rows = enIndex.multiWordSearch("pig eats", Arrays.asList("pig", "eats"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString()); + } + + { + final List rows = enIndex.multiWordSearch("pig eat", Arrays.asList("pig", "eat"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString()); + } + + { + final List rows = enIndex.multiWordSearch("pi ea", Arrays.asList("pi", "ea"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)")); + } + + { + final List rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false)); + System.out.println(CollectionUtil.join(rows, "\n ")); + assertTrue(rows.toString(), rows.size() > 0); + assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)")); + } + + + raf.close(); } - raf.close(); - } - public void testMultiSearchBigAF() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "AF-EN.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index enIndex = dict.indices.get(1); + public void testExactSearch() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-cmn.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index cmnIndex = dict.indices.get(1); - { - final List rows = enIndex.multiWordSearch("pig eats", Arrays.asList("pig", "eats"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString()); - } + final Random random = new Random(10); - { - final List rows = enIndex.multiWordSearch("pig eat", Arrays.asList("pig", "eat"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertEquals("vark\tpig (someone who overeats or eats rapidly) (noun)", rows.get(0).toString()); - } + for (int i = 0; i < 1000; ++i) { + final int ii = random.nextInt(cmnIndex.sortedIndexEntries.size()); + final IndexEntry indexEntry = cmnIndex.sortedIndexEntries.get(ii); + final IndexEntry found = cmnIndex.findExact(indexEntry.token); + assertNotNull(found); + assertEquals(indexEntry.token, found.token); + assertEquals(indexEntry, found); // Test of caching.... + } - { - final List rows = enIndex.multiWordSearch("pi ea", Arrays.asList("pi", "ea"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)")); + raf.close(); } - { - final List rows = enIndex.multiWordSearch("p eat", Arrays.asList("p", "eat"), new AtomicBoolean(false)); - System.out.println(CollectionUtil.join(rows, "\n ")); - assertTrue(rows.toString(), rows.size() > 0); - assertTrue(rows.toString().contains("vark\tpig (someone who overeats or eats rapidly) (noun)")); + public void testThai() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-TH.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index thIndex = dict.indices.get(1); + + final IndexEntry entry = thIndex.findInsertionPoint("ดี", new AtomicBoolean(false)); + assertEquals("di", entry.token); + + raf.close(); } - - raf.close(); - } - - - public void testExactSearch() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-cmn.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index cmnIndex = dict.indices.get(1); - - final Random random = new Random(10); - - for (int i = 0; i < 1000; ++i) { - final int ii = random.nextInt(cmnIndex.sortedIndexEntries.size()); - final IndexEntry indexEntry = cmnIndex.sortedIndexEntries.get(ii); - final IndexEntry found = cmnIndex.findExact(indexEntry.token); - assertNotNull(found); - assertEquals(indexEntry.token, found.token); - assertEquals(indexEntry, found); // Test of caching.... + public void testNorwegian() throws IOException { + final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-NL.quickdic", "r"); + final Dictionary dict = new Dictionary(raf); + final Index nlIndex = dict.indices.get(1); + + IndexEntry entry = nlIndex.findInsertionPoint("Xhosa", new AtomicBoolean(false)); + assertEquals("Xhosa", entry.token); + + entry = nlIndex.findInsertionPoint("Zyne", new AtomicBoolean(false)); + assertEquals("Zyne", entry.token); + + raf.close(); } - - raf.close(); - } - - public void testThai() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-TH.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index thIndex = dict.indices.get(1); - - final IndexEntry entry = thIndex.findInsertionPoint("ดี", new AtomicBoolean(false)); - assertEquals("di", entry.token); - - raf.close(); - } - - public void testNorwegian() throws IOException { - final RandomAccessFile raf = new RandomAccessFile(OUTPUTS + "EN-NL.quickdic", "r"); - final Dictionary dict = new Dictionary(raf); - final Index nlIndex = dict.indices.get(1); - - IndexEntry entry = nlIndex.findInsertionPoint("Xhosa", new AtomicBoolean(false)); - assertEquals("Xhosa", entry.token); - - entry = nlIndex.findInsertionPoint("Zyne", new AtomicBoolean(false)); - assertEquals("Zyne", entry.token); - - raf.close(); - } } diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 0c3fa13..e7e1b43 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -29,149 +29,150 @@ import com.hughes.android.dictionary.engine.Index.IndexEntry; import com.hughes.android.dictionary.parser.DictFileParser; public class IndexBuilder { - - final DictionaryBuilder dictionaryBuilder; - public final Index index; - final Set stoplist; - - final SortedMap tokenToData; - - IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { - this.dictionaryBuilder = dictionaryBuilder; - index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); - tokenToData = new TreeMap(index.getSortComparator()); - this.stoplist = stoplist; - } - - public void build() { - final Set tokenIndexedEntries = new HashSet(); - final List rows = index.rows; - index.mainTokenCount = 0; - for (final TokenData tokenData : tokenToData.values()) { - tokenIndexedEntries.clear(); - final int indexIndex = index.sortedIndexEntries.size(); - final int startRow = rows.size(); - - TokenRow tokenRow = null; - if (!tokenData.htmlEntries.isEmpty()) { - tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); - rows.add(tokenRow); - } - + + final DictionaryBuilder dictionaryBuilder; + public final Index index; + final Set stoplist; + + final SortedMap tokenToData; + + IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final Set stoplist, final boolean swapPairEntries) { + this.dictionaryBuilder = dictionaryBuilder; + index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries, stoplist); + tokenToData = new TreeMap(index.getSortComparator()); + this.stoplist = stoplist; + } + + public void build() { + final Set tokenIndexedEntries = new HashSet(); + final List rows = index.rows; + index.mainTokenCount = 0; + for (final TokenData tokenData : tokenToData.values()) { + tokenIndexedEntries.clear(); + final int indexIndex = index.sortedIndexEntries.size(); + final int startRow = rows.size(); + + TokenRow tokenRow = null; + if (!tokenData.htmlEntries.isEmpty()) { + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); + } + // System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); - - int numRows = 0; // off by one--doesn't count the token row! + + int numRows = 0; // off by one--doesn't count the token row! // System.out.println("TOKEN: " + tokenData.token); - for (final Map.Entry> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) { - for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) { - if (!indexedEntry.isValid) { - continue; - } - - if (tokenRow == null) { - tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); - rows.add(tokenRow); - } - - if (indexedEntry.entry.index() == -1) { - indexedEntry.entry.addToDictionary(dictionaryBuilder.dictionary); - assert indexedEntry.entry.index() >= 0; - } - if (tokenIndexedEntries.add(indexedEntry) && !tokenData.htmlEntries.contains(indexedEntry.entry)) { - rows.add(indexedEntry.entry.CreateRow(rows.size(), index)); - ++indexedEntry.entry.entrySource.numEntries; - ++numRows; - + for (final Map.Entry> typeToIndexedEntries : tokenData.typeToEntries.entrySet()) { + for (final IndexedEntry indexedEntry : typeToIndexedEntries.getValue()) { + if (!indexedEntry.isValid) { + continue; + } + + if (tokenRow == null) { + tokenRow = new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry); + rows.add(tokenRow); + } + + if (indexedEntry.entry.index() == -1) { + indexedEntry.entry.addToDictionary(dictionaryBuilder.dictionary); + assert indexedEntry.entry.index() >= 0; + } + if (tokenIndexedEntries.add(indexedEntry) && !tokenData.htmlEntries.contains(indexedEntry.entry)) { + rows.add(indexedEntry.entry.CreateRow(rows.size(), index)); + ++indexedEntry.entry.entrySource.numEntries; + ++numRows; + // System.out.print(" " + typeToEntry.getKey() + ": "); - // rows.get(rows.size() - 1).print(System.out); + // rows.get(rows.size() - 1).print(System.out); // System.out.println(); - } + } + } + } + + if (tokenRow != null) { + if (tokenRow.hasMainEntry) { + index.mainTokenCount++; + } + + final Index.IndexEntry indexEntry = new Index.IndexEntry(index, tokenData.token, index + .normalizer().transliterate(tokenData.token), startRow, numRows); + indexEntry.htmlEntries.addAll(tokenData.htmlEntries); + index.sortedIndexEntries.add(indexEntry); + } + } + + final List entriesSortedByNumRows = new ArrayList(index.sortedIndexEntries); + Collections.sort(entriesSortedByNumRows, new Comparator() { + @Override + public int compare(IndexEntry object1, IndexEntry object2) { + return object2.numRows - object1.numRows; + } + }); + System.out.println("Most common tokens:"); + for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { + System.out.println(" " + entriesSortedByNumRows.get(i)); } - } - - if (tokenRow != null) { - if (tokenRow.hasMainEntry) { - index.mainTokenCount++; - } - - final Index.IndexEntry indexEntry = new Index.IndexEntry(index, tokenData.token, index - .normalizer().transliterate(tokenData.token), startRow, numRows); - indexEntry.htmlEntries.addAll(tokenData.htmlEntries); - index.sortedIndexEntries.add(indexEntry); - } } - - final List entriesSortedByNumRows = new ArrayList(index.sortedIndexEntries); - Collections.sort(entriesSortedByNumRows, new Comparator() { - @Override - public int compare(IndexEntry object1, IndexEntry object2) { - return object2.numRows - object1.numRows; - }}); - System.out.println("Most common tokens:"); - for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { - System.out.println(" " + entriesSortedByNumRows.get(i)); + + public static class TokenData { + final String token; + + final Map> typeToEntries = new EnumMap>(EntryTypeName.class); + public boolean hasMainEntry = false; + + public List htmlEntries = new ArrayList(); + + TokenData(final String token) { + assert token.equals(token.trim()); + assert token.length() > 0; + this.token = token; + } } - } - - public static class TokenData { - final String token; - - final Map> typeToEntries = new EnumMap>(EntryTypeName.class); - public boolean hasMainEntry = false; - - public List htmlEntries = new ArrayList(); - - TokenData(final String token) { - assert token.equals(token.trim()); - assert token.length() > 0; - this.token = token; + + public TokenData getOrCreateTokenData(final String token) { + TokenData tokenData = tokenToData.get(token); + if (tokenData == null) { + tokenData = new TokenData(token); + tokenToData.put(token, tokenData); + } + return tokenData; } - } - public TokenData getOrCreateTokenData(final String token) { - TokenData tokenData = tokenToData.get(token); - if (tokenData == null) { - tokenData = new TokenData(token); - tokenToData.put(token, tokenData); + private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { + final TokenData tokenData = getOrCreateTokenData(token); + List entries = tokenData.typeToEntries.get(entryTypeName); + if (entryTypeName.mainWord) { + tokenData.hasMainEntry = true; + } + if (entries == null) { + entries = new ArrayList(); + tokenData.typeToEntries.put(entryTypeName, entries); + } + return entries; } - return tokenData; - } - - private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { - final TokenData tokenData = getOrCreateTokenData(token); - List entries = tokenData.typeToEntries.get(entryTypeName); - if (entryTypeName.mainWord) { - tokenData.hasMainEntry = true; + + public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, + final EntryTypeName entryTypeName) { + if (indexedEntry == null) { + System.out.println("asdfasdf"); + } + assert indexedEntry != null; + for (final String token : tokens) { + if (entryTypeName.overridesStopList || !stoplist.contains(token)) { + getOrCreateEntries(token, entryTypeName).add(indexedEntry); + } + } } - if (entries == null) { - entries = new ArrayList(); - tokenData.typeToEntries.put(entryTypeName, entries); + + public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); } - return entries; - } - public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, - final EntryTypeName entryTypeName) { - if (indexedEntry == null) { - System.out.println("asdfasdf"); + public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, entryTypeName); } - assert indexedEntry != null; - for (final String token : tokens) { - if (entryTypeName.overridesStopList || !stoplist.contains(token)) { - getOrCreateEntries(token, entryTypeName).add(indexedEntry); - } - } - } - - public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, - final EntryTypeName entryTypeName) { - final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); - addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? entryTypeName.singleWordInstance : entryTypeName); - } - - public void addEntryWithStringNoSingle(final IndexedEntry indexedEntry, final String untokenizedString, - final EntryTypeName entryTypeName) { - final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); - addEntryWithTokens(indexedEntry, tokens, entryTypeName); - } } diff --git a/src/com/hughes/android/dictionary/engine/IndexedEntry.java b/src/com/hughes/android/dictionary/engine/IndexedEntry.java index faf11fd..d708951 100644 --- a/src/com/hughes/android/dictionary/engine/IndexedEntry.java +++ b/src/com/hughes/android/dictionary/engine/IndexedEntry.java @@ -16,10 +16,10 @@ package com.hughes.android.dictionary.engine; public class IndexedEntry { - AbstractEntry entry; - public boolean isValid = false; - - public IndexedEntry(final AbstractEntry entry) { - this.entry = entry; - } + AbstractEntry entry; + public boolean isValid = false; + + public IndexedEntry(final AbstractEntry entry) { + this.entry = entry; + } } \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/engine/LanguageTest.java b/src/com/hughes/android/dictionary/engine/LanguageTest.java index d81ad87..24fe094 100644 --- a/src/com/hughes/android/dictionary/engine/LanguageTest.java +++ b/src/com/hughes/android/dictionary/engine/LanguageTest.java @@ -28,170 +28,170 @@ import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; import com.ibm.icu.text.Transliterator; public class LanguageTest extends TestCase { - - public void testGermanSort() { - final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD); - assertEquals("aüääss", normalizer.transform("aueAeAEß")); - final List words = Arrays.asList( - "er-ben", - "erben", - "Erben", - "Erbse", - "Erbsen", - "essen", - "Essen", - "Grosformat", - "Grosformats", - "Grossformat", - "Großformat", - "Grossformats", - "Großformats", - "Großpoo", - "Großpoos", - "Hörvermögen", - "Hörweite", - "hos", - "Höschen", - "Hostel", - "hulle", - "Hulle", - "huelle", - "Huelle", - "hülle", - "Hülle", - "Huellen", - "Hüllen", - "Hum" - ); - final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator(), 7); - assertEquals(1, comparator.compare("hülle", "huelle")); - assertEquals(-1, comparator.compare("huelle", "hülle")); - - assertEquals(-1, comparator.compare("hülle", "Hülle")); - - assertEquals("hülle", normalizer.transform("Hülle")); - assertEquals("hulle", normalizer.transform("Hulle")); - - - final List sorted = new ArrayList(words); + + public void testGermanSort() { + final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD); + assertEquals("aüääss", normalizer.transform("aueAeAEß")); + final List words = Arrays.asList( + "er-ben", + "erben", + "Erben", + "Erbse", + "Erbsen", + "essen", + "Essen", + "Grosformat", + "Grosformats", + "Grossformat", + "Großformat", + "Grossformats", + "Großformats", + "Großpoo", + "Großpoos", + "Hörvermögen", + "Hörweite", + "hos", + "Höschen", + "Hostel", + "hulle", + "Hulle", + "huelle", + "Huelle", + "hülle", + "Hülle", + "Huellen", + "Hüllen", + "Hum" + ); + final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator(), 7); + assertEquals(1, comparator.compare("hülle", "huelle")); + assertEquals(-1, comparator.compare("huelle", "hülle")); + + assertEquals(-1, comparator.compare("hülle", "Hülle")); + + assertEquals("hülle", normalizer.transform("Hülle")); + assertEquals("hulle", normalizer.transform("Hulle")); + + + final List sorted = new ArrayList(words); // Collections.shuffle(shuffled, new Random(0)); - Collections.sort(sorted, comparator); - System.out.println(sorted.toString()); - for (int i = 0; i < words.size(); ++i) { - System.out.println(words.get(i) + "\t" + sorted.get(i)); - assertEquals(words.get(i), sorted.get(i)); + Collections.sort(sorted, comparator); + System.out.println(sorted.toString()); + for (int i = 0; i < words.size(); ++i) { + System.out.println(words.get(i) + "\t" + sorted.get(i)); + assertEquals(words.get(i), sorted.get(i)); + } + } + + public void testEnglishSort() { + final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD); + + final List words = Arrays.asList( + "pre-print", + "preppie", + "preppy", + "preprocess"); + + final List sorted = new ArrayList(words); + final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator(), 7); + Collections.sort(sorted, comparator); + for (int i = 0; i < words.size(); ++i) { + if (i > 0) { + assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0); + } + System.out.println(words.get(i) + "\t" + sorted.get(i)); + assertEquals(words.get(i), sorted.get(i)); + } + + assertTrue(comparator.compare("pre-print", "preppy") < 0); + + } + + public void testLanguage() { + assertEquals(Language.de, Language.lookup("de")); + assertEquals(Language.en, Language.lookup("en")); + assertEquals("es", Language.lookup("es").getIsoCode()); + } + + public void testTextNorm() { + //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD); + final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD); + assertEquals("hoschen", transliterator.transliterate("Höschen")); + assertEquals("hoschen", transliterator.transliterate("Hoeschen")); + assertEquals("grosspoo", transliterator.transliterate("Großpoo")); + + assertEquals("kyanpasu", transliterator.transliterate("キャンパス")); + assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος")); + assertEquals("biologiceskom", transliterator.transliterate("биологическом")); + } + public void testHalfTextNorm() { + final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; ' ' > ; :: Lower; ", Transliterator.FORWARD); + assertEquals("kyanpasu", transliterator.transliterate("キャンパス")); + assertEquals("alphabētikóskatálogos", transliterator.transliterate("Αλφαβητικός Κατάλογος")); + assertEquals("biologičeskom", transliterator.transliterate("биологическом")); + + assertEquals("xièxiè", transliterator.transliterate("謝謝")); + assertEquals("xièxiè", transliterator.transliterate("谢谢")); + + assertEquals("diànnǎo", transliterator.transliterate("電腦")); + assertEquals("diànnǎo", transliterator.transliterate("电脑")); + assertEquals("jìsuànjī", transliterator.transliterate("計算機")); + assertEquals("jìsuànjī", transliterator.transliterate("计算机")); + } + + + public void testChinese() { + final Language cmn = Language.lookup("cmn"); + final Transliterator transliterator = Transliterator.createFromRules("", cmn.getDefaultNormalizerRules(), Transliterator.FORWARD); + + assertEquals("xiexie", transliterator.transliterate("謝謝")); + assertEquals("xiexie", transliterator.transliterate("谢谢")); + + assertEquals("diannao", transliterator.transliterate("電腦")); + assertEquals("diannao", transliterator.transliterate("电脑")); + assertEquals("jisuanji", transliterator.transliterate("計算機")); + assertEquals("jisuanji", transliterator.transliterate("计算机")); + + assertEquals("chengjiu", transliterator.transliterate("成就")); + + } + + public void testArabic() { + final Language ar = Language.lookup("ar"); + final Transliterator transliterator = Transliterator.createFromRules("", ar.getDefaultNormalizerRules(), Transliterator.FORWARD); + // These don't seem quite right.... + assertEquals("haswb", transliterator.transliterate("حاسوب")); + assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر")); + + assertEquals("{\u200e كمبيوتر \u200e}", Language.fixBidiText("{كمبيوتر}")); + assertEquals("{a=\u200e كمبيوتر \u200e}", Language.fixBidiText("{a=كمبيوتر}")); + assertEquals("(\u200e كمبيوتر \u200e)", Language.fixBidiText("(كمبيوتر)")); + assertEquals("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}", Language.fixBidiText("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}")); + } - } - - public void testEnglishSort() { - final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD); - - final List words = Arrays.asList( - "pre-print", - "preppie", - "preppy", - "preprocess"); - - final List sorted = new ArrayList(words); - final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator(), 7); - Collections.sort(sorted, comparator); - for (int i = 0; i < words.size(); ++i) { - if (i > 0) { - assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0); - } - System.out.println(words.get(i) + "\t" + sorted.get(i)); - assertEquals(words.get(i), sorted.get(i)); + + public void testThai() { + final Language th = Language.lookup("TH"); + final Transliterator transliterator = Transliterator.createFromRules("", th.getDefaultNormalizerRules(), Transliterator.FORWARD); + // Not sure these are right, just to know... + assertEquals("d", transliterator.transliterate("ด")); + assertEquals("di", transliterator.transliterate("ด ี")); + assertEquals("dii", transliterator.transliterate("ดีี")); + + assertEquals(Collections.singleton("ดีี"), DictFileParser.tokenize("ดีี", DictFileParser.NON_CHAR)); } - - assertTrue(comparator.compare("pre-print", "preppy") < 0); - - } - - public void testLanguage() { - assertEquals(Language.de, Language.lookup("de")); - assertEquals(Language.en, Language.lookup("en")); - assertEquals("es", Language.lookup("es").getIsoCode()); - } - - public void testTextNorm() { - //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD); - final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD); - assertEquals("hoschen", transliterator.transliterate("Höschen")); - assertEquals("hoschen", transliterator.transliterate("Hoeschen")); - assertEquals("grosspoo", transliterator.transliterate("Großpoo")); - - assertEquals("kyanpasu", transliterator.transliterate("キャンパス")); - assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος")); - assertEquals("biologiceskom", transliterator.transliterate("биологическом")); - } - public void testHalfTextNorm() { - final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; ' ' > ; :: Lower; ", Transliterator.FORWARD); - assertEquals("kyanpasu", transliterator.transliterate("キャンパス")); - assertEquals("alphabētikóskatálogos", transliterator.transliterate("Αλφαβητικός Κατάλογος")); - assertEquals("biologičeskom", transliterator.transliterate("биологическом")); - - assertEquals("xièxiè", transliterator.transliterate("謝謝")); - assertEquals("xièxiè", transliterator.transliterate("谢谢")); - - assertEquals("diànnǎo", transliterator.transliterate("電腦")); - assertEquals("diànnǎo", transliterator.transliterate("电脑")); - assertEquals("jìsuànjī", transliterator.transliterate("計算機")); - assertEquals("jìsuànjī", transliterator.transliterate("计算机")); - } - - - public void testChinese() { - final Language cmn = Language.lookup("cmn"); - final Transliterator transliterator = Transliterator.createFromRules("", cmn.getDefaultNormalizerRules(), Transliterator.FORWARD); - - assertEquals("xiexie", transliterator.transliterate("謝謝")); - assertEquals("xiexie", transliterator.transliterate("谢谢")); - - assertEquals("diannao", transliterator.transliterate("電腦")); - assertEquals("diannao", transliterator.transliterate("电脑")); - assertEquals("jisuanji", transliterator.transliterate("計算機")); - assertEquals("jisuanji", transliterator.transliterate("计算机")); - - assertEquals("chengjiu", transliterator.transliterate("成就")); - - } - - public void testArabic() { - final Language ar = Language.lookup("ar"); - final Transliterator transliterator = Transliterator.createFromRules("", ar.getDefaultNormalizerRules(), Transliterator.FORWARD); - // These don't seem quite right.... - assertEquals("haswb", transliterator.transliterate("حاسوب")); - assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر")); - - assertEquals("{\u200e كمبيوتر \u200e}", Language.fixBidiText("{كمبيوتر}")); - assertEquals("{a=\u200e كمبيوتر \u200e}", Language.fixBidiText("{a=كمبيوتر}")); - assertEquals("(\u200e كمبيوتر \u200e)", Language.fixBidiText("(كمبيوتر)")); - assertEquals("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}", Language.fixBidiText("أنثى أنْثَى (’únθā) {f}, إناث (’ināθ) {p}, اناثى (’anāθā) {p}")); - - } - - public void testThai() { - final Language th = Language.lookup("TH"); - final Transliterator transliterator = Transliterator.createFromRules("", th.getDefaultNormalizerRules(), Transliterator.FORWARD); - // Not sure these are right, just to know... - assertEquals("d", transliterator.transliterate("ด")); - assertEquals("di", transliterator.transliterate("ด ี")); - assertEquals("dii", transliterator.transliterate("ดีี")); - - assertEquals(Collections.singleton("ดีี"), DictFileParser.tokenize("ดีี", DictFileParser.NON_CHAR)); - } - - - public void testEnWiktionaryNames() { - final Set enLangs = new LinkedHashSet(WiktionaryLangs.isoCodeToEnWikiName.keySet()); - final List names = new ArrayList(); - for (final String code : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { - names.add(WiktionaryLangs.isoCodeToEnWikiName.get(code)); - enLangs.add(code.toLowerCase()); + + + public void testEnWiktionaryNames() { + final Set enLangs = new LinkedHashSet(WiktionaryLangs.isoCodeToEnWikiName.keySet()); + final List names = new ArrayList(); + for (final String code : WiktionaryLangs.isoCodeToEnWikiName.keySet()) { + names.add(WiktionaryLangs.isoCodeToEnWikiName.get(code)); + enLangs.add(code.toLowerCase()); + } + Collections.sort(names); + System.out.println(names); + //assertEquals(enLangs, Language.isoCodeToResources.keySet()); } - Collections.sort(names); - System.out.println(names); - //assertEquals(enLangs, Language.isoCodeToResources.keySet()); - } } diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 12b0c52..6839904 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -37,277 +37,276 @@ import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { - // The matches the whole line, otherwise regexes don't work well on French: - // {{=uk=}} - // Spanish has no initial headings, tried to also detect {{ES as such - // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English. - static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); - - final Map> pathToSelectors = new LinkedHashMap>(); - List currentSelectors = null; - - StringBuilder titleBuilder; - StringBuilder textBuilder; - StringBuilder currentBuilder = null; - - public static void main(final String[] args) throws Exception { - final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); - wiktionarySplitter.go(); - } - - private WiktionarySplitter() { - List selectors; - for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { - //if (!code.equals("fr")) {continue;} - selectors = new ArrayList(); - pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); - for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { - final String dir = String.format("data/inputs/wikiSplit/%s", code); - new File(dir).mkdirs(); - selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); - } - } - } - - private void go() throws Exception { - final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); - - // Configure things. - for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { - - currentSelectors = pathToSelectorsEntry.getValue(); - - for (final Selector selector : currentSelectors) { - selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); - } - - // Do it. - try { - parser.parse(new File(pathToSelectorsEntry.getKey()), this); - } catch (Exception e) { - System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); - throw e; - } - - // Shutdown. - for (final Selector selector : currentSelectors) { - selector.out.close(); - } - - } - } - - String lastPageTitle = null; - int pageCount = 0; - private void endPage() { - final String title = titleBuilder.toString(); - lastPageTitle = title; - if (++pageCount % 1000 == 0) { - System.out.println("endPage: " + title + ", count=" + pageCount); + // The matches the whole line, otherwise regexes don't work well on French: + // {{=uk=}} + // Spanish has no initial headings, tried to also detect {{ES as such + // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English. + static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + + final Map> pathToSelectors = new LinkedHashMap>(); + List currentSelectors = null; + + StringBuilder titleBuilder; + StringBuilder textBuilder; + StringBuilder currentBuilder = null; + + public static void main(final String[] args) throws Exception { + final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); + wiktionarySplitter.go(); } - if (title.startsWith("Wiktionary:") || - title.startsWith("Appendix:") || - title.startsWith("Help:") || - title.startsWith("Index:") || - title.startsWith("MediaWiki:") || - title.startsWith("Citations:") || - title.startsWith("Concordance:") || - title.startsWith("Glossary:") || - title.startsWith("Rhymes:") || - title.startsWith("Category:") || - title.startsWith("Wikisaurus:") || - title.startsWith("Unsupported titles/") || - title.startsWith("Transwiki:") || - title.startsWith("File:") || - title.startsWith("Thread:") || - title.startsWith("Template:") || - title.startsWith("Summary:") || - title.startsWith("Module:") || - // DE - title.startsWith("Datei:") || - title.startsWith("Verzeichnis:") || - title.startsWith("Vorlage:") || - title.startsWith("Thesaurus:") || - title.startsWith("Kategorie:") || - title.startsWith("Hilfe:") || - title.startsWith("Reim:") || - // FR: - title.startsWith("Annexe:") || - title.startsWith("Catégori:") || - title.startsWith("Modèle:") || - title.startsWith("Thésaurus:") || - title.startsWith("Projet:") || - title.startsWith("Aide:") || - title.startsWith("Fichier:") || - title.startsWith("Wiktionnaire:") || - title.startsWith("Catégorie:") || - title.startsWith("Portail:") || - title.startsWith("utiliusateur:") || - title.startsWith("Kategorio:") || - // IT - title.startsWith("Wikizionario:") || - title.startsWith("Appendice:") || - title.startsWith("Categoria:") || - title.startsWith("Aiuto:") || - title.startsWith("Portail:") || - // ES - title.startsWith("Apéndice:") || - title.startsWith("Archivo:") || - title.startsWith("Ayuda:") || - title.startsWith("Categoría:") || - title.startsWith("Plantilla:") || - title.startsWith("Wikcionario:") || - - // sentinel - false - ) { - return; + + private WiktionarySplitter() { + List selectors; + for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { + //if (!code.equals("fr")) {continue;} + selectors = new ArrayList(); + pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); + for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { + final String dir = String.format("data/inputs/wikiSplit/%s", code); + new File(dir).mkdirs(); + selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); + } + } } - if (title.contains(":")) { - if (!title.startsWith("Sign gloss:")) { - System.err.println("title with colon: " + title); + + private void go() throws Exception { + final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + + // Configure things. + for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { + + currentSelectors = pathToSelectorsEntry.getValue(); + + for (final Selector selector : currentSelectors) { + selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); + } + + // Do it. + try { + parser.parse(new File(pathToSelectorsEntry.getKey()), this); + } catch (Exception e) { + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); + throw e; + } + + // Shutdown. + for (final Selector selector : currentSelectors) { + selector.out.close(); + } + } } - - String text = textBuilder.toString(); - String translingual = ""; - - while (text.length() > 0) { - // Find start. - final Matcher startMatcher = headingStart.matcher(text); - if (!startMatcher.find()) { - return; - } - text = text.substring(startMatcher.end()); - - final String heading = startMatcher.group(); - for (final Selector selector : currentSelectors) { - if (heading.indexOf("Translingual") != -1) { - // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); - - final Matcher endMatcher = endPattern.matcher(text); - if (endMatcher.find()) { - int end = endMatcher.start(); - translingual = text.substring(0, endMatcher.start()); - text = text.substring(end); - break; - } + + String lastPageTitle = null; + int pageCount = 0; + private void endPage() { + final String title = titleBuilder.toString(); + lastPageTitle = title; + if (++pageCount % 1000 == 0) { + System.out.println("endPage: " + title + ", count=" + pageCount); + } + if (title.startsWith("Wiktionary:") || + title.startsWith("Appendix:") || + title.startsWith("Help:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Glossary:") || + title.startsWith("Rhymes:") || + title.startsWith("Category:") || + title.startsWith("Wikisaurus:") || + title.startsWith("Unsupported titles/") || + title.startsWith("Transwiki:") || + title.startsWith("File:") || + title.startsWith("Thread:") || + title.startsWith("Template:") || + title.startsWith("Summary:") || + title.startsWith("Module:") || + // DE + title.startsWith("Datei:") || + title.startsWith("Verzeichnis:") || + title.startsWith("Vorlage:") || + title.startsWith("Thesaurus:") || + title.startsWith("Kategorie:") || + title.startsWith("Hilfe:") || + title.startsWith("Reim:") || + // FR: + title.startsWith("Annexe:") || + title.startsWith("Catégori:") || + title.startsWith("Modèle:") || + title.startsWith("Thésaurus:") || + title.startsWith("Projet:") || + title.startsWith("Aide:") || + title.startsWith("Fichier:") || + title.startsWith("Wiktionnaire:") || + title.startsWith("Catégorie:") || + title.startsWith("Portail:") || + title.startsWith("utiliusateur:") || + title.startsWith("Kategorio:") || + // IT + title.startsWith("Wikizionario:") || + title.startsWith("Appendice:") || + title.startsWith("Categoria:") || + title.startsWith("Aiuto:") || + title.startsWith("Portail:") || + // ES + title.startsWith("Apéndice:") || + title.startsWith("Archivo:") || + title.startsWith("Ayuda:") || + title.startsWith("Categoría:") || + title.startsWith("Plantilla:") || + title.startsWith("Wikcionario:") || + + // sentinel + false + ) { + return; } - if (selector.pattern.matcher(heading).find()) { - - // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); - - final Matcher endMatcher = endPattern.matcher(text); - final int end; - if (endMatcher.find()) { - end = endMatcher.start(); - } else { - end = text.length(); - } - - String sectionText = text.substring(0, end); - // Hack to remove empty dummy section from French - if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) - { - int dummy_end = sectionText.indexOf("}}", 41) + 2; - while (dummy_end + 1 < sectionText.length() && - sectionText.charAt(dummy_end) == '\n' && - sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end; - sectionText = sectionText.substring(dummy_end); - } - if (heading.indexOf("Japanese") == -1) sectionText += translingual; - final Section section = new Section(title, heading, sectionText); - - try { - selector.out.writeUTF(section.title); - selector.out.writeUTF(section.heading); - final byte[] bytes = section.text.getBytes("UTF8"); - selector.out.writeInt(bytes.length); - selector.out.write(bytes); - } catch (IOException e) { - throw new RuntimeException(e); - } - - text = text.substring(end); - break; + if (title.contains(":")) { + if (!title.startsWith("Sign gloss:")) { + System.err.println("title with colon: " + title); + } } - } + + String text = textBuilder.toString(); + String translingual = ""; + + while (text.length() > 0) { + // Find start. + final Matcher startMatcher = headingStart.matcher(text); + if (!startMatcher.find()) { + return; + } + text = text.substring(startMatcher.end()); + + final String heading = startMatcher.group(); + for (final Selector selector : currentSelectors) { + if (heading.indexOf("Translingual") != -1) { + // Find end. + final int depth = startMatcher.group(1).length(); + final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + + final Matcher endMatcher = endPattern.matcher(text); + if (endMatcher.find()) { + int end = endMatcher.start(); + translingual = text.substring(0, endMatcher.start()); + text = text.substring(end); + break; + } + } + if (selector.pattern.matcher(heading).find()) { + + // Find end. + final int depth = startMatcher.group(1).length(); + final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + + final Matcher endMatcher = endPattern.matcher(text); + final int end; + if (endMatcher.find()) { + end = endMatcher.start(); + } else { + end = text.length(); + } + + String sectionText = text.substring(0, end); + // Hack to remove empty dummy section from French + if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) { + int dummy_end = sectionText.indexOf("}}", 41) + 2; + while (dummy_end + 1 < sectionText.length() && + sectionText.charAt(dummy_end) == '\n' && + sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end; + sectionText = sectionText.substring(dummy_end); + } + if (heading.indexOf("Japanese") == -1) sectionText += translingual; + final Section section = new Section(title, heading, sectionText); + + try { + selector.out.writeUTF(section.title); + selector.out.writeUTF(section.heading); + final byte[] bytes = section.text.getBytes("UTF8"); + selector.out.writeInt(bytes.length); + selector.out.write(bytes); + } catch (IOException e) { + throw new RuntimeException(e); + } + + text = text.substring(end); + break; + } + } + } + } - - } - - // ----------------------------------------------------------------------- - - static class Section implements java.io.Serializable { - private static final long serialVersionUID = -7676549898325856822L; - - final String title; - final String heading; - final String text; - - public Section(final String title, final String heading, final String text) { - this.title = title; - this.heading = heading; - this.text = text; - - //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); + + // ----------------------------------------------------------------------- + + static class Section implements java.io.Serializable { + private static final long serialVersionUID = -7676549898325856822L; + + final String title; + final String heading; + final String text; + + public Section(final String title, final String heading, final String text) { + this.title = title; + this.heading = heading; + this.text = text; + + //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); + } } - } - - static class Selector { - final String outFilename; - final Pattern pattern; - DataOutputStream out; + static class Selector { + final String outFilename; + final Pattern pattern; + + DataOutputStream out; - public Selector(final String filename, final String pattern) { - this.outFilename = filename; - this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + public Selector(final String filename, final String pattern) { + this.outFilename = filename; + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + } } - } - // ----------------------------------------------------------------------- - + // ----------------------------------------------------------------------- + @Override public void startElement(String uri, String localName, String qName, - Attributes attributes) { - currentBuilder = null; - if ("page".equals(qName)) { - titleBuilder = new StringBuilder(); - - // Start with "\n" to better match certain strings. - textBuilder = new StringBuilder("\n"); - } else if ("title".equals(qName)) { - currentBuilder = titleBuilder; - } else if ("text".equals(qName)) { - currentBuilder = textBuilder; - } + Attributes attributes) { + currentBuilder = null; + if ("page".equals(qName)) { + titleBuilder = new StringBuilder(); + + // Start with "\n" to better match certain strings. + textBuilder = new StringBuilder("\n"); + } else if ("title".equals(qName)) { + currentBuilder = titleBuilder; + } else if ("text".equals(qName)) { + currentBuilder = textBuilder; + } } @Override public void characters(char[] ch, int start, int length) throws SAXException { - if (currentBuilder != null) { - currentBuilder.append(ch, start, length); - } + if (currentBuilder != null) { + currentBuilder.append(ch, start, length); + } } @Override public void endElement(String uri, String localName, String qName) - throws SAXException { - currentBuilder = null; - if ("page".equals(qName)) { - endPage(); - } + throws SAXException { + currentBuilder = null; + if ("page".equals(qName)) { + endPage(); + } } - + public void parse(final File file) throws ParserConfigurationException, SAXException, IOException { - final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); - parser.parse(file, this); + final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + parser.parse(file, this); } - + } diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 8015f9a..07d0775 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -39,241 +39,241 @@ import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; public class DictFileParser implements Parser { - - static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); - - // Dictcc - public static final Pattern TAB = Pattern.compile("\\t"); - - // Chemnitz - public static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); - public static final Pattern PIPE = Pattern.compile("\\|"); - - static final Pattern SPACES = Pattern.compile("\\s+"); - - static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]"); - static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); - static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); - - // http://www.regular-expressions.info/unicode.html - static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+"); - public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+"); - - static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$"); - - final Charset charset; - final boolean flipCols; - - final Pattern fieldSplit; - final Pattern subfieldSplit; - - final DictionaryBuilder dictBuilder; - final IndexBuilder[] langIndexBuilders; - final IndexBuilder bothIndexBuilder; - - EntrySource entrySource; - - // final Set alreadyDone = new HashSet(); - - public DictFileParser(final Charset charset, boolean flipCols, - final Pattern fieldSplit, final Pattern subfieldSplit, - final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders, - final IndexBuilder bothIndexBuilder) { - this.charset = charset; - this.flipCols = flipCols; - this.fieldSplit = fieldSplit; - this.subfieldSplit = subfieldSplit; - this.dictBuilder = dictBuilder; - this.langIndexBuilders = langIndexBuilders; - this.bothIndexBuilder = bothIndexBuilder; - } - - @Override - public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { - this.entrySource = entrySouce; - final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); - String line; - int count = 0; - while ((line = reader.readLine()) != null) { - if (pageLimit >= 0 && count >= pageLimit) { - return; - } - if (count % 10000 == 0) { - logger.info("count=" + count + ", line=" + line); - } - parseLine(line); - ++count; - } - } - - private void parseLine(final String line) { - if (line.startsWith("#") || line.length() == 0) { - logger.info("Skipping comment line: " + line); - return; - } - final String[] fields = fieldSplit.split(line); - // dictcc now has a part of speech field as field #3. - if (fields.length < 2 || fields.length > 3) { - logger.warning("Malformed line: " + line); - return; - } - - fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim(); - fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim(); - if (flipCols) { - final String temp = fields[0]; - fields[0] = fields[1]; - fields[1] = temp; - } - final String[][] subfields = new String[2][]; - if (subfieldSplit != null) { - subfields[0] = subfieldSplit.split(fields[0]); - subfields[1] = subfieldSplit.split(fields[1]); - if (subfields[0].length != subfields[1].length) { - logger.warning("Number of subfields doesn't match: " + line); - return; - } - } else { - subfields[0] = new String[] { fields[0] }; - subfields[1] = new String[] { fields[1] }; - } - - final PairEntry pairEntry = new PairEntry(entrySource); - for (int i = 0; i < subfields[0].length; ++i) { - subfields[0][i] = subfields[0][i].trim(); - subfields[1][i] = subfields[1][i].trim(); - if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { - logger.warning("Empty pair: " + line); - continue; - } - if (subfields[0][i].length() == 0) { - subfields[0][i] = "__"; - } - if (subfields[1][i].length() == 0) { - subfields[1][i] = "__"; - } - pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); + static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); + + // Dictcc + public static final Pattern TAB = Pattern.compile("\\t"); + + // Chemnitz + public static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); + public static final Pattern PIPE = Pattern.compile("\\|"); + + static final Pattern SPACES = Pattern.compile("\\s+"); + + static final Pattern BRACKETED = Pattern.compile("\\[([^]]+)\\]"); + static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); + static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); + + // http://www.regular-expressions.info/unicode.html + static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+"); + public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+"); + + static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}\\p{M}\\p{N}]+|[^\\p{L}\\p{M}\\p{N}]+$"); + + final Charset charset; + final boolean flipCols; + + final Pattern fieldSplit; + final Pattern subfieldSplit; + + final DictionaryBuilder dictBuilder; + final IndexBuilder[] langIndexBuilders; + final IndexBuilder bothIndexBuilder; + + EntrySource entrySource; + + // final Set alreadyDone = new HashSet(); + + public DictFileParser(final Charset charset, boolean flipCols, + final Pattern fieldSplit, final Pattern subfieldSplit, + final DictionaryBuilder dictBuilder, final IndexBuilder[] langIndexBuilders, + final IndexBuilder bothIndexBuilder) { + this.charset = charset; + this.flipCols = flipCols; + this.fieldSplit = fieldSplit; + this.subfieldSplit = subfieldSplit; + this.dictBuilder = dictBuilder; + this.langIndexBuilders = langIndexBuilders; + this.bothIndexBuilder = bothIndexBuilder; } - final IndexedEntry entryData = new IndexedEntry(pairEntry); - entryData.isValid = true; - - for (int l = 0; l < 2; ++l) { - // alreadyDone.clear(); - - for (int j = 0; j < subfields[l].length; ++j) { - String subfield = subfields[l][j]; - final IndexBuilder indexBuilder = langIndexBuilders[l]; - if (indexBuilder.index.sortLanguage == Language.de) { - subfield = parseField_DE(indexBuilder, subfield, entryData, j); - } else if (indexBuilder.index.sortLanguage == Language.en) { - subfield = parseField_EN(indexBuilder, subfield, entryData, j); + + @Override + public void parse(final File file, final EntrySource entrySouce, final int pageLimit) throws IOException { + this.entrySource = entrySouce; + final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); + String line; + int count = 0; + while ((line = reader.readLine()) != null) { + if (pageLimit >= 0 && count >= pageLimit) { + return; + } + if (count % 10000 == 0) { + logger.info("count=" + count + ", line=" + line); + } + parseLine(line); + ++count; } - parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length); - } - } - } - - private void parseFieldGeneric(final IndexBuilder indexBuilder, String field, - final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) { - // remove bracketed and parenthesized stuff. - final StringBuilder bracketed = new StringBuilder(); - final StringBuilder parenthesized = new StringBuilder(); - - Matcher matcher; - while ((matcher = BRACKETED.matcher(field)).find()) { - bracketed.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); } - while ((matcher = PARENTHESIZED.matcher(field)).find()) { - parenthesized.append(matcher.group(1)).append(" "); - field = matcher.replaceFirst(" "); - } - - field = SPACES.matcher(field).replaceAll(" ").trim(); - - // split words on non -A-z0-9, do them. - final String[] tokens = NON_CHAR_DASH.split(field); - - final EntryTypeName entryTypeName; - if (numSubFields == 1) { - assert subfieldIdx == 0; - if (tokens.length == 1) { - entryTypeName = EntryTypeName.ONE_WORD; - } else if (tokens.length == 2) { - entryTypeName = EntryTypeName.TWO_WORDS; - } else if (tokens.length == 3) { - entryTypeName = EntryTypeName.THREE_WORDS; - } else if (tokens.length == 4) { - entryTypeName = EntryTypeName.FOUR_WORDS; - } else { - entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS; - } - } else { - assert numSubFields > 1; - if (subfieldIdx == 0) { - if (tokens.length == 1) { - entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD; - } else { - entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS; + private void parseLine(final String line) { + if (line.startsWith("#") || line.length() == 0) { + logger.info("Skipping comment line: " + line); + return; + } + final String[] fields = fieldSplit.split(line); + // dictcc now has a part of speech field as field #3. + if (fields.length < 2 || fields.length > 3) { + logger.warning("Malformed line: " + line); + return; } - } else { - assert subfieldIdx > 0; - if (tokens.length == 1) { - entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD; + + fields[0] = SPACES.matcher(fields[0]).replaceAll(" ").trim(); + fields[1] = SPACES.matcher(fields[1]).replaceAll(" ").trim(); + if (flipCols) { + final String temp = fields[0]; + fields[0] = fields[1]; + fields[1] = temp; + } + + final String[][] subfields = new String[2][]; + if (subfieldSplit != null) { + subfields[0] = subfieldSplit.split(fields[0]); + subfields[1] = subfieldSplit.split(fields[1]); + if (subfields[0].length != subfields[1].length) { + logger.warning("Number of subfields doesn't match: " + line); + return; + } } else { - entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS; + subfields[0] = new String[] { fields[0] }; + subfields[1] = new String[] { fields[1] }; } - } - } - for (String token : tokens) { - token = TRIM_PUNC.matcher(token).replaceAll(""); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName); - // alreadyDone.add(token); - - // also split words on dashes, do them, too. - if (token.contains("-")) { - final String[] dashed = token.split("-"); - for (final String dashedToken : dashed) { - if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED); + final PairEntry pairEntry = new PairEntry(entrySource); + for (int i = 0; i < subfields[0].length; ++i) { + subfields[0][i] = subfields[0][i].trim(); + subfields[1][i] = subfields[1][i].trim(); + if (subfields[0][i].length() == 0 && subfields[1][i].length() == 0) { + logger.warning("Empty pair: " + line); + continue; + } + if (subfields[0][i].length() == 0) { + subfields[0][i] = "__"; + } + if (subfields[1][i].length() == 0) { + subfields[1][i] = "__"; } - } + pairEntry.pairs.add(new Pair(subfields[0][i], subfields[1][i])); } + final IndexedEntry entryData = new IndexedEntry(pairEntry); + entryData.isValid = true; - } // if (!alreadyDone.contains(token)) { - } // for (final String token : tokens) { - - // process bracketed stuff (split on spaces and dashes always) - final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); - for (final String token : bracketedTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); - } + for (int l = 0; l < 2; ++l) { + // alreadyDone.clear(); + + for (int j = 0; j < subfields[l].length; ++j) { + String subfield = subfields[l][j]; + final IndexBuilder indexBuilder = langIndexBuilders[l]; + if (indexBuilder.index.sortLanguage == Language.de) { + subfield = parseField_DE(indexBuilder, subfield, entryData, j); + } else if (indexBuilder.index.sortLanguage == Language.en) { + subfield = parseField_EN(indexBuilder, subfield, entryData, j); + } + parseFieldGeneric(indexBuilder, subfield, entryData, j, subfields[l].length); + } + } } - - // process paren stuff - final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); - for (final String token : parenTokens) { - assert !token.contains("-"); - if (/*!alreadyDone.contains(token) && */token.length() > 0) { - indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); - } + + private void parseFieldGeneric(final IndexBuilder indexBuilder, String field, + final IndexedEntry entryData, final int subfieldIdx, final int numSubFields) { + // remove bracketed and parenthesized stuff. + final StringBuilder bracketed = new StringBuilder(); + final StringBuilder parenthesized = new StringBuilder(); + + Matcher matcher; + while ((matcher = BRACKETED.matcher(field)).find()) { + bracketed.append(matcher.group(1)).append(" "); + field = matcher.replaceFirst(" "); + } + + while ((matcher = PARENTHESIZED.matcher(field)).find()) { + parenthesized.append(matcher.group(1)).append(" "); + field = matcher.replaceFirst(" "); + } + + field = SPACES.matcher(field).replaceAll(" ").trim(); + + // split words on non -A-z0-9, do them. + final String[] tokens = NON_CHAR_DASH.split(field); + + final EntryTypeName entryTypeName; + if (numSubFields == 1) { + assert subfieldIdx == 0; + if (tokens.length == 1) { + entryTypeName = EntryTypeName.ONE_WORD; + } else if (tokens.length == 2) { + entryTypeName = EntryTypeName.TWO_WORDS; + } else if (tokens.length == 3) { + entryTypeName = EntryTypeName.THREE_WORDS; + } else if (tokens.length == 4) { + entryTypeName = EntryTypeName.FOUR_WORDS; + } else { + entryTypeName = EntryTypeName.FIVE_OR_MORE_WORDS; + } + } else { + assert numSubFields > 1; + if (subfieldIdx == 0) { + if (tokens.length == 1) { + entryTypeName = EntryTypeName.MULTIROW_HEAD_ONE_WORD; + } else { + entryTypeName = EntryTypeName.MULTIROW_HEAD_MANY_WORDS; + } + } else { + assert subfieldIdx > 0; + if (tokens.length == 1) { + entryTypeName = EntryTypeName.MULTIROW_TAIL_ONE_WORD; + } else { + entryTypeName = EntryTypeName.MULTIROW_TAIL_MANY_WORDS; + } + } + } + + for (String token : tokens) { + token = TRIM_PUNC.matcher(token).replaceAll(""); + if (/*!alreadyDone.contains(token) && */token.length() > 0) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), entryTypeName); + // alreadyDone.add(token); + + // also split words on dashes, do them, too. + if (token.contains("-")) { + final String[] dashed = token.split("-"); + for (final String dashedToken : dashed) { + if (/*!alreadyDone.contains(dashedToken) && */dashedToken.length() > 0) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(dashedToken), EntryTypeName.PART_OF_HYPHENATED); + } + } + } + + } // if (!alreadyDone.contains(token)) { + } // for (final String token : tokens) { + + // process bracketed stuff (split on spaces and dashes always) + final String[] bracketedTokens = NON_CHAR.split(bracketed.toString()); + for (final String token : bracketedTokens) { + assert !token.contains("-"); + if (/*!alreadyDone.contains(token) && */token.length() > 0) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.BRACKETED); + } + } + + // process paren stuff + final String[] parenTokens = NON_CHAR.split(parenthesized.toString()); + for (final String token : parenTokens) { + assert !token.contains("-"); + if (/*!alreadyDone.contains(token) && */token.length() > 0) { + indexBuilder.addEntryWithTokens(entryData, Collections.singleton(token), EntryTypeName.PARENTHESIZED); + } + } + } - - } - private String parseField_DE(final IndexBuilder indexBuilder, String field, - final IndexedEntry entryData, final int subfieldIdx) { - + private String parseField_DE(final IndexBuilder indexBuilder, String field, + final IndexedEntry entryData, final int subfieldIdx) { + // final Matcher matcher = DE_NOUN.matcher(field); // while (matcher.find()) { // final String noun = matcher.group(1); - //final String gender = matcher.group(2); + //final String gender = matcher.group(2); // if (alreadyDone.add(noun)) { // System.out.println("Found DE noun " + noun + ", " + gender); // final List entries = indexBuilder.getOrCreateEntries(noun, EntryTypeName.NOUN); @@ -281,26 +281,26 @@ public class DictFileParser implements Parser { // } // } - // In English, curly braces are used for different tenses. - field = CURLY_BRACED.matcher(field).replaceAll(" "); + // In English, curly braces are used for different tenses. + field = CURLY_BRACED.matcher(field).replaceAll(" "); + + return field; + } + + private String parseField_EN(final IndexBuilder indexBuilder, String field, + final IndexedEntry entryData, final int subfieldIdx) { + if (field.startsWith("to ")) { + field = field.substring(3); + } + return field; + } - return field; - } - - private String parseField_EN(final IndexBuilder indexBuilder, String field, - final IndexedEntry entryData, final int subfieldIdx) { - if (field.startsWith("to ")) { - field = field.substring(3); + public static final Set tokenize(final String text, final Pattern pattern) { + final String[] split = pattern.split(text); + final Set result = new LinkedHashSet(Arrays.asList(split)); + result.remove(""); + return result; } - return field; - } - - public static final Set tokenize(final String text, final Pattern pattern) { - final String[] split = pattern.split(text); - final Set result = new LinkedHashSet(Arrays.asList(split)); - result.remove(""); - return result; - } } diff --git a/src/com/hughes/android/dictionary/parser/Parser.java b/src/com/hughes/android/dictionary/parser/Parser.java index b0f2e96..969796d 100644 --- a/src/com/hughes/android/dictionary/parser/Parser.java +++ b/src/com/hughes/android/dictionary/parser/Parser.java @@ -20,7 +20,7 @@ import java.io.IOException; import com.hughes.android.dictionary.engine.EntrySource; public interface Parser { - - void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException; + + void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException; } diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index d6c8901..8cf882e 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -22,625 +22,625 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { - - public static interface Callback { - void onPlainText(final String text); - void onMarkup(WikiTokenizer wikiTokenizer); - void onWikiLink(WikiTokenizer wikiTokenizer); - void onNewline(WikiTokenizer wikiTokenizer); - void onFunction(final WikiTokenizer tokenizer, String functionName, List functionPositionArgs, - Map functionNamedArgs); - void onHeading(WikiTokenizer wikiTokenizer); - void onListItem(WikiTokenizer wikiTokenizer); - void onComment(WikiTokenizer wikiTokenizer); - void onHtml(WikiTokenizer wikiTokenizer); - } - - public static class DoNothingCallback implements Callback { - - @Override - public void onPlainText(String text) { - } - - @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onNewline(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onFunction(WikiTokenizer tokenizer, String functionName, - List functionPositionArgs, Map functionNamedArgs) { - } - - @Override - public void onHeading(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onComment(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onHtml(WikiTokenizer wikiTokenizer) { - } - } - - //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(" + - "\\{\\{|\\}\\}|" + - "\\[\\[|\\]\\]|" + - "\\||" + // Need the | because we might have to find unescaped pipes - "=|" + // Need the = because we might have to find unescaped = - "", "\n"); + return this; + } + + if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { + errors.add("Close without open!"); + end += 2; + return this; + } + + if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { + isPlainText = true; + ++end; + return this; + } + + + if (this.matcher.find(start)) { + end = this.matcher.start(1); + isPlainText = true; + if (end == start) { + errors.add("Empty group: " + this.matcher.group()); + assert false; + } + return this; + } + + end = wikiText.length(); + return this; + + } finally { + if (!errors.isEmpty()) { + System.err.println("Errors: " + errors + ", token=" + token()); + } } - } - } - } - - public List errors() { - return errors; - } - - public boolean isNewline() { - return justReturnedNewline; - } - - public void returnToLineStart() { - end = start = lastLineStart; - justReturnedNewline = true; - } - - public boolean isHeading() { - return headingWikiText != null; - } - - public String headingWikiText() { - assert isHeading(); - return headingWikiText; - } - - public int headingDepth() { - assert isHeading(); - return headingDepth; - } - - public boolean isMarkup() { - return isMarkup; - } - - public boolean isComment() { - return isComment; - } - - public boolean isListItem() { - return listPrefixEnd != -1; - } - - public String listItemPrefix() { - assert isListItem(); - return wikiText.substring(start, listPrefixEnd); - } - - public static String getListTag(char c) { - if (c == '#') { - return "ol"; - } - return "ul"; - } - - public String listItemWikiText() { - assert isListItem(); - return wikiText.substring(listPrefixEnd, end); - } - - public boolean isFunction() { - return isFunction; - } - - public String functionName() { - assert isFunction(); - // "{{.." - if (firstUnescapedPipePos != -1) { - return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim()); - } - final int safeEnd = Math.max(start + 2, end - 2); - return trimNewlines(wikiText.substring(start + 2, safeEnd).trim()); - } - - public List functionPositionArgs() { - return positionArgs; - } - - public Map functionNamedArgs() { - return namedArgs; - } - - public boolean isPlainText() { - return isPlainText; - } - - public boolean isWikiLink() { - return isWikiLink; - } - - public String wikiLinkText() { - assert isWikiLink(); - // "[[.." - if (lastUnescapedPipePos != -1) { - return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2)); - } - assert start + 2 < wikiText.length() && end >= 2: wikiText; - return trimNewlines(wikiText.substring(start + 2, end - 2)); - } - - public String wikiLinkDest() { - assert isWikiLink(); - // "[[.." - if (firstUnescapedPipePos != -1) { - return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos)); - } - return null; - } - - public boolean isHtml() { - return isHtml; - } - - public boolean remainderStartsWith(final String prefix) { - return wikiText.startsWith(prefix, start); - } - - public void nextLine() { - final int oldStart = start; - while(nextToken() != null && !isNewline()) {} - if (isNewline()) { - --end; - } - start = oldStart; - } - - - public WikiTokenizer nextToken() { - this.clear(); - - start = end; - - if (justReturnedNewline) { - lastLineStart = start; - } - - try { - - final int len = wikiText.length(); - if (start >= len) { - return null; - } - - // Eat a newline if we're looking at one: - final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028'; - if (atNewline) { - justReturnedNewline = true; - ++end; - return this; - } - - if (justReturnedNewline) { - justReturnedNewline = false; - - final char firstChar = wikiText.charAt(end); - if (firstChar == '=') { - final int headerStart = end; - // Skip ===... - while (++end < len && wikiText.charAt(end) == '=') {} - final int headerTitleStart = end; - headingDepth = headerTitleStart - headerStart; - // Skip non-=... - if (end < len) { - final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n"); - final int closingEquals = escapedFindEnd(end, "="); - if (wikiText.charAt(closingEquals - 1) == '=') { - end = closingEquals - 1; - } else { - end = nextNewline; - } + + } + + public String token() { + final String token = wikiText.substring(start, end); + assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; + return token; + } + + final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "|", "=", ""); + if (end == -1) { + errors.add("Unmatched ", "\n"); - return this; - } - - if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { - errors.add("Close without open!"); - end += 2; - return this; - } - - if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { - isPlainText = true; - ++end; - return this; - } - - - if (this.matcher.find(start)) { - end = this.matcher.start(1); - isPlainText = true; - if (end == start) { - errors.add("Empty group: " + this.matcher.group()); - assert false; - } - return this; - } - - end = wikiText.length(); - return this; - - } finally { - if (!errors.isEmpty()) { - System.err.println("Errors: " + errors + ", token=" + token()); - } - } - - } - - public String token() { - final String token = wikiText.substring(start, end); - assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; - return token; - } - - final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "|", "=", ""); - if (end == -1) { - errors.add("Unmatched '''pretty''' cool '''''over''''' there." + "\n" + - "hi " + "\n" + - "" + "\n" + - "asdf\n" + - "{{template_not_in_list}}" + "\n" + - "# {{template_in_list}}" + "\n" + - "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list - ": but this is a list!" + "\n" + - "*:* and so is this :::" + "\n" + - "here's [[some blah|some]] wikitext." + "\n" + - "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" + - "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" + - "== Header 2 ==" + "\n" + - "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" + - "{{mismatched]]" + "\n" + - "[[mismatched}}" + "\n" + - "{extraterminated}}" + "\n" + - "[extraterminated]]" + "\n" + - "=== {{header-template}} ===" + "\n"; - - final String[] expectedTokens = new String[] { - "Hi", - "\n", - "Hello ", - "=", - "thad", - "|", - " you're ", - "", - " ", - "'''", - "pretty", - "'''", - " cool ", - "'''", - "''", - "over", - "'''", - "''", - " there.", - "\n", - "hi ", - "", - "\n", - "\n", - "asdf", - "\n", - "{{template_not_in_list}}", - "\n", - "# {{template_in_list}}", - "\n", - "[[wikitext]]", - ":", - "[[wikitext]]", - "\n", - ": but this is a list!", - "\n", - "*:* and so is this :::", - "\n", - "here's ", - "[[some blah|some]]", - " wikitext.", - "\n", - "here's a ", - "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}", - " and some more text.", - "\n", - "== Header 2 ==", - "\n", - "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}", - "\n", - "{{mismatched]]", - "\n", - "[[mismatched}}", - "\n", - "{extraterminated", - "}}", - "\n", - "[extraterminated", - "]]", - "\n", - "=== {{header-template}} ===", - "\n", + public void testReturn() { + String wikiText; + + wikiText = "hello\n=Heading=\nhello2"; + + final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); + + assertEquals("hello", tokenizer.nextToken().token()); + tokenizer.returnToLineStart(); + assertEquals("hello", tokenizer.nextToken().token()); + assertEquals("\n", tokenizer.nextToken().token()); + tokenizer.returnToLineStart(); + assertEquals("hello", tokenizer.nextToken().token()); + assertEquals("\n", tokenizer.nextToken().token()); + + assertEquals("=Heading=", tokenizer.nextToken().token()); + tokenizer.returnToLineStart(); + assertEquals("=Heading=", tokenizer.nextToken().token()); + assertEquals("\n", tokenizer.nextToken().token()); + tokenizer.returnToLineStart(); + assertEquals("=Heading=", tokenizer.nextToken().token()); + assertEquals("\n", tokenizer.nextToken().token()); + + assertEquals("hello2", tokenizer.nextToken().token()); + assertEquals(null, tokenizer.nextToken()); + tokenizer.returnToLineStart(); + assertEquals("hello2", tokenizer.nextToken().token()); + assertEquals(null, tokenizer.nextToken()); + + + } + + public void testWikiHeading() { + String wikiText; + + wikiText = "=="; + assertEquals("==", new WikiTokenizer(wikiText).nextToken().token()); + assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading()); + assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth()); + assertEquals("", new WikiTokenizer(wikiText).nextToken().headingWikiText()); + assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size()); + + + wikiText = "=a"; + assertEquals("=a", new WikiTokenizer(wikiText).nextToken().token()); + assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading()); + assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth()); + assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText()); + assertEquals(2, new WikiTokenizer(wikiText).nextToken().errors.size()); + + wikiText = "=a=="; + assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token()); + assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading()); + assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth()); + assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText()); + assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size()); + + wikiText = "a="; + assertEquals("a", new WikiTokenizer(wikiText).nextToken().token()); + assertFalse(new WikiTokenizer(wikiText).nextToken().isHeading()); + + wikiText = "=a="; + assertEquals("=a=", new WikiTokenizer(wikiText).nextToken().token()); + assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading()); + assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth()); + assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText()); + assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size()); + + wikiText = "==aa[[|=]] {{|={{=}} }}=="; + assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token()); + assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading()); + assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth()); + assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText()); + assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size()); + + } + + + + public void testSimple() { + final String wikiText = + "Hi" + "\n" + + "Hello =thad| you're '''pretty''' cool '''''over''''' there." + "\n" + + "hi " + "\n" + + "" + "\n" + + "asdf\n" + + "{{template_not_in_list}}" + "\n" + + "# {{template_in_list}}" + "\n" + + "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list + ": but this is a list!" + "\n" + + "*:* and so is this :::" + "\n" + + "here's [[some blah|some]] wikitext." + "\n" + + "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" + + "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" + + "== Header 2 ==" + "\n" + + "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" + + "{{mismatched]]" + "\n" + + "[[mismatched}}" + "\n" + + "{extraterminated}}" + "\n" + + "[extraterminated]]" + "\n" + + "=== {{header-template}} ===" + "\n"; + + final String[] expectedTokens = new String[] { + "Hi", + "\n", + "Hello ", + "=", + "thad", + "|", + " you're ", + "", + " ", + "'''", + "pretty", + "'''", + " cool ", + "'''", + "''", + "over", + "'''", + "''", + " there.", + "\n", + "hi ", + "", + "\n", + "\n", + "asdf", + "\n", + "{{template_not_in_list}}", + "\n", + "# {{template_in_list}}", + "\n", + "[[wikitext]]", + ":", + "[[wikitext]]", + "\n", + ": but this is a list!", + "\n", + "*:* and so is this :::", + "\n", + "here's ", + "[[some blah|some]]", + " wikitext.", + "\n", + "here's a ", + "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}", + " and some more text.", + "\n", + "== Header 2 ==", + "\n", + "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}", + "\n", + "{{mismatched]]", + "\n", + "[[mismatched}}", + "\n", + "{extraterminated", + "}}", + "\n", + "[extraterminated", + "]]", + "\n", + "=== {{header-template}} ===", + "\n", }; - - final List actualTokens = new ArrayList(); - - final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText); - WikiTokenizer token; - int i = 0; - while ((token = wikiTokenizer.nextToken()) != null) { - actualTokens.add(token.token()); - System.out.println("\"" + token.token().replace("\n", "\\n") + "\","); - assertEquals(expectedTokens[i++], token.token()); + + final List actualTokens = new ArrayList(); + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText); + WikiTokenizer token; + int i = 0; + while ((token = wikiTokenizer.nextToken()) != null) { + actualTokens.add(token.token()); + System.out.println("\"" + token.token().replace("\n", "\\n") + "\","); + assertEquals(expectedTokens[i++], token.token()); + } + assertEquals(Arrays.asList(expectedTokens), actualTokens); } - assertEquals(Arrays.asList(expectedTokens), actualTokens); - } - - public void testHtml() { - String wikiText; - - { - wikiText = " zz
 asdf 
ZZ 1234 XX "; - final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); - assertEquals(" zz ", tokenizer.nextToken().token()); - assertEquals("
 asdf 
", tokenizer.nextToken().token()); - assertEquals(" ZZ ", tokenizer.nextToken().token()); - assertEquals(" 1234 ", tokenizer.nextToken().token()); - assertEquals(" XX ", tokenizer.nextToken().token()); - } - { - wikiText = "\n 1234 "; - final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); - assertEquals(" 1234 ", tokenizer.nextToken().nextToken().token()); - } - - { - wikiText = "# z'' is the '''free''' variable in \"\\forall x\\exists y:xy=z\".''"; - final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); - assertEquals(wikiText, tokenizer.nextToken().token()); - } - - - } - + + public void testHtml() { + String wikiText; + + { + wikiText = " zz
 asdf 
ZZ 1234 XX "; + final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); + assertEquals(" zz ", tokenizer.nextToken().token()); + assertEquals("
 asdf 
", tokenizer.nextToken().token()); + assertEquals(" ZZ ", tokenizer.nextToken().token()); + assertEquals(" 1234 ", tokenizer.nextToken().token()); + assertEquals(" XX ", tokenizer.nextToken().token()); + } + { + wikiText = "\n 1234 "; + final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); + assertEquals(" 1234 ", tokenizer.nextToken().nextToken().token()); + } + + { + wikiText = "# z'' is the '''free''' variable in \"\\forall x\\exists y:xy=z\".''"; + final WikiTokenizer tokenizer = new WikiTokenizer(wikiText); + assertEquals(wikiText, tokenizer.nextToken().token()); + } + + + } + } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java index ea60658..b77c341 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java @@ -42,242 +42,242 @@ import com.hughes.util.EnumUtil; public abstract class AbstractWiktionaryParser implements Parser { - static final Logger LOG = Logger.getLogger("WiktionaryParser"); - - final SortedMap counters = new TreeMap(); - final Set pairsAdded = new LinkedHashSet(); - - public EntrySource entrySource; - public String title; - - - abstract void parseSection(final String heading, final String text); - - abstract void removeUselessArgs(final Map namedArgs); - - @Override - public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { - this.entrySource = entrySource; - int pageCount = 0; - final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); - try { - while (true) { - if (pageLimit >= 0 && pageCount >= pageLimit) { - return; - } - - try { - title = dis.readUTF(); - } catch (EOFException e) { - LOG.log(Level.INFO, "EOF reading split."); - dis.close(); - return; - } - final String heading = dis.readUTF(); - final int bytesLength = dis.readInt(); - final byte[] bytes = new byte[bytesLength]; - dis.readFully(bytes); - final String text = new String(bytes, "UTF8"); - - parseSection(heading, text); - - ++pageCount; - if (pageCount % 1000 == 0) { - LOG.info("pageCount=" + pageCount); - } - } - } finally { - dis.close(); - LOG.info("***COUNTERS***"); - for (final Map.Entry entry : counters.entrySet()) { - LOG.info(entry.getKey() + ": " + entry.getValue()); - } - } - } - - static final Pattern whitespace = Pattern.compile("\\s+"); - static String trim(final String s) { - return whitespace.matcher(s).replaceAll(" ").trim(); - } - - public void incrementCount(final String string) { - AtomicInteger counter = counters.get(string); - if (counter == null) { - counter = new AtomicInteger(); - counters.put(string, counter); - } - counter.incrementAndGet(); - } - - public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) { - assert false : token + ", title=" + title; - } - - - // ------------------------------------------------------------------------- - - static class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { - - final T parser; - StringBuilder builder; - IndexedEntry indexedEntry; - IndexBuilder indexBuilder; - final Map> functionCallbacks = new LinkedHashMap>(); - - boolean entryTypeNameSticks = false; - EntryTypeName entryTypeName = null; - - final Map langCodeToTCount = new LinkedHashMap(); - - final NameAndArgs nameAndArgs = new NameAndArgs(); - - public AppendAndIndexWikiCallback(final T parser) { - this.parser = parser; - } - - public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) { - this.builder = builder; - this.indexedEntry = indexedEntry; - this.indexBuilder = null; - entryTypeName = null; - entryTypeNameSticks = false; - } - - public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) { - final IndexBuilder oldIndexBuilder = this.indexBuilder; - final EntryTypeName oldEntryTypeName = this.entryTypeName; - this.indexBuilder = indexBuilder; - if (!entryTypeNameSticks) { - this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName); - } - if (entryTypeName == null) this.entryTypeName = null; - WikiTokenizer.dispatch(wikiText, false, this); - this.indexBuilder = oldIndexBuilder; - this.entryTypeName = oldEntryTypeName; - } - - public String dispatch(final String wikiText, final EntryTypeName entryTypeName) { - final int start = builder.length(); - dispatch(wikiText, this.indexBuilder, entryTypeName); - return builder.substring(start); - } + static final Logger LOG = Logger.getLogger("WiktionaryParser"); - @Override - public void onPlainText(final String plainText) { - // The only non-recursive callback. Just appends to the builder, and indexes. - builder.append(plainText); - if (indexBuilder != null && entryTypeName != null && indexedEntry != null) { - indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName); - } - } + final SortedMap counters = new TreeMap(); + final Set pairsAdded = new LinkedHashSet(); - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - final String text = wikiTokenizer.wikiLinkText(); - @SuppressWarnings("unused") - final String link = wikiTokenizer.wikiLinkDest(); - dispatch(text, entryTypeName); - } + public EntrySource entrySource; + public String title; - @Override - public void onFunction( - final WikiTokenizer wikiTokenizer, - final String name, - final List args, - final Map namedArgs) { - - FunctionCallback functionCallback = functionCallbacks.get(name); - if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) { - // Default function handling: - parser.removeUselessArgs(namedArgs); - final boolean single = args.isEmpty() && namedArgs.isEmpty(); - builder.append(single ? "{" : "{{"); - - final IndexBuilder oldIndexBuilder = indexBuilder; - indexBuilder = null; - nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this); - indexBuilder = oldIndexBuilder; - - builder.append(single ? "}" : "}}"); - } - } - - @Override - public void onHtml(WikiTokenizer wikiTokenizer) { - if (wikiTokenizer.token().startsWith("")) { - // Do nothing. - return; - } - // Unindexed for now. - builder.append(wikiTokenizer.token()); - } + + abstract void parseSection(final String heading, final String text); + + abstract void removeUselessArgs(final Map namedArgs); @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - // Do nothing. + public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { + this.entrySource = entrySource; + int pageCount = 0; + final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); + try { + while (true) { + if (pageLimit >= 0 && pageCount >= pageLimit) { + return; + } + + try { + title = dis.readUTF(); + } catch (EOFException e) { + LOG.log(Level.INFO, "EOF reading split."); + dis.close(); + return; + } + final String heading = dis.readUTF(); + final int bytesLength = dis.readInt(); + final byte[] bytes = new byte[bytesLength]; + dis.readFully(bytes); + final String text = new String(bytes, "UTF8"); + + parseSection(heading, text); + + ++pageCount; + if (pageCount % 1000 == 0) { + LOG.info("pageCount=" + pageCount); + } + } + } finally { + dis.close(); + LOG.info("***COUNTERS***"); + for (final Map.Entry entry : counters.entrySet()) { + LOG.info(entry.getKey() + ": " + entry.getValue()); + } + } } - @Override - public final void onComment(WikiTokenizer wikiTokenizer) { - // Do nothing. + static final Pattern whitespace = Pattern.compile("\\s+"); + static String trim(final String s) { + return whitespace.matcher(s).replaceAll(" ").trim(); } - @Override - public void onNewline(WikiTokenizer wikiTokenizer) { - assert false; + public void incrementCount(final String string) { + AtomicInteger counter = counters.get(string); + if (counter == null) { + counter = new AtomicInteger(); + counters.put(string, counter); + } + counter.incrementAndGet(); } - @Override - public void onHeading(WikiTokenizer wikiTokenizer) { - assert false; + public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) { + assert false : token + ", title=" + title; } - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - assert false; + + // ------------------------------------------------------------------------- + + static class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { + + final T parser; + StringBuilder builder; + IndexedEntry indexedEntry; + IndexBuilder indexBuilder; + final Map> functionCallbacks = new LinkedHashMap>(); + + boolean entryTypeNameSticks = false; + EntryTypeName entryTypeName = null; + + final Map langCodeToTCount = new LinkedHashMap(); + + final NameAndArgs nameAndArgs = new NameAndArgs(); + + public AppendAndIndexWikiCallback(final T parser) { + this.parser = parser; + } + + public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) { + this.builder = builder; + this.indexedEntry = indexedEntry; + this.indexBuilder = null; + entryTypeName = null; + entryTypeNameSticks = false; + } + + public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) { + final IndexBuilder oldIndexBuilder = this.indexBuilder; + final EntryTypeName oldEntryTypeName = this.entryTypeName; + this.indexBuilder = indexBuilder; + if (!entryTypeNameSticks) { + this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName); + } + if (entryTypeName == null) this.entryTypeName = null; + WikiTokenizer.dispatch(wikiText, false, this); + this.indexBuilder = oldIndexBuilder; + this.entryTypeName = oldEntryTypeName; + } + + public String dispatch(final String wikiText, final EntryTypeName entryTypeName) { + final int start = builder.length(); + dispatch(wikiText, this.indexBuilder, entryTypeName); + return builder.substring(start); + } + + @Override + public void onPlainText(final String plainText) { + // The only non-recursive callback. Just appends to the builder, and indexes. + builder.append(plainText); + if (indexBuilder != null && entryTypeName != null && indexedEntry != null) { + indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName); + } + } + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String text = wikiTokenizer.wikiLinkText(); + @SuppressWarnings("unused") + final String link = wikiTokenizer.wikiLinkDest(); + dispatch(text, entryTypeName); + } + + @Override + public void onFunction( + final WikiTokenizer wikiTokenizer, + final String name, + final List args, + final Map namedArgs) { + + FunctionCallback functionCallback = functionCallbacks.get(name); + if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) { + // Default function handling: + parser.removeUselessArgs(namedArgs); + final boolean single = args.isEmpty() && namedArgs.isEmpty(); + builder.append(single ? "{" : "{{"); + + final IndexBuilder oldIndexBuilder = indexBuilder; + indexBuilder = null; + nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this); + indexBuilder = oldIndexBuilder; + + builder.append(single ? "}" : "}}"); + } + } + + @Override + public void onHtml(WikiTokenizer wikiTokenizer) { + if (wikiTokenizer.token().startsWith("")) { + // Do nothing. + return; + } + // Unindexed for now. + builder.append(wikiTokenizer.token()); + } + + @Override + public void onMarkup(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public final void onComment(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public void onNewline(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onHeading(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + assert false; + } + } - } - - // -------------------------------------------------------------------- - - static final class NameAndArgs implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - if (name != null) { - appendAndIndexWikiCallback.dispatch(name, null); - } - for (int i = 0; i < args.size(); ++i) { - if (args.get(i).length() > 0) { - appendAndIndexWikiCallback.builder.append("|"); - appendAndIndexWikiCallback.dispatch(args.get(i), null, null); + // -------------------------------------------------------------------- + + static final class NameAndArgs implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + if (name != null) { + appendAndIndexWikiCallback.dispatch(name, null); + } + for (int i = 0; i < args.size(); ++i) { + if (args.get(i).length() > 0) { + appendAndIndexWikiCallback.builder.append("|"); + appendAndIndexWikiCallback.dispatch(args.get(i), null, null); + } + } + appendNamedArgs(namedArgs, appendAndIndexWikiCallback); + return true; } - } - appendNamedArgs(namedArgs, appendAndIndexWikiCallback); - return true; } - } - static NameAndArgs NAME_AND_ARGS = new NameAndArgs(); - - static void appendNamedArgs(final Map namedArgs, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - for (final Map.Entry entry : namedArgs.entrySet()) { - appendAndIndexWikiCallback.builder.append("|"); - appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null); - appendAndIndexWikiCallback.builder.append("="); - EntryTypeName entryTypeName = null; - IndexBuilder indexBuilder = null; - // This doesn't work: we'd need to add to word-forms. + static NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + + static void appendNamedArgs(final Map namedArgs, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + for (final Map.Entry entry : namedArgs.entrySet()) { + appendAndIndexWikiCallback.builder.append("|"); + appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null); + appendAndIndexWikiCallback.builder.append("="); + EntryTypeName entryTypeName = null; + IndexBuilder indexBuilder = null; + // This doesn't work: we'd need to add to word-forms. // System.out.println(entry.getKey()); // if (entry.getKey().equals("tr")) { // entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION; // indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder; // } - appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName); + appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName); + } } - } } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/DeFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/DeFunctionCallbacks.java index 9118446..8711191 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/DeFunctionCallbacks.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/DeFunctionCallbacks.java @@ -22,56 +22,56 @@ import java.util.List; import java.util.Map; class DeFunctionCallbacks { - - static void addGenericCallbacks(Map> callbacks) { - FunctionCallback callback = new MakeHeadingFromName("===="); - callbacks.put("Aussprache", callback); - callbacks.put("Worttrennung", callback); - callbacks.put("Bedeutungen", callback); - callbacks.put("Herkunft", callback); - callbacks.put("Synonyme", callback); - callbacks.put("Gegenwörter", callback); - callbacks.put("Verkleinerungsformen", callback); - callbacks.put("Oberbegriffe", callback); - callbacks.put("Unterbegriffe", callback); - callbacks.put("Beispiele", callback); - callbacks.put("Redewendungen", callback); - callbacks.put("Charakteristische Wortkombinationen", callback); - callbacks.put("Abgeleitete Begriffe", callback); - callbacks.put("Übersetzungen", callback); - callbacks.put("Referenzen", callback); - callbacks.put("Grammatische Merkmale", callback); - callbacks.put("Abkürzungen", callback); - - // TODO: - // {{Anmerkung}} - // {{Anmerkungen}} - // {{Anmerkung|zum Gebrauch}} - } - - static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + static void addGenericCallbacks(Map> callbacks) { + FunctionCallback callback = new MakeHeadingFromName("===="); + callbacks.put("Aussprache", callback); + callbacks.put("Worttrennung", callback); + callbacks.put("Bedeutungen", callback); + callbacks.put("Herkunft", callback); + callbacks.put("Synonyme", callback); + callbacks.put("Gegenwörter", callback); + callbacks.put("Verkleinerungsformen", callback); + callbacks.put("Oberbegriffe", callback); + callbacks.put("Unterbegriffe", callback); + callbacks.put("Beispiele", callback); + callbacks.put("Redewendungen", callback); + callbacks.put("Charakteristische Wortkombinationen", callback); + callbacks.put("Abgeleitete Begriffe", callback); + callbacks.put("Übersetzungen", callback); + callbacks.put("Referenzen", callback); + callbacks.put("Grammatische Merkmale", callback); + callbacks.put("Abkürzungen", callback); - - static final class MakeHeadingFromName implements FunctionCallback { - final String header; - public MakeHeadingFromName(String header) { - this.header = header; + // TODO: + // {{Anmerkung}} + // {{Anmerkungen}} + // {{Anmerkung|zum Gebrauch}} } - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!namedArgs.isEmpty() || args.size() != 0) { - return false; + + static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + + + static final class MakeHeadingFromName implements FunctionCallback { + final String header; + public MakeHeadingFromName(String header) { + this.header = header; + } + + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!namedArgs.isEmpty() || args.size() != 0) { + return false; + } + //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header)); + appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null); + //appendAndIndexWikiCallback.builder.append(String.format("\n", header)); + return true; } - //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header)); - appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null); - //appendAndIndexWikiCallback.builder.append(String.format("\n", header)); - return true; - } } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java index 7dd933e..670462f 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java @@ -30,312 +30,312 @@ import com.hughes.android.dictionary.parser.WikiTokenizer; public final class EnForeignParser extends EnParser { public EnForeignParser(final IndexBuilder enIndexBuilder, - final IndexBuilder otherIndexBuilder, final Pattern langPattern, - final Pattern langCodePattern, final boolean swap) { - super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); + final IndexBuilder otherIndexBuilder, final Pattern langPattern, + final Pattern langCodePattern, final boolean swap) { + super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); } @Override void parseSection(String heading, String text) { - if (isIgnorableTitle(title)) { - return; - } - final String lang = heading.replace("=", "").trim(); - if (!langPattern.matcher(lang).find()){ - return; - } - - final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - final String headingName = wikiTokenizer.headingWikiText(); - if (headingName.equals("Translations")) { - LOG.warning("Translations not in English section: " + title); - incrementCount("WARNING: Translations not in English section"); - } else if (headingName.equals("Pronunciation")) { - //doPronunciation(wikiLineReader); - } else if (headingName.startsWith(" {{S|")) { - // HACK to support parsing frwiktionary - String[] parts = headingName.split("\\|"); - if (parts.length > 2 && langCodePattern.matcher(parts[2]).find() && - (parts.length < 4 || !parts[3].startsWith("flexion"))) { - doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + if (isIgnorableTitle(title)) { + return; + } + final String lang = heading.replace("=", "").trim(); + if (!langPattern.matcher(lang).find()) { + return; + } + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + final String headingName = wikiTokenizer.headingWikiText(); + if (headingName.equals("Translations")) { + LOG.warning("Translations not in English section: " + title); + incrementCount("WARNING: Translations not in English section"); + } else if (headingName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } else if (headingName.startsWith(" {{S|")) { + // HACK to support parsing frwiktionary + String[] parts = headingName.split("\\|"); + if (parts.length > 2 && langCodePattern.matcher(parts[2]).find() && + (parts.length < 4 || !parts[3].startsWith("flexion"))) { + doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + } + } else if (partOfSpeechHeader.matcher(headingName).matches()) { + doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + } + } else { + // It's not a heading. + // TODO: optimization: skip to next heading. } - } else if (partOfSpeechHeader.matcher(headingName).matches()) { - doForeignPartOfSpeech(lang, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); - } - } else { - // It's not a heading. - // TODO: optimization: skip to next heading. } - } } - + static final class ListSection { - final String firstPrefix; - final String firstLine; - final List nextPrefixes = new ArrayList(); - final List nextLines = new ArrayList(); - - public ListSection(String firstPrefix, String firstLine) { - this.firstPrefix = firstPrefix; - this.firstLine = firstLine; - } - - @Override - public String toString() { - return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; - } + final String firstPrefix; + final String firstLine; + final List nextPrefixes = new ArrayList(); + final List nextLines = new ArrayList(); + + public ListSection(String firstPrefix, String firstLine) { + this.firstPrefix = firstPrefix; + this.firstLine = firstLine; + } + + @Override + public String toString() { + return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; + } } int foreignCount = 0; private void doForeignPartOfSpeech(final String lang, String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { - if (++foreignCount % 1000 == 0) { - LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); - } - if (title.equals("6")) { - System.out.println(); - } - - final StringBuilder foreignBuilder = new StringBuilder(); - final List listSections = new ArrayList(); - - appendAndIndexWikiCallback.reset(foreignBuilder, null); - this.state = State.ENGLISH_DEF_OF_FOREIGN; // TODO: this is wrong, need new category.... - titleAppended = false; - wordForms.clear(); - - try { - - EnForeignParser.ListSection lastListSection = null; - - int currentHeadingDepth = posDepth; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - currentHeadingDepth = wikiTokenizer.headingDepth(); - - if (currentHeadingDepth <= posDepth) { - wikiTokenizer.returnToLineStart(); - return; - } - } // heading - - if (currentHeadingDepth > posDepth) { - // TODO: deal with other neat info sections inside POS - continue; - } - - if (wikiTokenizer.isFunction()) { - final String name = wikiTokenizer.functionName(); - final List args = wikiTokenizer.functionPositionArgs(); - final Map namedArgs = wikiTokenizer.functionNamedArgs(); - // First line is generally a repeat of the title with some extra information. - // We need to build up the left side (foreign text, tokens) separately from the - // right side (English). The left-side may get paired with multiple right sides. - // The left side should get filed under every form of the word in question (singular, plural). - - // For verbs, the conjugation comes later on in a deeper section. - // Ideally, we'd want to file every English entry with the verb - // under every verb form coming from the conjugation. - // Ie. under "fa": see: "make :: fare" and "do :: fare" - // But then where should we put the conjugation table? - // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) - // for the conjugation table from "fa". - // Would like to be able to link to a lang#token. - - - String head = namedArgs.remove("head"); - final String tr = namedArgs.remove("tr"); - if (head == null && tr != null && !titleAppended) { - head = title; - } - if (head != null) { - final String form = appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI); - wordForms.add(form); - appendAndIndexWikiCallback.builder.append(" "); - titleAppended = true; - } - if (tr != null) { - appendAndIndexWikiCallback.builder.append(" ("); - final String form = appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLITERATION); - wordForms.add(form); - appendAndIndexWikiCallback.builder.append(") "); - } - - appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs); - - } else if (wikiTokenizer.isListItem()) { - final String prefix = wikiTokenizer.listItemPrefix(); - if (lastListSection != null && - prefix.startsWith(lastListSection.firstPrefix) && - prefix.length() > lastListSection.firstPrefix.length()) { - lastListSection.nextPrefixes.add(prefix); - lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); - } else { - lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); - listSections.add(lastListSection); - } - } else if (lastListSection != null) { - // Don't append anything after the lists, because there's crap. - } else if (wikiTokenizer.isWikiLink()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.wikiLinkText()); - - } else if (wikiTokenizer.isPlainText()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.token()); - } else if (wikiTokenizer.isHtml()) { - if (!wikiTokenizer.token().startsWith("")) { - foreignBuilder.append(wikiTokenizer.token()); - } - } else if (wikiTokenizer.isMarkup() || - wikiTokenizer.isNewline() || - wikiTokenizer.isComment()) { - // Do nothing. - } else { - LOG.warning("Unexpected token: " + wikiTokenizer.token()); - assert !wikiTokenizer.errors().isEmpty(); + if (++foreignCount % 1000 == 0) { + LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); } - } - - } finally { - // Here's where we exit. - // Should we make an entry even if there are no foreign list items? - String foreign = foreignBuilder.toString().trim(); - if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) { - foreign = String.format("%s %s", title, foreign); + if (title.equals("6")) { + System.out.println(); } - if (!langPattern.matcher(lang).matches()) { - foreign = String.format("(%s) %s", lang, foreign); - } - for (final EnForeignParser.ListSection listSection : listSections) { - doForeignListSection(foreign, title, wordForms, listSection); + + final StringBuilder foreignBuilder = new StringBuilder(); + final List listSections = new ArrayList(); + + appendAndIndexWikiCallback.reset(foreignBuilder, null); + this.state = State.ENGLISH_DEF_OF_FOREIGN; // TODO: this is wrong, need new category.... + titleAppended = false; + wordForms.clear(); + + try { + + EnForeignParser.ListSection lastListSection = null; + + int currentHeadingDepth = posDepth; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + currentHeadingDepth = wikiTokenizer.headingDepth(); + + if (currentHeadingDepth <= posDepth) { + wikiTokenizer.returnToLineStart(); + return; + } + } // heading + + if (currentHeadingDepth > posDepth) { + // TODO: deal with other neat info sections inside POS + continue; + } + + if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + final List args = wikiTokenizer.functionPositionArgs(); + final Map namedArgs = wikiTokenizer.functionNamedArgs(); + // First line is generally a repeat of the title with some extra information. + // We need to build up the left side (foreign text, tokens) separately from the + // right side (English). The left-side may get paired with multiple right sides. + // The left side should get filed under every form of the word in question (singular, plural). + + // For verbs, the conjugation comes later on in a deeper section. + // Ideally, we'd want to file every English entry with the verb + // under every verb form coming from the conjugation. + // Ie. under "fa": see: "make :: fare" and "do :: fare" + // But then where should we put the conjugation table? + // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) + // for the conjugation table from "fa". + // Would like to be able to link to a lang#token. + + + String head = namedArgs.remove("head"); + final String tr = namedArgs.remove("tr"); + if (head == null && tr != null && !titleAppended) { + head = title; + } + if (head != null) { + final String form = appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI); + wordForms.add(form); + appendAndIndexWikiCallback.builder.append(" "); + titleAppended = true; + } + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + final String form = appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLITERATION); + wordForms.add(form); + appendAndIndexWikiCallback.builder.append(") "); + } + + appendAndIndexWikiCallback.onFunction(wikiTokenizer, name, args, namedArgs); + + } else if (wikiTokenizer.isListItem()) { + final String prefix = wikiTokenizer.listItemPrefix(); + if (lastListSection != null && + prefix.startsWith(lastListSection.firstPrefix) && + prefix.length() > lastListSection.firstPrefix.length()) { + lastListSection.nextPrefixes.add(prefix); + lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); + } else { + lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); + listSections.add(lastListSection); + } + } else if (lastListSection != null) { + // Don't append anything after the lists, because there's crap. + } else if (wikiTokenizer.isWikiLink()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.wikiLinkText()); + + } else if (wikiTokenizer.isPlainText()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + } else if (wikiTokenizer.isHtml()) { + if (!wikiTokenizer.token().startsWith("")) { + foreignBuilder.append(wikiTokenizer.token()); + } + } else if (wikiTokenizer.isMarkup() || + wikiTokenizer.isNewline() || + wikiTokenizer.isComment()) { + // Do nothing. + } else { + LOG.warning("Unexpected token: " + wikiTokenizer.token()); + assert !wikiTokenizer.errors().isEmpty(); + } + } + + } finally { + // Here's where we exit. + // Should we make an entry even if there are no foreign list items? + String foreign = foreignBuilder.toString().trim(); + if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) { + foreign = String.format("%s %s", title, foreign); + } + if (!langPattern.matcher(lang).matches()) { + foreign = String.format("(%s) %s", lang, foreign); + } + for (final EnForeignParser.ListSection listSection : listSections) { + doForeignListSection(foreign, title, wordForms, listSection); + } } - } } - + private void doForeignListSection(final String foreignText, String title, final Collection forms, final EnForeignParser.ListSection listSection) { - state = State.ENGLISH_DEF_OF_FOREIGN; - final String prefix = listSection.firstPrefix; - if (prefix.length() > 1) { - // Could just get looser and say that any prefix longer than first is a sublist. - LOG.warning("Prefix '" + prefix + "' too long: " + listSection); - incrementCount("WARNING: Prefix too long"); - return; - } - - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - indexedEntry.isValid = true; - - entryIsFormOfSomething = false; - final StringBuilder englishBuilder = new StringBuilder(); - final String mainLine = listSection.firstLine; - appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry); - appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); - - final String english = trim(englishBuilder.toString()); - if (english.length() > 0) { - final Pair pair = new Pair(english, trim(foreignText), this.swap); - pairEntry.pairs.add(pair); - foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI); - for (final String form : forms) { - foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); - } - } - - // Do examples. - String lastForeign = null; - for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { - final String nextPrefix = listSection.nextPrefixes.get(i); - String nextLine = listSection.nextLines.get(i); - - // TODO: This splitting is not sensitive to wiki code. - int dash = nextLine.indexOf("—"); - int mdashLen = 7; - if (dash == -1) { - dash = nextLine.indexOf("—"); - mdashLen = 1; - } - if (dash == -1) { - dash = nextLine.indexOf(" - "); - mdashLen = 3; + state = State.ENGLISH_DEF_OF_FOREIGN; + final String prefix = listSection.firstPrefix; + if (prefix.length() > 1) { + // Could just get looser and say that any prefix longer than first is a sublist. + LOG.warning("Prefix '" + prefix + "' too long: " + listSection); + incrementCount("WARNING: Prefix too long"); + return; } - - if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { - final String foreignEx = nextLine.substring(0, dash); - final String englishEx = nextLine.substring(dash + mdashLen); - final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap); - if (pair.lang1 != "--" && pair.lang1 != "--") { - pairEntry.pairs.add(pair); - } - lastForeign = null; - // TODO: make #* and #*: work - } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")/* || nextPrefix.equals("#*")*/){ - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - lastForeign = nextLine; - if (pair.lang1 != "--" && pair.lang1 != "--") { + + final PairEntry pairEntry = new PairEntry(entrySource); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + indexedEntry.isValid = true; + + entryIsFormOfSomething = false; + final StringBuilder englishBuilder = new StringBuilder(); + final String mainLine = listSection.firstLine; + appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry); + appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); + + final String english = trim(englishBuilder.toString()); + if (english.length() > 0) { + final Pair pair = new Pair(english, trim(foreignText), this.swap); pairEntry.pairs.add(pair); - } - } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")/* || nextPrefix.equals("#*:")*/) { - if (lastForeign != null && pairEntry.pairs.size() > 0) { - if (i + 1 < listSection.nextPrefixes.size()) { - // Chinese has sometimes multiple foreign lines - final String nextNextPrefix = listSection.nextPrefixes.get(i + 1); - if (nextNextPrefix.equals("#::") || nextNextPrefix.equals("#**")) { - ++i; - nextLine += "\n" + listSection.nextLines.get(i); - } + foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI); + for (final String form : forms) { + foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); } - pairEntry.pairs.remove(pairEntry.pairs.size() - 1); - final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap); - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); + } + + // Do examples. + String lastForeign = null; + for (int i = 0; i < listSection.nextPrefixes.size(); ++i) { + final String nextPrefix = listSection.nextPrefixes.get(i); + String nextLine = listSection.nextLines.get(i); + + // TODO: This splitting is not sensitive to wiki code. + int dash = nextLine.indexOf("—"); + int mdashLen = 7; + if (dash == -1) { + dash = nextLine.indexOf("—"); + mdashLen = 1; } - lastForeign = null; - } else { - LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); + if (dash == -1) { + dash = nextLine.indexOf(" - "); + mdashLen = 3; } - } - } else if (nextPrefix.equals("#*")) { - // Can't really index these. - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - lastForeign = nextLine; - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); - } - } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { - final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - if (pair.lang1 != "--" || pair.lang2 != "--") { - pairEntry.pairs.add(pair); - } + + if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { + final String foreignEx = nextLine.substring(0, dash); + final String englishEx = nextLine.substring(dash + mdashLen); + final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + // TODO: make #* and #*: work + } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")/* || nextPrefix.equals("#*")*/) { + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" && pair.lang1 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")/* || nextPrefix.equals("#*:")*/) { + if (lastForeign != null && pairEntry.pairs.size() > 0) { + if (i + 1 < listSection.nextPrefixes.size()) { + // Chinese has sometimes multiple foreign lines + final String nextNextPrefix = listSection.nextPrefixes.get(i + 1); + if (nextNextPrefix.equals("#::") || nextNextPrefix.equals("#**")) { + ++i; + nextLine += "\n" + listSection.nextLines.get(i); + } + } + pairEntry.pairs.remove(pairEntry.pairs.size() - 1); + final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } + lastForeign = null; + } else { + LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } + } + } else if (nextPrefix.equals("#*")) { + // Can't really index these. + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + lastForeign = nextLine; + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } + } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } // } else { // assert false; + } } - } } - + private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) { - // TODO: + // TODO: // if (wikiTokenizer.token().equals("'''")) { // insideTripleQuotes = !insideTripleQuotes; // } - final StringBuilder builder = new StringBuilder(); - appendAndIndexWikiCallback.reset(builder, indexedEntry); - appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE; - appendAndIndexWikiCallback.entryTypeNameSticks = true; - try { - // TODO: this is a hack needed because we don't safely split on the dash. - appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); - } catch (AssertionError e) { - return "--"; - } - final String result = trim(builder.toString()); - return result.length() > 0 ? result : "--"; + final StringBuilder builder = new StringBuilder(); + appendAndIndexWikiCallback.reset(builder, indexedEntry); + appendAndIndexWikiCallback.entryTypeName = EntryTypeName.WIKTIONARY_EXAMPLE; + appendAndIndexWikiCallback.entryTypeNameSticks = true; + try { + // TODO: this is a hack needed because we don't safely split on the dash. + appendAndIndexWikiCallback.dispatch(example, indexBuilder, EntryTypeName.WIKTIONARY_EXAMPLE); + } catch (AssertionError e) { + return "--"; + } + final String result = trim(builder.toString()); + return result.length() > 0 ? result : "--"; } - } // ForeignParser +} // ForeignParser diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java index 5f7f850..5e37a0a 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java @@ -33,1138 +33,1143 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; class EnFunctionCallbacks { - - static final Map> DEFAULT = new LinkedHashMap>(); - - static void addGenericCallbacks(Map> callbacks) { - FunctionCallback callback = new Gender(); - callbacks.put("m", callback); - callbacks.put("f", callback); - callbacks.put("n", callback); - callbacks.put("p", callback); - callbacks.put("g", callback); - - callbacks.put("etyl", new etyl()); - callbacks.put("term", new term()); - - callback = new EncodingCallback(); - Set encodings = new LinkedHashSet(Arrays.asList( - "IPA", "IPAchar", // Not really encodings, but it works. - "zh-ts", "zh-tsp", - "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai", - "fa-Arab", "Khmr", "Cyrl", "ug-Arab", "ko-inline", - "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs", - "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); - for (final String encoding : encodings) { - callbacks.put(encoding, callback); - } - - callback = new Ignore(); - callbacks.put("trreq", callback); - callbacks.put("t-image", callback); - callbacks.put("defn", callback); - callbacks.put("rfdef", callback); - callbacks.put("rfdate", callback); - callbacks.put("rfex", callback); - callbacks.put("rfquote", callback); - callbacks.put("attention", callback); - callbacks.put("zh-attention", callback); - callbacks.put("top2", callback); - callbacks.put("mid2", callback); - callbacks.put("top3", callback); - callbacks.put("mid3", callback); - callbacks.put("bottom", callback); - callbacks.put("rel-mid", callback); - callbacks.put("rel-mid3", callback); - callbacks.put("rel-mid4", callback); - callbacks.put("rel-bottom", callback); - callbacks.put("der-top", callback); - callbacks.put("der-mid", callback); - callbacks.put("der-mid3", callback); - callbacks.put("der-bottom", callback); - - callback = new AppendName(); - callbacks.put("...", callback); - - callbacks.put("qualifier", new QualifierCallback()); - callbacks.put("italbrac", new italbrac()); - callbacks.put("gloss", new gloss()); - callbacks.put("not used", new not_used()); - callbacks.put("wikipedia", new wikipedia()); - - final it_conj it_conj_cb = new it_conj(); - callbacks.put("it-conj", it_conj_cb); - callbacks.put("it-conj-are", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-arsi", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-care", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-carsi", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-ciare", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-ciarsi", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-iare", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-iarsi", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-iare-b", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-iarsi-b", new it_conj_are(it_conj_cb)); - callbacks.put("it-conj-ire", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-irsi", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-ire-b", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-irsi-b", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-cire", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-cirsi", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-ire", new it_conj_ire(it_conj_cb)); - callbacks.put("it-conj-ere", new it_conj_ere(it_conj_cb)); - callbacks.put("it-conj-ersi", new it_conj_ere(it_conj_cb)); - callbacks.put("it-conj-urre", new it_conj_urre(it_conj_cb)); - callbacks.put("it-conj-ursi", new it_conj_urre(it_conj_cb)); - callbacks.put("it-conj-fare", new it_conj_fare(it_conj_cb)); - - - //"{{it-conj-fare|putre|avere}}\n" + - - - } - - static { - addGenericCallbacks(DEFAULT); - - FunctionCallback callback = new TranslationCallback(); - DEFAULT.put("t", callback); - DEFAULT.put("t+", callback); - DEFAULT.put("t-", callback); - DEFAULT.put("tø", callback); - DEFAULT.put("apdx-t", callback); - - callback = new l_term(); - DEFAULT.put("l", callback); - DEFAULT.put("term", callback); - - //callback = new AppendArg0(); - - callback = new FormOf(); - DEFAULT.put("form of", callback); - DEFAULT.put("conjugation of", callback); - DEFAULT.put("participle of", callback); - DEFAULT.put("present participle of", callback); - DEFAULT.put("past participle of", callback); - DEFAULT.put("feminine past participle of", callback); - DEFAULT.put("gerund of", callback); - DEFAULT.put("feminine of", callback); - DEFAULT.put("plural of", callback); - DEFAULT.put("feminine plural of", callback); - DEFAULT.put("inflected form of", callback); - DEFAULT.put("alternative form of", callback); - DEFAULT.put("dated form of", callback); - DEFAULT.put("apocopic form of", callback); - - callback = new InflOrHead(); - DEFAULT.put("infl", callback); - DEFAULT.put("head", callback); - } - - static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); - - // ------------------------------------------------------------------ - - static final class TranslationCallback implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - final String transliteration = namedArgs.remove("tr"); - final String alt = namedArgs.remove("alt"); - namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); - if (args.size() < 2) { - if (!name.equals("ttbc")) { - EnParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title + ", " + wikiTokenizer.token()); - } - return false; - } - final String langCode = ListUtil.get(args, 0); - if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) { - appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger()); - } - appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet(); - final String word = ListUtil.get(args, 1); - appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - // Genders... - if (args.size() > 2) { - appendAndIndexWikiCallback.builder.append(" {"); - for (int i = 2; i < args.size(); ++i) { - if (i > 2) { - appendAndIndexWikiCallback.builder.append("|"); - } - appendAndIndexWikiCallback.builder.append(args.get(i)); + + static final Map> DEFAULT = new LinkedHashMap>(); + + static void addGenericCallbacks(Map> callbacks) { + FunctionCallback callback = new Gender(); + callbacks.put("m", callback); + callbacks.put("f", callback); + callbacks.put("n", callback); + callbacks.put("p", callback); + callbacks.put("g", callback); + + callbacks.put("etyl", new etyl()); + callbacks.put("term", new term()); + + callback = new EncodingCallback(); + Set encodings = new LinkedHashSet(Arrays.asList( + "IPA", "IPAchar", // Not really encodings, but it works. + "zh-ts", "zh-tsp", + "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai", + "fa-Arab", "Khmr", "Cyrl", "ug-Arab", "ko-inline", + "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs", + "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); + for (final String encoding : encodings) { + callbacks.put(encoding, callback); } - appendAndIndexWikiCallback.builder.append("}"); - } - - if (transliteration != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); - appendAndIndexWikiCallback.builder.append(")"); - } - - if (alt != null) { - // If alt wasn't null, we appended alt instead of the actual word - // we're filing under.. - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI); - appendAndIndexWikiCallback.builder.append(")"); - } - - // Catch-all for anything else... - if (!namedArgs.isEmpty()) { - appendAndIndexWikiCallback.builder.append(" {"); - EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback); - appendAndIndexWikiCallback.builder.append("}"); - } - - return true; + + callback = new Ignore(); + callbacks.put("trreq", callback); + callbacks.put("t-image", callback); + callbacks.put("defn", callback); + callbacks.put("rfdef", callback); + callbacks.put("rfdate", callback); + callbacks.put("rfex", callback); + callbacks.put("rfquote", callback); + callbacks.put("attention", callback); + callbacks.put("zh-attention", callback); + callbacks.put("top2", callback); + callbacks.put("mid2", callback); + callbacks.put("top3", callback); + callbacks.put("mid3", callback); + callbacks.put("bottom", callback); + callbacks.put("rel-mid", callback); + callbacks.put("rel-mid3", callback); + callbacks.put("rel-mid4", callback); + callbacks.put("rel-bottom", callback); + callbacks.put("der-top", callback); + callbacks.put("der-mid", callback); + callbacks.put("der-mid3", callback); + callbacks.put("der-bottom", callback); + + callback = new AppendName(); + callbacks.put("...", callback); + + callbacks.put("qualifier", new QualifierCallback()); + callbacks.put("italbrac", new italbrac()); + callbacks.put("gloss", new gloss()); + callbacks.put("not used", new not_used()); + callbacks.put("wikipedia", new wikipedia()); + + final it_conj it_conj_cb = new it_conj(); + callbacks.put("it-conj", it_conj_cb); + callbacks.put("it-conj-are", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-arsi", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-care", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-carsi", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-ciare", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-ciarsi", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-iare", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-iarsi", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-iare-b", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-iarsi-b", new it_conj_are(it_conj_cb)); + callbacks.put("it-conj-ire", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-irsi", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-ire-b", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-irsi-b", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-cire", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-cirsi", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-ire", new it_conj_ire(it_conj_cb)); + callbacks.put("it-conj-ere", new it_conj_ere(it_conj_cb)); + callbacks.put("it-conj-ersi", new it_conj_ere(it_conj_cb)); + callbacks.put("it-conj-urre", new it_conj_urre(it_conj_cb)); + callbacks.put("it-conj-ursi", new it_conj_urre(it_conj_cb)); + callbacks.put("it-conj-fare", new it_conj_fare(it_conj_cb)); + + + //"{{it-conj-fare|putre|avere}}\n" + + + + } + + static { + addGenericCallbacks(DEFAULT); + + FunctionCallback callback = new TranslationCallback(); + DEFAULT.put("t", callback); + DEFAULT.put("t+", callback); + DEFAULT.put("t-", callback); + DEFAULT.put("tø", callback); + DEFAULT.put("apdx-t", callback); + + callback = new l_term(); + DEFAULT.put("l", callback); + DEFAULT.put("term", callback); + + //callback = new AppendArg0(); + + callback = new FormOf(); + DEFAULT.put("form of", callback); + DEFAULT.put("conjugation of", callback); + DEFAULT.put("participle of", callback); + DEFAULT.put("present participle of", callback); + DEFAULT.put("past participle of", callback); + DEFAULT.put("feminine past participle of", callback); + DEFAULT.put("gerund of", callback); + DEFAULT.put("feminine of", callback); + DEFAULT.put("plural of", callback); + DEFAULT.put("feminine plural of", callback); + DEFAULT.put("inflected form of", callback); + DEFAULT.put("alternative form of", callback); + DEFAULT.put("dated form of", callback); + DEFAULT.put("apocopic form of", callback); + + callback = new InflOrHead(); + DEFAULT.put("infl", callback); + DEFAULT.put("head", callback); } - } - - // ------------------------------------------------------------------ - - static final class QualifierCallback implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - namedArgs.remove("lang"); - if (!namedArgs.isEmpty()) { - EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token()); - return false; - } - appendAndIndexWikiCallback.builder.append("("); - for (int i = 0; i < args.size(); ++i) { - appendAndIndexWikiCallback.dispatch(args.get(i), null); - if (i > 0) { - appendAndIndexWikiCallback.builder.append(", "); - } - } - appendAndIndexWikiCallback.builder.append(")"); - return true; + + static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + + // ------------------------------------------------------------------ + + static final class TranslationCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + final String transliteration = namedArgs.remove("tr"); + final String alt = namedArgs.remove("alt"); + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + if (args.size() < 2) { + if (!name.equals("ttbc")) { + EnParser.LOG.warning("{{t...}} with wrong args: title=" + parser.title + ", " + wikiTokenizer.token()); + } + return false; + } + final String langCode = ListUtil.get(args, 0); + if (!appendAndIndexWikiCallback.langCodeToTCount.containsKey(langCode)) { + appendAndIndexWikiCallback.langCodeToTCount.put(langCode, new AtomicInteger()); + } + appendAndIndexWikiCallback.langCodeToTCount.get(langCode).incrementAndGet(); + final String word = ListUtil.get(args, 1); + appendAndIndexWikiCallback.dispatch(alt != null ? alt : word, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + // Genders... + if (args.size() > 2) { + appendAndIndexWikiCallback.builder.append(" {"); + for (int i = 2; i < args.size(); ++i) { + if (i > 2) { + appendAndIndexWikiCallback.builder.append("|"); + } + appendAndIndexWikiCallback.builder.append(args.get(i)); + } + appendAndIndexWikiCallback.builder.append("}"); + } + + if (transliteration != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + appendAndIndexWikiCallback.builder.append(")"); + } + + if (alt != null) { + // If alt wasn't null, we appended alt instead of the actual word + // we're filing under.. + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(word, EntryTypeName.WIKTIONARY_TITLE_MULTI); + appendAndIndexWikiCallback.builder.append(")"); + } + + // Catch-all for anything else... + if (!namedArgs.isEmpty()) { + appendAndIndexWikiCallback.builder.append(" {"); + EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append("}"); + } + + return true; + } } - } - - // ------------------------------------------------------------------ - - static final class EncodingCallback implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - namedArgs.remove("lang"); - if (!namedArgs.isEmpty()) { - EnParser.LOG.warning("weird encoding: " + wikiTokenizer.token()); - return false; - } - if (args.size() == 0) { - // Things like "{{Jpan}}" exist. - return true; - } - - if (name.equals("IPA")) { - appendAndIndexWikiCallback.dispatch("IPA: ", null); - } - - for (int i = 0; i < args.size(); ++i) { - if (i > 0) { - appendAndIndexWikiCallback.builder.append(", "); + + // ------------------------------------------------------------------ + + static final class QualifierCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + namedArgs.remove("lang"); + if (!namedArgs.isEmpty()) { + EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token()); + return false; + } + appendAndIndexWikiCallback.builder.append("("); + for (int i = 0; i < args.size(); ++i) { + appendAndIndexWikiCallback.dispatch(args.get(i), null); + if (i > 0) { + appendAndIndexWikiCallback.builder.append(", "); + } + } + appendAndIndexWikiCallback.builder.append(")"); + return true; } - final String arg = args.get(i); + } + + // ------------------------------------------------------------------ + + static final class EncodingCallback implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + namedArgs.remove("lang"); + if (!namedArgs.isEmpty()) { + EnParser.LOG.warning("weird encoding: " + wikiTokenizer.token()); + return false; + } + if (args.size() == 0) { + // Things like "{{Jpan}}" exist. + return true; + } + + if (name.equals("IPA")) { + appendAndIndexWikiCallback.dispatch("IPA: ", null); + } + + for (int i = 0; i < args.size(); ++i) { + if (i > 0) { + appendAndIndexWikiCallback.builder.append(", "); + } + final String arg = args.get(i); // if (arg.equals(parser.title)) { // parser.titleAppended = true; // } - appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName); - } - - return true; - } - } - - // ------------------------------------------------------------------ - - static final class Gender implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append("{"); - appendAndIndexWikiCallback.builder.append(name); - for (int i = 0; i < args.size(); ++i) { - appendAndIndexWikiCallback.builder.append("|").append(args.get(i)); - } - appendAndIndexWikiCallback.builder.append("}"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class l_term implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - // for {{l}}, lang is arg 0, but not for {{term}} - if (name.equals("term")) { - args.add(0, ""); - } - - final EntryTypeName entryTypeName; - switch (parser.state) { - case TRANSLATION_LINE: entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT; break; - case ENGLISH_DEF_OF_FOREIGN: entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; break; - default: throw new IllegalStateException("Invalid enum value: " + parser.state); - } - - final String langCode = args.get(0); - final IndexBuilder indexBuilder; - if ("".equals(langCode)) { - indexBuilder = parser.foreignIndexBuilder; - } else if ("en".equals(langCode)) { - indexBuilder = parser.enIndexBuilder; - } else { - indexBuilder = parser.foreignIndexBuilder; - } - - String displayText = ListUtil.get(args, 2, ""); - if (displayText.equals("")) { - displayText = ListUtil.get(args, 1, null); - } - - if (displayText != null) { - appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName); - } else { - EnParser.LOG.warning("no display text: " + wikiTokenizer.token()); - } - - final String tr = namedArgs.remove("tr"); - if (tr != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION); - appendAndIndexWikiCallback.builder.append(")"); - } - - final String gloss = ListUtil.get(args, 3, ""); - if (!gloss.equals("")) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); - appendAndIndexWikiCallback.builder.append(")"); - } - - namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); - if (!namedArgs.isEmpty()) { - appendAndIndexWikiCallback.builder.append(" {").append(name); - EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback); - appendAndIndexWikiCallback.builder.append("}"); - } - - return true; - } - } - - // ------------------------------------------------------------------ - - static final class AppendArg0 implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - - final String tr = namedArgs.remove("tr"); - if (tr != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - appendAndIndexWikiCallback.builder.append(")"); - parser.wordForms.add(tr); - } - - return true; - } - } - - // ------------------------------------------------------------------ - - static final class italbrac implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append("("); - appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - appendAndIndexWikiCallback.builder.append(")"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class gloss implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (args.size() != 1 || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append("("); - appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - appendAndIndexWikiCallback.builder.append(")"); - return true; - } - } - - // ------------------------------------------------------------------ - - static final class Ignore implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - return true; - } - } - - // ------------------------------------------------------------------ - - static final class not_used implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - appendAndIndexWikiCallback.builder.append("(not used)"); - return true; + appendAndIndexWikiCallback.dispatch(arg, appendAndIndexWikiCallback.entryTypeName); + } + + return true; + } } - } - - - // ------------------------------------------------------------------ - - static final class AppendName implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!args.isEmpty() || !namedArgs.isEmpty()) { - return false; - } - appendAndIndexWikiCallback.builder.append(name); - return true; + + // ------------------------------------------------------------------ + + static final class Gender implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append("{"); + appendAndIndexWikiCallback.builder.append(name); + for (int i = 0; i < args.size(); ++i) { + appendAndIndexWikiCallback.builder.append("|").append(args.get(i)); + } + appendAndIndexWikiCallback.builder.append("}"); + return true; + } } - } - - // -------------------------------------------------------------------- - // -------------------------------------------------------------------- - - - static final class FormOf implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - parser.entryIsFormOfSomething = true; - String formName = name; - if (name.equals("form of")) { - formName = ListUtil.remove(args, 0, null); - } - if (formName == null) { - EnParser.LOG.warning("Missing form name: " + parser.title); - formName = "form of"; - } - String baseForm = ListUtil.get(args, 1, ""); - if ("".equals(baseForm)) { - baseForm = ListUtil.get(args, 0, null); - ListUtil.remove(args, 1, ""); - } else { - ListUtil.remove(args, 0, null); - } - namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); - - appendAndIndexWikiCallback.builder.append("{"); - NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback); - appendAndIndexWikiCallback.builder.append("}"); - if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) { - parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI); - } else { - // null baseForm happens in Danish. - EnParser.LOG.warning("Null baseform: " + parser.title); - } - return true; + + // ------------------------------------------------------------------ + + static final class l_term implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + // for {{l}}, lang is arg 0, but not for {{term}} + if (name.equals("term")) { + args.add(0, ""); + } + + final EntryTypeName entryTypeName; + switch (parser.state) { + case TRANSLATION_LINE: + entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT; + break; + case ENGLISH_DEF_OF_FOREIGN: + entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; + break; + default: + throw new IllegalStateException("Invalid enum value: " + parser.state); + } + + final String langCode = args.get(0); + final IndexBuilder indexBuilder; + if ("".equals(langCode)) { + indexBuilder = parser.foreignIndexBuilder; + } else if ("en".equals(langCode)) { + indexBuilder = parser.enIndexBuilder; + } else { + indexBuilder = parser.foreignIndexBuilder; + } + + String displayText = ListUtil.get(args, 2, ""); + if (displayText.equals("")) { + displayText = ListUtil.get(args, 1, null); + } + + if (displayText != null) { + appendAndIndexWikiCallback.dispatch(displayText, indexBuilder, entryTypeName); + } else { + EnParser.LOG.warning("no display text: " + wikiTokenizer.token()); + } + + final String tr = namedArgs.remove("tr"); + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(tr, indexBuilder, EntryTypeName.WIKTIONARY_TRANSLITERATION); + appendAndIndexWikiCallback.builder.append(")"); + } + + final String gloss = ListUtil.get(args, 3, ""); + if (!gloss.equals("")) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(gloss, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF); + appendAndIndexWikiCallback.builder.append(")"); + } + + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + if (!namedArgs.isEmpty()) { + appendAndIndexWikiCallback.builder.append(" {").append(name); + EnParser.appendNamedArgs(namedArgs, appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append("}"); + } + + return true; + } } - } - - static final EnFunctionCallbacks.FormOf FORM_OF = new FormOf(); - - - // -------------------------------------------------------------------- - // -------------------------------------------------------------------- - - static final class wikipedia implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - namedArgs.remove("lang"); - if (args.size() > 1 || !namedArgs.isEmpty()) { - // Unindexed! - return false; - } else if (args.size() == 1) { - return false; - } else { - return true; - } + + // ------------------------------------------------------------------ + + static final class AppendArg0 implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + final String tr = namedArgs.remove("tr"); + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + appendAndIndexWikiCallback.builder.append(")"); + parser.wordForms.add(tr); + } + + return true; + } } - } - - static final class InflOrHead implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - // See: http://en.wiktionary.org/wiki/Template:infl - // TODO: Actually these functions should start a new WordPOS: - // See: http://en.wiktionary.org/wiki/quattro - final String langCode = ListUtil.get(args, 0); - String head = namedArgs.remove("head"); - if (head == null) { - head = namedArgs.remove("title"); // Bug - } - if (head == null) { - head = parser.title; - } - - namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); - - final String tr = namedArgs.remove("tr"); - String g = namedArgs.remove("g"); - if (g == null) { - g = namedArgs.remove("gender"); - } - final String g2 = namedArgs.remove("g2"); - final String g3 = namedArgs.remove("g3"); - - // We might have already taken care of this in a generic way... - if (!parser.titleAppended) { - appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI); - parser.titleAppended = true; - } - - if (g != null) { - appendAndIndexWikiCallback.builder.append(" {").append(g); - if (g2 != null) { - appendAndIndexWikiCallback.builder.append("|").append(g2); + + // ------------------------------------------------------------------ + + static final class italbrac implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append("("); + appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + appendAndIndexWikiCallback.builder.append(")"); + return true; } - if (g3 != null) { - appendAndIndexWikiCallback.builder.append("|").append(g3); + } + + // ------------------------------------------------------------------ + + static final class gloss implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (args.size() != 1 || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append("("); + appendAndIndexWikiCallback.dispatch(args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + appendAndIndexWikiCallback.builder.append(")"); + return true; } - appendAndIndexWikiCallback.builder.append("}"); - } - - if (tr != null) { - appendAndIndexWikiCallback.builder.append(" ("); - appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI); - appendAndIndexWikiCallback.builder.append(")"); - parser.wordForms.add(tr); - } - - final String pos = ListUtil.get(args, 1); - if (pos != null) { - appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")"); - } - for (int i = 2; i < args.size(); i += 2) { - final String inflName = ListUtil.get(args, i); - final String inflValue = ListUtil.get(args, i + 1); - appendAndIndexWikiCallback.builder.append(", "); - appendAndIndexWikiCallback.dispatch(inflName, null, null); - if (inflValue != null && inflValue.length() > 0) { - appendAndIndexWikiCallback.builder.append(": "); - appendAndIndexWikiCallback.dispatch(inflValue, null, null); - parser.wordForms.add(inflValue); + } + + // ------------------------------------------------------------------ + + static final class Ignore implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + return true; } - } - for (final String key : namedArgs.keySet()) { - final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); - appendAndIndexWikiCallback.builder.append(" "); - appendAndIndexWikiCallback.dispatch(key, null, null); - appendAndIndexWikiCallback.builder.append("="); - appendAndIndexWikiCallback.dispatch(value, null, null); - parser.wordForms.add(value); - } - return true; } - } - - static final class etyl implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - final String langCode = ListUtil.get(args, 0); - if (langCode == null) { - return false; + + // ------------------------------------------------------------------ + + static final class not_used implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + appendAndIndexWikiCallback.builder.append("(not used)"); + return true; } - String langName = WiktionaryLangs.getEnglishName(langCode); - if (langName != null) { - appendAndIndexWikiCallback.dispatch(langName, null); - } else { - appendAndIndexWikiCallback.dispatch("lang:" + langCode, null); + } + + + // ------------------------------------------------------------------ + + static final class AppendName implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!args.isEmpty() || !namedArgs.isEmpty()) { + return false; + } + appendAndIndexWikiCallback.builder.append(name); + return true; } - return true; - } - } - - static final class term implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - namedArgs.remove("sc"); - - // Main text. - final String lang = namedArgs.remove("lang"); - String head = ListUtil.get(args, 0); - String display = ListUtil.get(args, 1); - if (StringUtil.isNullOrEmpty(head) && StringUtil.isNullOrEmpty(display)) { - head = display = parser.title; + } + + // -------------------------------------------------------------------- + // -------------------------------------------------------------------- + + + static final class FormOf implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + parser.entryIsFormOfSomething = true; + String formName = name; + if (name.equals("form of")) { + formName = ListUtil.remove(args, 0, null); + } + if (formName == null) { + EnParser.LOG.warning("Missing form name: " + parser.title); + formName = "form of"; + } + String baseForm = ListUtil.get(args, 1, ""); + if ("".equals(baseForm)) { + baseForm = ListUtil.get(args, 0, null); + ListUtil.remove(args, 1, ""); + } else { + ListUtil.remove(args, 0, null); + } + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + + appendAndIndexWikiCallback.builder.append("{"); + NAME_AND_ARGS.onWikiFunction(wikiTokenizer, formName, args, namedArgs, parser, appendAndIndexWikiCallback); + appendAndIndexWikiCallback.builder.append("}"); + if (baseForm != null && appendAndIndexWikiCallback.indexedEntry != null) { + parser.foreignIndexBuilder.addEntryWithString(appendAndIndexWikiCallback.indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI); + } else { + // null baseForm happens in Danish. + EnParser.LOG.warning("Null baseform: " + parser.title); + } + return true; } - if (StringUtil.isNullOrEmpty(head)) { - // Dispatches formatted wiki text. - appendAndIndexWikiCallback.dispatch(display, null); - } else { - if (StringUtil.isNullOrEmpty(display)) { - display = head; - } - appendAndIndexWikiCallback.dispatch(String.format("[[%s|%s]]", display, head), null); + } + + static final EnFunctionCallbacks.FormOf FORM_OF = new FormOf(); + + + // -------------------------------------------------------------------- + // -------------------------------------------------------------------- + + static final class wikipedia implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + return false; + } else if (args.size() == 1) { + return false; + } else { + return true; + } } - - // Stuff in ()s. - final String tr = namedArgs.remove("tr"); - final String pos = namedArgs.remove("pos"); - String gloss = ListUtil.get(args, 2); - String literally = namedArgs.remove("lit"); - if (!StringUtil.isNullOrEmpty(gloss)) { - gloss = String.format("\"%s\"", gloss); + } + + static final class InflOrHead implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + // See: http://en.wiktionary.org/wiki/Template:infl + // TODO: Actually these functions should start a new WordPOS: + // See: http://en.wiktionary.org/wiki/quattro + final String langCode = ListUtil.get(args, 0); + String head = namedArgs.remove("head"); + if (head == null) { + head = namedArgs.remove("title"); // Bug + } + if (head == null) { + head = parser.title; + } + + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + + final String tr = namedArgs.remove("tr"); + String g = namedArgs.remove("g"); + if (g == null) { + g = namedArgs.remove("gender"); + } + final String g2 = namedArgs.remove("g2"); + final String g3 = namedArgs.remove("g3"); + + // We might have already taken care of this in a generic way... + if (!parser.titleAppended) { + appendAndIndexWikiCallback.dispatch(head, EntryTypeName.WIKTIONARY_TITLE_MULTI); + parser.titleAppended = true; + } + + if (g != null) { + appendAndIndexWikiCallback.builder.append(" {").append(g); + if (g2 != null) { + appendAndIndexWikiCallback.builder.append("|").append(g2); + } + if (g3 != null) { + appendAndIndexWikiCallback.builder.append("|").append(g3); + } + appendAndIndexWikiCallback.builder.append("}"); + } + + if (tr != null) { + appendAndIndexWikiCallback.builder.append(" ("); + appendAndIndexWikiCallback.dispatch(tr, EntryTypeName.WIKTIONARY_TITLE_MULTI); + appendAndIndexWikiCallback.builder.append(")"); + parser.wordForms.add(tr); + } + + final String pos = ListUtil.get(args, 1); + if (pos != null) { + appendAndIndexWikiCallback.builder.append(" (").append(pos).append(")"); + } + for (int i = 2; i < args.size(); i += 2) { + final String inflName = ListUtil.get(args, i); + final String inflValue = ListUtil.get(args, i + 1); + appendAndIndexWikiCallback.builder.append(", "); + appendAndIndexWikiCallback.dispatch(inflName, null, null); + if (inflValue != null && inflValue.length() > 0) { + appendAndIndexWikiCallback.builder.append(": "); + appendAndIndexWikiCallback.dispatch(inflValue, null, null); + parser.wordForms.add(inflValue); + } + } + for (final String key : namedArgs.keySet()) { + final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); + appendAndIndexWikiCallback.builder.append(" "); + appendAndIndexWikiCallback.dispatch(key, null, null); + appendAndIndexWikiCallback.builder.append("="); + appendAndIndexWikiCallback.dispatch(value, null, null); + parser.wordForms.add(value); + } + return true; } - if (!StringUtil.isNullOrEmpty(literally)) { - literally = String.format("literally %s", literally); + } + + static final class etyl implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final String langCode = ListUtil.get(args, 0); + if (langCode == null) { + return false; + } + String langName = WiktionaryLangs.getEnglishName(langCode); + if (langName != null) { + appendAndIndexWikiCallback.dispatch(langName, null); + } else { + appendAndIndexWikiCallback.dispatch("lang:" + langCode, null); + } + return true; } - final List inParens = new ArrayList(Arrays.asList(tr, pos, gloss, literally)); - cleanList(inParens); - appendCommaSeparatedList(appendAndIndexWikiCallback, inParens); - - if (tr != null) { - parser.addLinkToCurrentEntry(tr, lang, EntryTypeName.WIKTIONARY_MENTIONED); + } + + static final class term implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + namedArgs.remove("sc"); + + // Main text. + final String lang = namedArgs.remove("lang"); + String head = ListUtil.get(args, 0); + String display = ListUtil.get(args, 1); + if (StringUtil.isNullOrEmpty(head) && StringUtil.isNullOrEmpty(display)) { + head = display = parser.title; + } + if (StringUtil.isNullOrEmpty(head)) { + // Dispatches formatted wiki text. + appendAndIndexWikiCallback.dispatch(display, null); + } else { + if (StringUtil.isNullOrEmpty(display)) { + display = head; + } + appendAndIndexWikiCallback.dispatch(String.format("[[%s|%s]]", display, head), null); + } + + // Stuff in ()s. + final String tr = namedArgs.remove("tr"); + final String pos = namedArgs.remove("pos"); + String gloss = ListUtil.get(args, 2); + String literally = namedArgs.remove("lit"); + if (!StringUtil.isNullOrEmpty(gloss)) { + gloss = String.format("\"%s\"", gloss); + } + if (!StringUtil.isNullOrEmpty(literally)) { + literally = String.format("literally %s", literally); + } + final List inParens = new ArrayList(Arrays.asList(tr, pos, gloss, literally)); + cleanList(inParens); + appendCommaSeparatedList(appendAndIndexWikiCallback, inParens); + + if (tr != null) { + parser.addLinkToCurrentEntry(tr, lang, EntryTypeName.WIKTIONARY_MENTIONED); + } + return namedArgs.isEmpty(); } - return namedArgs.isEmpty(); - } - private void appendCommaSeparatedList( + private void appendCommaSeparatedList( final AppendAndIndexWikiCallback appendAndIndexWikiCallback, final List inParens) { - if (!inParens.isEmpty()) { - appendAndIndexWikiCallback.dispatch(" (", null); - for (int i = 0; i < inParens.size(); ++i) { - if (i > 0) { - appendAndIndexWikiCallback.dispatch(", ", null); + if (!inParens.isEmpty()) { + appendAndIndexWikiCallback.dispatch(" (", null); + for (int i = 0; i < inParens.size(); ++i) { + if (i > 0) { + appendAndIndexWikiCallback.dispatch(", ", null); + } + appendAndIndexWikiCallback.dispatch(inParens.get(i), null); } - appendAndIndexWikiCallback.dispatch(inParens.get(i), null); + appendAndIndexWikiCallback.dispatch(")", null); } - appendAndIndexWikiCallback.dispatch(")", null); } - } - } - - private static void cleanList(List asList) { - int pos; - while ((pos = asList.indexOf("")) != -1) { - asList.remove(pos); - } - while ((pos = asList.indexOf(null)) != -1) { - asList.remove(pos); - } - } - - - static { - DEFAULT.put("it-noun", new it_noun()); - } - static final class it_noun implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final EnParser parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - parser.titleAppended = true; - final String gender = ListUtil.get(args, 0); - final String singular = parser.title; - final String plural = ListUtil.get(args, 1, null); - appendAndIndexWikiCallback.builder.append(" "); - appendAndIndexWikiCallback.dispatch(singular, null, null); - appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, "); - if (plural != null) { - appendAndIndexWikiCallback.dispatch(plural, null, null); - appendAndIndexWikiCallback.builder.append(" {pl}"); - parser.wordForms.add(plural); - } - final String f = namedArgs.remove("f"); - if (f != null) { - appendAndIndexWikiCallback.builder.append(", "); - appendAndIndexWikiCallback.dispatch(f, null, null); - appendAndIndexWikiCallback.builder.append(" {f}"); - } - final String m = namedArgs.remove("m"); - if (m != null) { - appendAndIndexWikiCallback.builder.append(", "); - appendAndIndexWikiCallback.dispatch(m, null, null); - appendAndIndexWikiCallback.builder.append(" {m}"); - } - parser.wordForms.add(singular); - if (!namedArgs.isEmpty() || args.size() > 4) { - EnParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); - } - return true; } - } - - static { - DEFAULT.put("it-proper noun", new it_proper_noun()); - } - static final class it_proper_noun implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - return false; + + private static void cleanList(List asList) { + int pos; + while ((pos = asList.indexOf("")) != -1) { + asList.remove(pos); + } + while ((pos = asList.indexOf(null)) != -1) { + asList.remove(pos); + } } - } - - // ----------------------------------------------------------------------- - // Italian stuff - // ----------------------------------------------------------------------- - -static final class it_conj_are implements FunctionCallback { - final it_conj dest; - it_conj_are(it_conj dest) { - this.dest = dest; + + + static { + DEFAULT.put("it-noun", new it_noun()); } - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - final String h = name.equals("it-conj-care") || name.equals("it-conj-carsi") ? "h" : ""; - final String i = name.equals("it-conj-ciare") || name.equals("it-conj-ciarsi") ? "i" : ""; - final String i2 = name.equals("it-conj-iare") || name.equals("it-conj-iarsi") ? "" : "i"; - final boolean si = name.equals("it-conj-arsi") || name.equals("it-conj-iarsi") || name.equals("it-conj-iarsi-b") || name.equals("it-conj-carsi") || name.equals("it-conj-ciarsi"); - final String root = args.get(0); - passThroughOrFillIn(namedArgs, "inf", root + i + (si ? "arsi" : "are"), false); - namedArgs.put("aux", ListUtil.get(args, 1, "")); - passThroughOrFillIn(namedArgs, "ger", root + i + "ando" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "presp", root + i + "ante"+ (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "pastp", root + i + "ato", true); - if (si) { - passThroughOrFillIn(namedArgs, "pastp2", root + i + "atosi", true); + static final class it_noun implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final EnParser parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + parser.titleAppended = true; + final String gender = ListUtil.get(args, 0); + final String singular = parser.title; + final String plural = ListUtil.get(args, 1, null); + appendAndIndexWikiCallback.builder.append(" "); + appendAndIndexWikiCallback.dispatch(singular, null, null); + appendAndIndexWikiCallback.builder.append(" {").append(gender).append("}, "); + if (plural != null) { + appendAndIndexWikiCallback.dispatch(plural, null, null); + appendAndIndexWikiCallback.builder.append(" {pl}"); + parser.wordForms.add(plural); + } + final String f = namedArgs.remove("f"); + if (f != null) { + appendAndIndexWikiCallback.builder.append(", "); + appendAndIndexWikiCallback.dispatch(f, null, null); + appendAndIndexWikiCallback.builder.append(" {f}"); + } + final String m = namedArgs.remove("m"); + if (m != null) { + appendAndIndexWikiCallback.builder.append(", "); + appendAndIndexWikiCallback.dispatch(m, null, null); + appendAndIndexWikiCallback.builder.append(" {m}"); + } + parser.wordForms.add(singular); + if (!namedArgs.isEmpty() || args.size() > 4) { + EnParser.LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); + } + return true; } - final String i2b = (name.equals("it-conj-iare-b") || name.equals("it-conj-iarsi-b")) ? "" : i2; - - it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", h + i2, i + "a", h + i2 + "amo", i + "ate", i + "ano")); - it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList(i + "avo", i + "avi", i + "ava", i + "avamo", i + "avate", i + "avano")); - it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList(i + "ai", i + "asti", i + "ò", i + "ammo", i + "aste", i + "arono")); - it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList(h + "erò", h + "erai", h + "erà", h + "eremo", h + "erete", h + "eranno")); - it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList(h + "erei", h + "eresti", h + "erebbe", h + "eremmo", h + "ereste", h + "erebbero")); - - passThroughOrFillIn(namedArgs, "sub123s", root + h + i2, false); - passThroughOrFillIn(namedArgs, "sub1p", root + h + i2b + "amo", false); - passThroughOrFillIn(namedArgs, "sub2p", root + h + i2b + "ate", false); - passThroughOrFillIn(namedArgs, "sub3p", root + h + i2 + "no", false); - - passThroughOrFillIn(namedArgs, "impsub12s", root + i + "assi", false); - passThroughOrFillIn(namedArgs, "impsub3s", root + i + "asse", false); - passThroughOrFillIn(namedArgs, "impsub1p", root + i + "assimo", false); - passThroughOrFillIn(namedArgs, "impsub2p", root + i + "aste", false); - passThroughOrFillIn(namedArgs, "impsub3p", root + i + "assero", false); - - passThroughOrFillIn(namedArgs, "imp2s", root + i + "a" + (si ? "ti" : ""), true); - passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + h + i2, true); - passThroughOrFillIn(namedArgs, "imp1p", root + h + i2b + "amo" + (si ? "ci" : ""), true); - passThroughOrFillIn(namedArgs, "imp2p", root + i + "ate" + (si ? "vi" : ""), true); - passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + h + i2 + "no", true); - - return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); - } } - static final class it_conj_ire implements FunctionCallback { - final it_conj dest; - it_conj_ire(it_conj dest) { - this.dest = dest; + static { + DEFAULT.put("it-proper noun", new it_proper_noun()); + } + static final class it_proper_noun implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + return false; + } } - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - final String root = args.get(0); - final String i = name.equals("it-conj-cire") || name.equals("it-conj-cirsi") ? "i" : ""; - final boolean si = name.equals("it-conj-irsi") || name.equals("it-conj-irsi-b") || name.equals("it-conj-cirsi"); - - passThroughOrFillIn(namedArgs, "inf", root + (si ? "irsi" : "ire"), false); - namedArgs.put("aux", ListUtil.get(args, 1, "")); - passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "pastp", root + "ito", true); - if (si) { - passThroughOrFillIn(namedArgs, "pastp2", root + "itosi", true); + + // ----------------------------------------------------------------------- + // Italian stuff + // ----------------------------------------------------------------------- + + static final class it_conj_are implements FunctionCallback { + final it_conj dest; + it_conj_are(it_conj dest) { + this.dest = dest; + } + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final String h = name.equals("it-conj-care") || name.equals("it-conj-carsi") ? "h" : ""; + final String i = name.equals("it-conj-ciare") || name.equals("it-conj-ciarsi") ? "i" : ""; + final String i2 = name.equals("it-conj-iare") || name.equals("it-conj-iarsi") ? "" : "i"; + final boolean si = name.equals("it-conj-arsi") || name.equals("it-conj-iarsi") || name.equals("it-conj-iarsi-b") || name.equals("it-conj-carsi") || name.equals("it-conj-ciarsi"); + final String root = args.get(0); + passThroughOrFillIn(namedArgs, "inf", root + i + (si ? "arsi" : "are"), false); + namedArgs.put("aux", ListUtil.get(args, 1, "")); + passThroughOrFillIn(namedArgs, "ger", root + i + "ando" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "presp", root + i + "ante"+ (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "pastp", root + i + "ato", true); + if (si) { + passThroughOrFillIn(namedArgs, "pastp2", root + i + "atosi", true); + } + final String i2b = (name.equals("it-conj-iare-b") || name.equals("it-conj-iarsi-b")) ? "" : i2; + + it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", h + i2, i + "a", h + i2 + "amo", i + "ate", i + "ano")); + it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList(i + "avo", i + "avi", i + "ava", i + "avamo", i + "avate", i + "avano")); + it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList(i + "ai", i + "asti", i + "ò", i + "ammo", i + "aste", i + "arono")); + it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList(h + "erò", h + "erai", h + "erà", h + "eremo", h + "erete", h + "eranno")); + it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList(h + "erei", h + "eresti", h + "erebbe", h + "eremmo", h + "ereste", h + "erebbero")); + + passThroughOrFillIn(namedArgs, "sub123s", root + h + i2, false); + passThroughOrFillIn(namedArgs, "sub1p", root + h + i2b + "amo", false); + passThroughOrFillIn(namedArgs, "sub2p", root + h + i2b + "ate", false); + passThroughOrFillIn(namedArgs, "sub3p", root + h + i2 + "no", false); + + passThroughOrFillIn(namedArgs, "impsub12s", root + i + "assi", false); + passThroughOrFillIn(namedArgs, "impsub3s", root + i + "asse", false); + passThroughOrFillIn(namedArgs, "impsub1p", root + i + "assimo", false); + passThroughOrFillIn(namedArgs, "impsub2p", root + i + "aste", false); + passThroughOrFillIn(namedArgs, "impsub3p", root + i + "assero", false); + + passThroughOrFillIn(namedArgs, "imp2s", root + i + "a" + (si ? "ti" : ""), true); + passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + h + i2, true); + passThroughOrFillIn(namedArgs, "imp1p", root + h + i2b + "amo" + (si ? "ci" : ""), true); + passThroughOrFillIn(namedArgs, "imp2p", root + i + "ate" + (si ? "vi" : ""), true); + passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + h + i2 + "no", true); + + return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); } - if (!name.endsWith("-b")) { - it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", "i", "e", "iamo", "ite", i + "ono")); - } else { - it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("isco", "isci", "isce", "iamo", "ite", "iscono")); + } + + static final class it_conj_ire implements FunctionCallback { + final it_conj dest; + it_conj_ire(it_conj dest) { + this.dest = dest; } - it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ivo", "ivi", "iva", "ivamo", "ivate", "ivano")); - it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ii", "isti", "ì", "immo", "iste", "irono")); - // Regular past historic synonyms: - passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true); - passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true); - it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("irò", "irai", "irà", "iremo", "irete", "iranno")); - it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("irei", "iresti", "irebbe", "iremmo", "ireste", "irebbero")); - - if (!name.endsWith("-b")) { - passThroughOrFillIn(namedArgs, "sub123s", root + i + "a", false); - passThroughOrFillIn(namedArgs, "sub3p", root + i + "ano", false); - } else { - passThroughOrFillIn(namedArgs, "sub123s", root + "isca", false); - passThroughOrFillIn(namedArgs, "sub3p", root + "iscano", false); + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final String root = args.get(0); + final String i = name.equals("it-conj-cire") || name.equals("it-conj-cirsi") ? "i" : ""; + final boolean si = name.equals("it-conj-irsi") || name.equals("it-conj-irsi-b") || name.equals("it-conj-cirsi"); + + passThroughOrFillIn(namedArgs, "inf", root + (si ? "irsi" : "ire"), false); + namedArgs.put("aux", ListUtil.get(args, 1, "")); + passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "pastp", root + "ito", true); + if (si) { + passThroughOrFillIn(namedArgs, "pastp2", root + "itosi", true); + } + if (!name.endsWith("-b")) { + it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList(i + "o", "i", "e", "iamo", "ite", i + "ono")); + } else { + it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("isco", "isci", "isce", "iamo", "ite", "iscono")); + } + it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ivo", "ivi", "iva", "ivamo", "ivate", "ivano")); + it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ii", "isti", "ì", "immo", "iste", "irono")); + // Regular past historic synonyms: + passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true); + passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true); + it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("irò", "irai", "irà", "iremo", "irete", "iranno")); + it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("irei", "iresti", "irebbe", "iremmo", "ireste", "irebbero")); + + if (!name.endsWith("-b")) { + passThroughOrFillIn(namedArgs, "sub123s", root + i + "a", false); + passThroughOrFillIn(namedArgs, "sub3p", root + i + "ano", false); + } else { + passThroughOrFillIn(namedArgs, "sub123s", root + "isca", false); + passThroughOrFillIn(namedArgs, "sub3p", root + "iscano", false); + } + passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false); + passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false); + + passThroughOrFillIn(namedArgs, "impsub12s", root + "issi", false); + passThroughOrFillIn(namedArgs, "impsub3s", root + "isse", false); + passThroughOrFillIn(namedArgs, "impsub1p", root + "issimo", false); + passThroughOrFillIn(namedArgs, "impsub2p", root + "iste", false); + passThroughOrFillIn(namedArgs, "impsub3p", root + "issero", false); + + if (!name.endsWith("-b")) { + passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true); + passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + i + "a", true); + passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + i + "ano", true); + } else { + passThroughOrFillIn(namedArgs, "imp2s", root + "isci" + (si ? "ti" : ""), true); + passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "isca", true); + passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "iscano", true); + } + passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true); + passThroughOrFillIn(namedArgs, "imp2p", root + "ite" + (si ? "vi" : ""), true); + + return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); } - passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false); - passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false); + } - passThroughOrFillIn(namedArgs, "impsub12s", root + "issi", false); - passThroughOrFillIn(namedArgs, "impsub3s", root + "isse", false); - passThroughOrFillIn(namedArgs, "impsub1p", root + "issimo", false); - passThroughOrFillIn(namedArgs, "impsub2p", root + "iste", false); - passThroughOrFillIn(namedArgs, "impsub3p", root + "issero", false); - if (!name.endsWith("-b")) { - passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true); - passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + i + "a", true); - passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + i + "ano", true); - } else { - passThroughOrFillIn(namedArgs, "imp2s", root + "isci" + (si ? "ti" : ""), true); - passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "isca", true); - passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "iscano", true); + static final class it_conj_ere implements FunctionCallback { + final it_conj dest; + it_conj_ere(it_conj dest) { + this.dest = dest; } - passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true); - passThroughOrFillIn(namedArgs, "imp2p", root + "ite" + (si ? "vi" : ""), true); + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final String root = args.get(0); + final boolean si = name.equals("it-conj-ersi"); + + passThroughOrFillIn(namedArgs, "inf", root + (si ? "ersi" : "ere"), false); + namedArgs.put("aux", ListUtil.get(args, 1, "")); + passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "pastp", root + "uto", true); + if (si) { + passThroughOrFillIn(namedArgs, "pastp2", root + "utosi", true); + } + it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("o", "i", "e", "iamo", "ete", "ono")); + it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("evo", "evi", "eva", "evamo", "evate", "evano")); + it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ei", "esti", "ette", "emmo", "este", "ettero")); + // Regular past historic synonyms: + passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true); + passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true); + it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("erò", "erai", "erà", "eremo", "erete", "eranno")); + it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("erei", "eresti", "erebbe", "eremmo", "ereste", "erebbero")); + + passThroughOrFillIn(namedArgs, "sub123s", root + "a", false); + passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false); + passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false); + passThroughOrFillIn(namedArgs, "sub3p", root + "ano", false); + + passThroughOrFillIn(namedArgs, "impsub12s", root + "essi", false); + passThroughOrFillIn(namedArgs, "impsub3s", root + "esse", false); + passThroughOrFillIn(namedArgs, "impsub1p", root + "essimo", false); + passThroughOrFillIn(namedArgs, "impsub2p", root + "este", false); + passThroughOrFillIn(namedArgs, "impsub3p", root + "essero", false); + + passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true); + passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "a", true); + passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true); + passThroughOrFillIn(namedArgs, "imp2p", root + "ete" + (si ? "vi" : ""), true); + passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "ano", true); - return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); - } + return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); + } } - - static final class it_conj_ere implements FunctionCallback { - final it_conj dest; - it_conj_ere(it_conj dest) { - this.dest = dest; - } - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - final String root = args.get(0); - final boolean si = name.equals("it-conj-ersi"); - - passThroughOrFillIn(namedArgs, "inf", root + (si ? "ersi" : "ere"), false); - namedArgs.put("aux", ListUtil.get(args, 1, "")); - passThroughOrFillIn(namedArgs, "ger", root + "endo" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "presp", root + "ente" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "pastp", root + "uto", true); - if (si) { - passThroughOrFillIn(namedArgs, "pastp2", root + "utosi", true); - } - it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("o", "i", "e", "iamo", "ete", "ono")); - it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("evo", "evi", "eva", "evamo", "evate", "evano")); - it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ei", "esti", "ette", "emmo", "este", "ettero")); - // Regular past historic synonyms: - passThroughOrFillIn(namedArgs, "prem3s2", root + "é", true); - passThroughOrFillIn(namedArgs, "prem3p2", root + "erono", true); - it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("erò", "erai", "erà", "eremo", "erete", "eranno")); - it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("erei", "eresti", "erebbe", "eremmo", "ereste", "erebbero")); - - passThroughOrFillIn(namedArgs, "sub123s", root + "a", false); - passThroughOrFillIn(namedArgs, "sub1p", root + "iamo", false); - passThroughOrFillIn(namedArgs, "sub2p", root + "iate", false); - passThroughOrFillIn(namedArgs, "sub3p", root + "ano", false); - - passThroughOrFillIn(namedArgs, "impsub12s", root + "essi", false); - passThroughOrFillIn(namedArgs, "impsub3s", root + "esse", false); - passThroughOrFillIn(namedArgs, "impsub1p", root + "essimo", false); - passThroughOrFillIn(namedArgs, "impsub2p", root + "este", false); - passThroughOrFillIn(namedArgs, "impsub3p", root + "essero", false); - - passThroughOrFillIn(namedArgs, "imp2s", root + "i" + (si ? "ti" : ""), true); - passThroughOrFillIn(namedArgs, "imp3s", (si ? "si " : "") + root + "a", true); - passThroughOrFillIn(namedArgs, "imp1p", root + "iamo" + (si ? "ci" : ""), true); - passThroughOrFillIn(namedArgs, "imp2p", root + "ete" + (si ? "vi" : ""), true); - passThroughOrFillIn(namedArgs, "imp3p", (si ? "si " : "") + root + "ano", true); - - return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); + static final class it_conj_urre implements FunctionCallback { + final it_conj dest; + it_conj_urre(it_conj dest) { + this.dest = dest; } - } - - static final class it_conj_urre implements FunctionCallback { - final it_conj dest; - it_conj_urre(it_conj dest) { - this.dest = dest; - } - @Override + @Override public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - final String root = args.get(0); - final boolean si = name.equals("it-conj-ursi"); - - passThroughOrFillIn(namedArgs, "inf", root + (si ? "ursi" : "urre"), false); - namedArgs.put("aux", ListUtil.get(args, 1, "")); - passThroughOrFillIn(namedArgs, "ger", root + "ucendo" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "presp", root + "ucente" + (si ? "si" : ""), true); - passThroughOrFillIn(namedArgs, "pastp", root + "otto", true); - if (si) { - passThroughOrFillIn(namedArgs, "pastp2", root + "ottosi", true); - } - it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("uco", "uci", "uce", "uciamo", "ucete", "ucono")); - it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ucevo", "ucevi", "uceva", "ucevamo", "ucevate", "ucevano")); - it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ussi", "ucesti", "usse", "ucemmo", "uceste", "ussero")); - it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("urrò", "urrai", "urrà", "urremo", "urrete", "urranno")); - it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("urrei", "urresti", "urrebbe", "urremmo", "urreste", "urrebbero")); - - passThroughOrFillIn(namedArgs, "sub123s", root + "uca", false); - passThroughOrFillIn(namedArgs, "sub1p", root + "uciamo", false); - passThroughOrFillIn(namedArgs, "sub2p", root + "uciate", false); - passThroughOrFillIn(namedArgs, "sub3p", root + "ucano", false); - - passThroughOrFillIn(namedArgs, "impsub12s", root + "ucessi", false); - passThroughOrFillIn(namedArgs, "impsub3s", root + "ucesse", false); - passThroughOrFillIn(namedArgs, "impsub1p", root + "ucessimo", false); - passThroughOrFillIn(namedArgs, "impsub2p", root + "uceste", false); - passThroughOrFillIn(namedArgs, "impsub3p", root + "ucessero", false); - - passThroughOrFillIn(namedArgs, "imp2s", root + "uci" + (si ? "ti" : ""), true); - passThroughOrFillIn(namedArgs, "imp3s", (si ? "si" : "") + root + "uca", true); - passThroughOrFillIn(namedArgs, "imp1p", root + "uciamo" + (si ? "ci" : ""), true); - passThroughOrFillIn(namedArgs, "imp2p", root + "ucete" + (si ? "vi" : ""), true); - passThroughOrFillIn(namedArgs, "imp3p", (si ? "si" : "") + root + "ucano", true); - - return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final String root = args.get(0); + final boolean si = name.equals("it-conj-ursi"); + + passThroughOrFillIn(namedArgs, "inf", root + (si ? "ursi" : "urre"), false); + namedArgs.put("aux", ListUtil.get(args, 1, "")); + passThroughOrFillIn(namedArgs, "ger", root + "ucendo" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "presp", root + "ucente" + (si ? "si" : ""), true); + passThroughOrFillIn(namedArgs, "pastp", root + "otto", true); + if (si) { + passThroughOrFillIn(namedArgs, "pastp2", root + "ottosi", true); + } + it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("uco", "uci", "uce", "uciamo", "ucete", "ucono")); + it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("ucevo", "ucevi", "uceva", "ucevamo", "ucevate", "ucevano")); + it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("ussi", "ucesti", "usse", "ucemmo", "uceste", "ussero")); + it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("urrò", "urrai", "urrà", "urremo", "urrete", "urranno")); + it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("urrei", "urresti", "urrebbe", "urremmo", "urreste", "urrebbero")); + + passThroughOrFillIn(namedArgs, "sub123s", root + "uca", false); + passThroughOrFillIn(namedArgs, "sub1p", root + "uciamo", false); + passThroughOrFillIn(namedArgs, "sub2p", root + "uciate", false); + passThroughOrFillIn(namedArgs, "sub3p", root + "ucano", false); + + passThroughOrFillIn(namedArgs, "impsub12s", root + "ucessi", false); + passThroughOrFillIn(namedArgs, "impsub3s", root + "ucesse", false); + passThroughOrFillIn(namedArgs, "impsub1p", root + "ucessimo", false); + passThroughOrFillIn(namedArgs, "impsub2p", root + "uceste", false); + passThroughOrFillIn(namedArgs, "impsub3p", root + "ucessero", false); + + passThroughOrFillIn(namedArgs, "imp2s", root + "uci" + (si ? "ti" : ""), true); + passThroughOrFillIn(namedArgs, "imp3s", (si ? "si" : "") + root + "uca", true); + passThroughOrFillIn(namedArgs, "imp1p", root + "uciamo" + (si ? "ci" : ""), true); + passThroughOrFillIn(namedArgs, "imp2p", root + "ucete" + (si ? "vi" : ""), true); + passThroughOrFillIn(namedArgs, "imp3p", (si ? "si" : "") + root + "ucano", true); + + return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); + } + } + + static final class it_conj_fare implements FunctionCallback { + final it_conj dest; + it_conj_fare(it_conj dest) { + this.dest = dest; } - } - - static final class it_conj_fare implements FunctionCallback { - final it_conj dest; - it_conj_fare(it_conj dest) { - this.dest = dest; - } - @Override + @Override public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - final String root = args.get(0); - passThroughOrFillIn(namedArgs, "inf", root + "fare", false); - namedArgs.put("aux", ListUtil.get(args, 1, "")); - passThroughOrFillIn(namedArgs, "ger", root + "facendo", true); - passThroughOrFillIn(namedArgs, "presp", root + "facente", true); - passThroughOrFillIn(namedArgs, "pastp", root + "fatto", true); - it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("faccio", "fai", "fà", "facciamo", "fate", "fanno")); - passThroughOrFillIn(namedArgs, "pres1s2", root + "fò", true); - it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("facevo", "facevi", "faceva", "facevamo", "facevate", "facevano")); - it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("feci", "facesti", "fece", "facemmo", "faceste", "fecero")); - it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("farò", "farai", "farà", "faremo", "farete", "faranno")); - it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("farei", "faresti", "farebbe", "faremmo", "fareste", "farebbero")); - - passThroughOrFillIn(namedArgs, "sub123s", root + "faccia", false); - passThroughOrFillIn(namedArgs, "sub1p", root + "facciamo", false); - passThroughOrFillIn(namedArgs, "sub2p", root + "facciate", false); - passThroughOrFillIn(namedArgs, "sub3p", root + "facciano", false); - - passThroughOrFillIn(namedArgs, "impsub12s", root + "facessi", false); - passThroughOrFillIn(namedArgs, "impsub3s", root + "facesse", false); - passThroughOrFillIn(namedArgs, "impsub1p", root + "facessimo", false); - passThroughOrFillIn(namedArgs, "impsub2p", root + "faceste", false); - passThroughOrFillIn(namedArgs, "impsub3p", root + "facessero", false); - - passThroughOrFillIn(namedArgs, "imp2s", root + "fa", true); - passThroughOrFillIn(namedArgs, "imp3s", root + "faccia", true); - passThroughOrFillIn(namedArgs, "imp1p", root + "facciamo", true); - passThroughOrFillIn(namedArgs, "imp2p", root + "fate", true); - passThroughOrFillIn(namedArgs, "imp3p", root + "facciano", true); - - return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final String root = args.get(0); + passThroughOrFillIn(namedArgs, "inf", root + "fare", false); + namedArgs.put("aux", ListUtil.get(args, 1, "")); + passThroughOrFillIn(namedArgs, "ger", root + "facendo", true); + passThroughOrFillIn(namedArgs, "presp", root + "facente", true); + passThroughOrFillIn(namedArgs, "pastp", root + "fatto", true); + it_conj_passMood(namedArgs, "pres", false, root, Arrays.asList("faccio", "fai", "fà", "facciamo", "fate", "fanno")); + passThroughOrFillIn(namedArgs, "pres1s2", root + "fò", true); + it_conj_passMood(namedArgs, "imperf", false, root, Arrays.asList("facevo", "facevi", "faceva", "facevamo", "facevate", "facevano")); + it_conj_passMood(namedArgs, "prem", false, root, Arrays.asList("feci", "facesti", "fece", "facemmo", "faceste", "fecero")); + it_conj_passMood(namedArgs, "fut", true, root, Arrays.asList("farò", "farai", "farà", "faremo", "farete", "faranno")); + it_conj_passMood(namedArgs, "cond", true, root, Arrays.asList("farei", "faresti", "farebbe", "faremmo", "fareste", "farebbero")); + + passThroughOrFillIn(namedArgs, "sub123s", root + "faccia", false); + passThroughOrFillIn(namedArgs, "sub1p", root + "facciamo", false); + passThroughOrFillIn(namedArgs, "sub2p", root + "facciate", false); + passThroughOrFillIn(namedArgs, "sub3p", root + "facciano", false); + + passThroughOrFillIn(namedArgs, "impsub12s", root + "facessi", false); + passThroughOrFillIn(namedArgs, "impsub3s", root + "facesse", false); + passThroughOrFillIn(namedArgs, "impsub1p", root + "facessimo", false); + passThroughOrFillIn(namedArgs, "impsub2p", root + "faceste", false); + passThroughOrFillIn(namedArgs, "impsub3p", root + "facessero", false); + + passThroughOrFillIn(namedArgs, "imp2s", root + "fa", true); + passThroughOrFillIn(namedArgs, "imp3s", root + "faccia", true); + passThroughOrFillIn(namedArgs, "imp1p", root + "facciamo", true); + passThroughOrFillIn(namedArgs, "imp2p", root + "fate", true); + passThroughOrFillIn(namedArgs, "imp3p", root + "facciano", true); + + return dest.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, appendAndIndexWikiCallback); } - } - - static final Map it_indicativePronouns = new LinkedHashMap(); - static { - it_indicativePronouns.put("1s", "io"); - it_indicativePronouns.put("2s", "tu"); - it_indicativePronouns.put("3s", "lui/lei"); - it_indicativePronouns.put("1p", "noi"); - it_indicativePronouns.put("2p", "voi"); - it_indicativePronouns.put("3p", "essi/esse"); - } - - static final Map it_subjunctivePronouns = new LinkedHashMap(); - static { - it_subjunctivePronouns.put("1s", "che io"); - it_subjunctivePronouns.put("2s", "che tu"); - it_subjunctivePronouns.put("3s", "che lui/lei"); - it_subjunctivePronouns.put("1p", "che noi"); - it_subjunctivePronouns.put("2p", "che voi"); - it_subjunctivePronouns.put("3p", "che essi/esse"); - } - - static final Map it_imperativePronouns = new LinkedHashMap(); - static { - it_imperativePronouns.put("1s", "-"); - it_imperativePronouns.put("2s", "tu"); - it_imperativePronouns.put("3s", "lui/lei"); - it_imperativePronouns.put("1p", "noi"); - it_imperativePronouns.put("2p", "voi"); - it_imperativePronouns.put("3p", "essi/esse"); - } - - - static final class it_conj implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - final StringBuilder builder = appendAndIndexWikiCallback.builder; - - final String inf = namedArgs.get("inf"); - - // TODO: center everything horizontally. - builder.append(""); - - builder.append(""); - builder.append(""); - builder.append(""); - builder.append("\n"); - - builder.append(""); - builder.append(""); - builder.append(""); - builder.append(""); - builder.append(""); - builder.append("\n"); - - builder.append(""); - builder.append(""); - builder.append(""); - builder.append(""); - builder.append(""); - builder.append("\n"); - - final List prefixes = (inf != null && inf.endsWith("si")) ? it_reflexive_pronouns : it_empty; - - String style = " style=\"background:#c0cfe4\""; - outputDataRow(appendAndIndexWikiCallback, style, "indicativo", style, "th", "", new LinkedHashMap(it_indicativePronouns), it_empty, false); - outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "pres", namedArgs, prefixes, true); - outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "imperf", namedArgs, prefixes, true); - outputDataRow(appendAndIndexWikiCallback, style, "passato remoto", "", "td", "prem", namedArgs, prefixes, true); - outputDataRow(appendAndIndexWikiCallback, style, "futuro", "", "td", "fut", namedArgs, prefixes, true); - - style = " style=\"background:#c0d8e4\""; - outputDataRow(appendAndIndexWikiCallback, style, "condizionale", style, "th", "", new LinkedHashMap(it_indicativePronouns), it_empty, false); - outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "cond", namedArgs, prefixes, true); - - style = " style=\"background:#c0e4c0\""; - outputDataRow(appendAndIndexWikiCallback, style, "congiuntivo", style, "th", "", new LinkedHashMap(it_subjunctivePronouns), it_empty, false); - namedArgs.put("sub3s2", namedArgs.remove("sub3s")); - namedArgs.put("sub1s", namedArgs.get("sub123s")); - namedArgs.put("sub2s", namedArgs.get("sub123s")); - namedArgs.put("sub3s", namedArgs.remove("sub123s")); - namedArgs.put("sub1s2", namedArgs.get("sub123s2")); - namedArgs.put("sub2s2", namedArgs.get("sub123s2")); - namedArgs.put("sub3s2", namedArgs.remove("sub123s2")); - outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "sub", namedArgs, prefixes, true); - namedArgs.put("impsub1s", namedArgs.get("impsub12s")); - namedArgs.put("impsub2s", namedArgs.remove("impsub12s")); - namedArgs.put("impsub1s2", namedArgs.get("impsub12s2")); - namedArgs.put("impsub2s2", namedArgs.remove("impsub12s2")); - outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "impsub", namedArgs, prefixes, true); - - style = " style=\"background:#e4d4c0\""; - outputDataRow(appendAndIndexWikiCallback, style, "imperativo", style, "th", "", new LinkedHashMap(it_imperativePronouns), it_empty, false); - outputDataRow(appendAndIndexWikiCallback, style, "", "", "td", "imp", namedArgs, it_empty, false); // these are attached to the stem. - - builder.append("
infinito"); - appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "inf", "-"), null); - builder.append("
verbo ausiliare"); - appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "aux", "-"), null); - builder.append("gerundio"); - outputKeyVariations(appendAndIndexWikiCallback, builder, "ger", namedArgs, true); - builder.append("
participio presente"); - outputKeyVariations(appendAndIndexWikiCallback, builder, "presp", namedArgs, true); - builder.append("participio passato"); - outputKeyVariations(appendAndIndexWikiCallback, builder, "pastp", namedArgs, true); - builder.append("
\n"); - - if (!namedArgs.isEmpty()) { - System.err.println("NON-EMPTY namedArgs: " + namedArgs); - if ("muovesse".equals(namedArgs.get("impsib3s2"))) { - return false; - } - if ("percuotesse".equals(namedArgs.get("impsib3s2"))) { + } + + static final Map it_indicativePronouns = new LinkedHashMap(); + static { + it_indicativePronouns.put("1s", "io"); + it_indicativePronouns.put("2s", "tu"); + it_indicativePronouns.put("3s", "lui/lei"); + it_indicativePronouns.put("1p", "noi"); + it_indicativePronouns.put("2p", "voi"); + it_indicativePronouns.put("3p", "essi/esse"); + } + + static final Map it_subjunctivePronouns = new LinkedHashMap(); + static { + it_subjunctivePronouns.put("1s", "che io"); + it_subjunctivePronouns.put("2s", "che tu"); + it_subjunctivePronouns.put("3s", "che lui/lei"); + it_subjunctivePronouns.put("1p", "che noi"); + it_subjunctivePronouns.put("2p", "che voi"); + it_subjunctivePronouns.put("3p", "che essi/esse"); + } + + static final Map it_imperativePronouns = new LinkedHashMap(); + static { + it_imperativePronouns.put("1s", "-"); + it_imperativePronouns.put("2s", "tu"); + it_imperativePronouns.put("3s", "lui/lei"); + it_imperativePronouns.put("1p", "noi"); + it_imperativePronouns.put("2p", "voi"); + it_imperativePronouns.put("3p", "essi/esse"); + } + + + static final class it_conj implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + final StringBuilder builder = appendAndIndexWikiCallback.builder; + + final String inf = namedArgs.get("inf"); + + // TODO: center everything horizontally. + builder.append(""); + + builder.append(""); + builder.append(""); + builder.append(""); + builder.append("\n"); + + builder.append(""); + builder.append(""); + builder.append(""); + builder.append(""); + builder.append(""); + builder.append("\n"); + + builder.append(""); + builder.append(""); + builder.append(""); + builder.append(""); + builder.append(""); + builder.append("\n"); + + final List prefixes = (inf != null && inf.endsWith("si")) ? it_reflexive_pronouns : it_empty; + + String style = " style=\"background:#c0cfe4\""; + outputDataRow(appendAndIndexWikiCallback, style, "indicativo", style, "th", "", new LinkedHashMap(it_indicativePronouns), it_empty, false); + outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "pres", namedArgs, prefixes, true); + outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "imperf", namedArgs, prefixes, true); + outputDataRow(appendAndIndexWikiCallback, style, "passato remoto", "", "td", "prem", namedArgs, prefixes, true); + outputDataRow(appendAndIndexWikiCallback, style, "futuro", "", "td", "fut", namedArgs, prefixes, true); + + style = " style=\"background:#c0d8e4\""; + outputDataRow(appendAndIndexWikiCallback, style, "condizionale", style, "th", "", new LinkedHashMap(it_indicativePronouns), it_empty, false); + outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "cond", namedArgs, prefixes, true); + + style = " style=\"background:#c0e4c0\""; + outputDataRow(appendAndIndexWikiCallback, style, "congiuntivo", style, "th", "", new LinkedHashMap(it_subjunctivePronouns), it_empty, false); + namedArgs.put("sub3s2", namedArgs.remove("sub3s")); + namedArgs.put("sub1s", namedArgs.get("sub123s")); + namedArgs.put("sub2s", namedArgs.get("sub123s")); + namedArgs.put("sub3s", namedArgs.remove("sub123s")); + namedArgs.put("sub1s2", namedArgs.get("sub123s2")); + namedArgs.put("sub2s2", namedArgs.get("sub123s2")); + namedArgs.put("sub3s2", namedArgs.remove("sub123s2")); + outputDataRow(appendAndIndexWikiCallback, style, "presente", "", "td", "sub", namedArgs, prefixes, true); + namedArgs.put("impsub1s", namedArgs.get("impsub12s")); + namedArgs.put("impsub2s", namedArgs.remove("impsub12s")); + namedArgs.put("impsub1s2", namedArgs.get("impsub12s2")); + namedArgs.put("impsub2s2", namedArgs.remove("impsub12s2")); + outputDataRow(appendAndIndexWikiCallback, style, "imperfetto", "", "td", "impsub", namedArgs, prefixes, true); + + style = " style=\"background:#e4d4c0\""; + outputDataRow(appendAndIndexWikiCallback, style, "imperativo", style, "th", "", new LinkedHashMap(it_imperativePronouns), it_empty, false); + outputDataRow(appendAndIndexWikiCallback, style, "", "", "td", "imp", namedArgs, it_empty, false); // these are attached to the stem. + + builder.append("
infinito"); + appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "inf", "-"), null); + builder.append("
verbo ausiliare"); + appendAndIndexWikiCallback.dispatch(MapUtil.safeRemove(namedArgs, "aux", "-"), null); + builder.append("gerundio"); + outputKeyVariations(appendAndIndexWikiCallback, builder, "ger", namedArgs, true); + builder.append("
participio presente"); + outputKeyVariations(appendAndIndexWikiCallback, builder, "presp", namedArgs, true); + builder.append("participio passato"); + outputKeyVariations(appendAndIndexWikiCallback, builder, "pastp", namedArgs, true); + builder.append("
\n"); + + if (!namedArgs.isEmpty()) { + System.err.println("NON-EMPTY namedArgs: " + namedArgs); + if ("muovesse".equals(namedArgs.get("impsib3s2"))) { + return false; + } + if ("percuotesse".equals(namedArgs.get("impsib3s2"))) { + return false; + } + // Too many to deal with: + //assert false; return false; } - // Too many to deal with: - //assert false; - return false; - } - return true; - } + return true; + } private void outputDataRow(AppendAndIndexWikiCallback appendAndIndexWikiCallback, - String col1Style, String headerName, - String col2Style, final String type2, - String moodName, Map namedArgs, final List prefixes, final boolean isForm) { + String col1Style, String headerName, + String col2Style, final String type2, + String moodName, Map namedArgs, final List prefixes, final boolean isForm) { final StringBuilder builder = appendAndIndexWikiCallback.builder; builder.append(""); builder.append("").append(headerName).append(""); @@ -1183,49 +1188,49 @@ static final class it_conj_are implements Fu builder.append("\n"); } } - - static void passThroughOrFillIn(final Map namedArgs, final String key, final String fillIn, final boolean quoteToEmpty) { - final String value = namedArgs.get(key); - if (quoteToEmpty && "''".equals(value)) { - namedArgs.put(key, ""); - return; - } - if (value == null || value.equals("")) { - namedArgs.put(key, fillIn); - } - } - - static final List it_number_s_p = Arrays.asList("s", "p"); - static final List it_person_1_2_3 = Arrays.asList("1", "2", "3"); - static final List it_reflexive_pronouns = Arrays.asList("mi ", "ti ", "si ", "ci ", "vi ", "si "); - static final List it_empty = Arrays.asList("", "", "", "", "", ""); - static void it_conj_passMood(final Map namedArgs, final String moodName, final boolean quoteToEmpty, final String root, final List suffixes) { - assert suffixes.size() == 6; - int i = 0; - for (final String number : it_number_s_p) { - for (final String person : it_person_1_2_3) { - passThroughOrFillIn(namedArgs, String.format("%s%s%s", moodName, person, number), root + suffixes.get(i), quoteToEmpty); - ++i; - } - } - } - - private static void outputKeyVariations(AppendAndIndexWikiCallback appendAndIndexWikiCallback, - final StringBuilder builder, final String keyBase, Map namedArgs, boolean isForm) { - for (int suffix = 0; suffix <= 4; ++suffix) { - final String key = suffix == 0 ? keyBase : keyBase + suffix; - final String val = namedArgs.remove(key); - if (val != null && !val.trim().equals("")) { - if (suffix > 0) { - builder.append(", "); - } - appendAndIndexWikiCallback.dispatch(val, null); - if (isForm) { - appendAndIndexWikiCallback.parser.addLinkToCurrentEntry(val, null, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); + + static void passThroughOrFillIn(final Map namedArgs, final String key, final String fillIn, final boolean quoteToEmpty) { + final String value = namedArgs.get(key); + if (quoteToEmpty && "''".equals(value)) { + namedArgs.put(key, ""); + return; + } + if (value == null || value.equals("")) { + namedArgs.put(key, fillIn); + } + } + + static final List it_number_s_p = Arrays.asList("s", "p"); + static final List it_person_1_2_3 = Arrays.asList("1", "2", "3"); + static final List it_reflexive_pronouns = Arrays.asList("mi ", "ti ", "si ", "ci ", "vi ", "si "); + static final List it_empty = Arrays.asList("", "", "", "", "", ""); + static void it_conj_passMood(final Map namedArgs, final String moodName, final boolean quoteToEmpty, final String root, final List suffixes) { + assert suffixes.size() == 6; + int i = 0; + for (final String number : it_number_s_p) { + for (final String person : it_person_1_2_3) { + passThroughOrFillIn(namedArgs, String.format("%s%s%s", moodName, person, number), root + suffixes.get(i), quoteToEmpty); + ++i; + } + } + } + + private static void outputKeyVariations(AppendAndIndexWikiCallback appendAndIndexWikiCallback, + final StringBuilder builder, final String keyBase, Map namedArgs, boolean isForm) { + for (int suffix = 0; suffix <= 4; ++suffix) { + final String key = suffix == 0 ? keyBase : keyBase + suffix; + final String val = namedArgs.remove(key); + if (val != null && !val.trim().equals("")) { + if (suffix > 0) { + builder.append(", "); + } + appendAndIndexWikiCallback.dispatch(val, null); + if (isForm) { + appendAndIndexWikiCallback.parser.addLinkToCurrentEntry(val, null, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); + } } } } - } } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java index 1c7f912..d15cc92 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnParser.java @@ -28,127 +28,127 @@ import com.hughes.android.dictionary.parser.WikiTokenizer; public abstract class EnParser extends AbstractWiktionaryParser { - // TODO: process {{ttbc}} lines - - public static final Pattern partOfSpeechHeader = Pattern.compile( - "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + - "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + - "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + - "\\{\\{abbreviation\\}\\}|" + - // These are @deprecated: - "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + - "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + - // These are extras I found: - "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + - "Particle|Interjection|Pronominal adverb|" + - "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); - - static final Set USELESS_WIKI_ARGS = new LinkedHashSet( - Arrays.asList( - "lang", - "sc", - "sort", - "cat", - "cat2", - "xs", - "nodot")); - - static boolean isIgnorableTitle(final String title) { - return title.startsWith("Wiktionary:") || - title.startsWith("Template:") || - title.startsWith("Appendix:") || - title.startsWith("Category:") || - title.startsWith("Index:") || - title.startsWith("MediaWiki:") || - title.startsWith("TransWiki:") || - title.startsWith("Citations:") || - title.startsWith("Concordance:") || - title.startsWith("Help:"); - } - - final IndexBuilder enIndexBuilder; - final IndexBuilder foreignIndexBuilder; - final Pattern langPattern; - final Pattern langCodePattern; - final boolean swap; - - // State used while parsing. - enum State { - TRANSLATION_LINE, - ENGLISH_DEF_OF_FOREIGN, - ENGLISH_EXAMPLE, - FOREIGN_EXAMPLE, - } - State state = null; - - public boolean entryIsFormOfSomething = false; - final Collection wordForms = new ArrayList(); - boolean titleAppended = false; - - - final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexCallback(this); - { - appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT); - for (final String key : new ArrayList(appendAndIndexWikiCallback.functionCallbacks.keySet())) { - // Don't handle the it-conj functions here. - if (key.startsWith("it-conj")) { - appendAndIndexWikiCallback.functionCallbacks.remove(key); + // TODO: process {{ttbc}} lines + + public static final Pattern partOfSpeechHeader = Pattern.compile( + "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + + "Ligature|Idiom|Phrase|\\{\\{acronym\\}\\}|\\{\\{initialism\\}\\}|" + + "\\{\\{abbreviation\\}\\}|" + + // These are @deprecated: + "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + + // These are extras I found: + "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + + "Particle|Interjection|Pronominal adverb|" + + "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); + + static final Set USELESS_WIKI_ARGS = new LinkedHashSet( + Arrays.asList( + "lang", + "sc", + "sort", + "cat", + "cat2", + "xs", + "nodot")); + + static boolean isIgnorableTitle(final String title) { + return title.startsWith("Wiktionary:") || + title.startsWith("Template:") || + title.startsWith("Appendix:") || + title.startsWith("Category:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("TransWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Help:"); + } + + final IndexBuilder enIndexBuilder; + final IndexBuilder foreignIndexBuilder; + final Pattern langPattern; + final Pattern langCodePattern; + final boolean swap; + + // State used while parsing. + enum State { + TRANSLATION_LINE, + ENGLISH_DEF_OF_FOREIGN, + ENGLISH_EXAMPLE, + FOREIGN_EXAMPLE, + } + State state = null; + + public boolean entryIsFormOfSomething = false; + final Collection wordForms = new ArrayList(); + boolean titleAppended = false; + + + final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexCallback(this); + { + appendAndIndexWikiCallback.functionCallbacks.putAll(EnFunctionCallbacks.DEFAULT); + for (final String key : new ArrayList(appendAndIndexWikiCallback.functionCallbacks.keySet())) { + // Don't handle the it-conj functions here. + if (key.startsWith("it-conj")) { + appendAndIndexWikiCallback.functionCallbacks.remove(key); + } } } - } - - EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { - this.enIndexBuilder = enIndexBuilder; - this.foreignIndexBuilder = otherIndexBuilder; - this.langPattern = langPattern; - this.langCodePattern = langCodePattern; - this.swap = swap; - } - - @Override - void removeUselessArgs(Map namedArgs) { - namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); - } - - static class AppendAndIndexCallback extends AppendAndIndexWikiCallback { - - public AppendAndIndexCallback(EnParser parser) { - super(parser); + + EnParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { + this.enIndexBuilder = enIndexBuilder; + this.foreignIndexBuilder = otherIndexBuilder; + this.langPattern = langPattern; + this.langCodePattern = langCodePattern; + this.swap = swap; } @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - final String text = wikiTokenizer.wikiLinkText(); - final String link = wikiTokenizer.wikiLinkDest(); - if (link != null) { - if (link.contains("#English")) { - dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); - } else if (link.contains("#") && parser.langPattern.matcher(link).find()) { - dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); - } else if (link.equals("plural")) { - builder.append(text); - } else { - //LOG.warning("Special link: " + englishTokenizer.token()); - dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + void removeUselessArgs(Map namedArgs) { + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + } + + static class AppendAndIndexCallback extends AppendAndIndexWikiCallback { + + public AppendAndIndexCallback(EnParser parser) { + super(parser); } - } else { - // link == null - final EntryTypeName entryTypeName; - switch (parser.state) { - case TRANSLATION_LINE: - entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT; - break; - case ENGLISH_DEF_OF_FOREIGN: - entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; - break; - default: - throw new IllegalStateException("Invalid enum value: " + parser.state); + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String text = wikiTokenizer.wikiLinkText(); + final String link = wikiTokenizer.wikiLinkDest(); + if (link != null) { + if (link.contains("#English")) { + dispatch(text, parser.enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } else if (link.contains("#") && parser.langPattern.matcher(link).find()) { + dispatch(text, parser.foreignIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else if (link.equals("plural")) { + builder.append(text); + } else { + //LOG.warning("Special link: " + englishTokenizer.token()); + dispatch(text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } + } else { + // link == null + final EntryTypeName entryTypeName; + switch (parser.state) { + case TRANSLATION_LINE: + entryTypeName = EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT; + break; + case ENGLISH_DEF_OF_FOREIGN: + entryTypeName = EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK; + break; + default: + throw new IllegalStateException("Invalid enum value: " + parser.state); + } + dispatch(text, entryTypeName); + } } - dispatch(text, entryTypeName); - } + } - - } } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java index d37c0e3..8c9683c 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java @@ -27,205 +27,205 @@ import com.hughes.android.dictionary.parser.WikiTokenizer; public final class EnToTranslationParser extends EnParser { public EnToTranslationParser(final IndexBuilder enIndexBuilder, - final IndexBuilder otherIndexBuilder, final Pattern langPattern, - final Pattern langCodePattern, final boolean swap) { - super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); + final IndexBuilder otherIndexBuilder, final Pattern langPattern, + final Pattern langCodePattern, final boolean swap) { + super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); } @Override void parseSection(String heading, String text) { - if (isIgnorableTitle(title)) { - return; - } - heading = heading.replace("=", "").trim(); - if (!heading.contains("English")) { - return; - } - - String pos = null; - int posDepth = -1; - - final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); - while (wikiTokenizer.nextToken() != null) { - - if (wikiTokenizer.isHeading()) { - final String headerName = wikiTokenizer.headingWikiText(); - - if (wikiTokenizer.headingDepth() <= posDepth) { - pos = null; - posDepth = -1; - } - - if (partOfSpeechHeader.matcher(headerName).matches()) { - posDepth = wikiTokenizer.headingDepth(); - pos = wikiTokenizer.headingWikiText(); - // TODO: if we're inside the POS section, we should handle the first title line... - - } else if (headerName.equals("Translations")) { - if (pos == null) { - LOG.info("Translations without POS (but using anyway): " + title); + if (isIgnorableTitle(title)) { + return; + } + heading = heading.replace("=", "").trim(); + if (!heading.contains("English")) { + return; + } + + String pos = null; + int posDepth = -1; + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + + if (wikiTokenizer.isHeading()) { + final String headerName = wikiTokenizer.headingWikiText(); + + if (wikiTokenizer.headingDepth() <= posDepth) { + pos = null; + posDepth = -1; + } + + if (partOfSpeechHeader.matcher(headerName).matches()) { + posDepth = wikiTokenizer.headingDepth(); + pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + + } else if (headerName.equals("Translations")) { + if (pos == null) { + LOG.info("Translations without POS (but using anyway): " + title); + } + doTranslations(wikiTokenizer, pos); + } else if (headerName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } + } else if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + if (name.equals("head") && pos == null) { + LOG.warning("{{head}} without POS: " + title); + } } - doTranslations(wikiTokenizer, pos); - } else if (headerName.equals("Pronunciation")) { - //doPronunciation(wikiLineReader); - } - } else if (wikiTokenizer.isFunction()) { - final String name = wikiTokenizer.functionName(); - if (name.equals("head") && pos == null) { - LOG.warning("{{head}} without POS: " + title); - } } - } } private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) { - if (title.equals("absolutely")) { - //System.out.println(); - } - - String topLevelLang = null; - String sense = null; - boolean done = false; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - wikiTokenizer.returnToLineStart(); - return; - } - if (done) { - continue; + if (title.equals("absolutely")) { + //System.out.println(); } - - // Check whether we care about this line: - - if (wikiTokenizer.isFunction()) { - final String functionName = wikiTokenizer.functionName(); - final List positionArgs = wikiTokenizer.functionPositionArgs(); - - if (functionName.equals("trans-top")) { - sense = null; - if (wikiTokenizer.functionPositionArgs().size() >= 1) { - sense = positionArgs.get(0); - sense = WikiTokenizer.toPlainText(sense); - //LOG.info("Sense: " + sense); + + String topLevelLang = null; + String sense = null; + boolean done = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + wikiTokenizer.returnToLineStart(); + return; + } + if (done) { + continue; } - } else if (functionName.equals("trans-bottom")) { - sense = null; - } else if (functionName.equals("trans-mid")) { - } else if (functionName.equals("trans-see")) { - incrementCount("WARNING:trans-see"); - } else if (functionName.startsWith("picdic")) { - } else if (functionName.startsWith("checktrans")) { - done = true; - } else if (functionName.startsWith("ttbc")) { - wikiTokenizer.nextLine(); - // TODO: would be great to handle ttbc - // TODO: Check this: done = true; - } else { - LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isListItem()) { - final String line = wikiTokenizer.listItemWikiText(); - // This line could produce an output... - + + // Check whether we care about this line: + + if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List positionArgs = wikiTokenizer.functionPositionArgs(); + + if (functionName.equals("trans-top")) { + sense = null; + if (wikiTokenizer.functionPositionArgs().size() >= 1) { + sense = positionArgs.get(0); + sense = WikiTokenizer.toPlainText(sense); + //LOG.info("Sense: " + sense); + } + } else if (functionName.equals("trans-bottom")) { + sense = null; + } else if (functionName.equals("trans-mid")) { + } else if (functionName.equals("trans-see")) { + incrementCount("WARNING:trans-see"); + } else if (functionName.startsWith("picdic")) { + } else if (functionName.startsWith("checktrans")) { + done = true; + } else if (functionName.startsWith("ttbc")) { + wikiTokenizer.nextLine(); + // TODO: would be great to handle ttbc + // TODO: Check this: done = true; + } else { + LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isListItem()) { + final String line = wikiTokenizer.listItemWikiText(); + // This line could produce an output... + // if (line.contains("ich hoan dich gear")) { // //System.out.println(); // } - - // First strip the language and check whether it matches. - // And hold onto it for sub-lines. - final int colonIndex = line.indexOf(":"); - if (colonIndex == -1) { - continue; - } - - final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); - incrementCount("tCount:" + lang); - final boolean appendLang; - if (wikiTokenizer.listItemPrefix().length() == 1) { - topLevelLang = lang; - final boolean thisFind = langPattern.matcher(lang).find(); - if (!thisFind) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } else if (topLevelLang == null) { - continue; - } else { - // Two-level -- the only way we won't append is if this second level matches exactly. - if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { - continue; + + // First strip the language and check whether it matches. + // And hold onto it for sub-lines. + final int colonIndex = line.indexOf(":"); + if (colonIndex == -1) { + continue; + } + + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + incrementCount("tCount:" + lang); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { + continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } + + String rest = line.substring(colonIndex + 1).trim(); + if (rest.length() > 0) { + doTranslationLine(line, appendLang ? lang : null, pos, sense, rest); + } + + } else if (wikiTokenizer.remainderStartsWith("''See''")) { + wikiTokenizer.nextLine(); + incrementCount("WARNING: ''See''" ); + LOG.fine("Skipping See line: " + wikiTokenizer.token()); + } else if (wikiTokenizer.isWikiLink()) { + final String wikiLink = wikiTokenizer.wikiLinkText(); + if (wikiLink.contains(":") && wikiLink.contains(title)) { + } else if (wikiLink.contains("Category:")) { + } else { + incrementCount("WARNING: Unexpected wikiLink" ); + LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { + } else { + final String token = wikiTokenizer.token(); + if (token.equals("----")) { + } else { + LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); + incrementCount("WARNING: Unexpected translation token" ); + } } - appendLang = !langPattern.matcher(lang).matches(); - } - - String rest = line.substring(colonIndex + 1).trim(); - if (rest.length() > 0) { - doTranslationLine(line, appendLang ? lang : null, pos, sense, rest); - } - - } else if (wikiTokenizer.remainderStartsWith("''See''")) { - wikiTokenizer.nextLine(); - incrementCount("WARNING: ''See''" ); - LOG.fine("Skipping See line: " + wikiTokenizer.token()); - } else if (wikiTokenizer.isWikiLink()) { - final String wikiLink = wikiTokenizer.wikiLinkText(); - if (wikiLink.contains(":") && wikiLink.contains(title)) { - } else if (wikiLink.contains("Category:")) { - } else { - incrementCount("WARNING: Unexpected wikiLink" ); - LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { - } else { - final String token = wikiTokenizer.token(); - if (token.equals("----")) { - } else { - LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); - incrementCount("WARNING: Unexpected translation token" ); - } + } - - } } - + private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) { - state = State.TRANSLATION_LINE; - // Good chance we'll actually file this one... - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - indexedEntry.isValid = true; - - final StringBuilder foreignText = new StringBuilder(); - appendAndIndexWikiCallback.reset(foreignText, indexedEntry); - appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - - if (foreignText.length() == 0) { - LOG.warning("Empty foreignText: " + line); - incrementCount("WARNING: Empty foreignText" ); - return; - } - - if (lang != null) { - foreignText.insert(0, String.format("(%s) ", lang)); - } - - StringBuilder englishText = new StringBuilder(); - - englishText.append(title); - if (sense != null) { - englishText.append(" (").append(sense).append(")"); - enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); - } - if (pos != null) { - englishText.append(" (").append(pos.toLowerCase()).append(")"); - } - enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); - pairEntry.pairs.add(pair); - if (!pairsAdded.add(pair.toString())) { - LOG.warning("Duplicate pair: " + pair.toString()); - incrementCount("WARNING: Duplicate pair" ); - } + state = State.TRANSLATION_LINE; + // Good chance we'll actually file this one... + final PairEntry pairEntry = new PairEntry(entrySource); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + indexedEntry.isValid = true; + + final StringBuilder foreignText = new StringBuilder(); + appendAndIndexWikiCallback.reset(foreignText, indexedEntry); + appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + if (foreignText.length() == 0) { + LOG.warning("Empty foreignText: " + line); + incrementCount("WARNING: Empty foreignText" ); + return; + } + + if (lang != null) { + foreignText.insert(0, String.format("(%s) ", lang)); + } + + StringBuilder englishText = new StringBuilder(); + + englishText.append(title); + if (sense != null) { + englishText.append(" (").append(sense).append(")"); + enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); + } + if (pos != null) { + englishText.append(" (").append(pos.toLowerCase()).append(")"); + } + enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); + pairEntry.pairs.add(pair); + if (!pairsAdded.add(pair.toString())) { + LOG.warning("Duplicate pair: " + pair.toString()); + incrementCount("WARNING: Duplicate pair" ); + } } - } // EnToTranslationParser +} // EnToTranslationParser diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java index 8025021..14cf43c 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnTranslationToTranslationParser.java @@ -30,126 +30,126 @@ import com.hughes.android.dictionary.parser.wiktionary.EnFunctionCallbacks.Trans import com.hughes.util.ListUtil; public final class EnTranslationToTranslationParser extends AbstractWiktionaryParser { - + final List indexBuilders; final Pattern[] langCodePatterns; PairEntry pairEntry = null; IndexedEntry indexedEntry = null; - StringBuilder[] builders = null; - - public static final String NAME = "EnTranslationToTranslation"; - - final Set Ts = new LinkedHashSet(Arrays.asList("t", "t+", - "t-", "tø", "apdx-t", "ttbc")); - + StringBuilder[] builders = null; + + public static final String NAME = "EnTranslationToTranslation"; + + final Set Ts = new LinkedHashSet(Arrays.asList("t", "t+", + "t-", "tø", "apdx-t", "ttbc")); + public EnTranslationToTranslationParser(final List indexBuilders, - final Pattern[] langCodePatterns) { - this.indexBuilders = indexBuilders; - this.langCodePatterns = langCodePatterns; + final Pattern[] langCodePatterns) { + this.indexBuilders = indexBuilders; + this.langCodePatterns = langCodePatterns; } - + @Override void removeUselessArgs(Map namedArgs) { - namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); + namedArgs.keySet().removeAll(EnParser.USELESS_WIKI_ARGS); } - + @Override void parseSection(String heading, String text) { - if (EnParser.isIgnorableTitle(title)) { - return; - } - final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() { - @Override - public void onFunction(WikiTokenizer wikiTokenizer, String name, - List functionPositionArgs, - Map functionNamedArgs) { - //System.out.println(wikiTokenizer.token()); - if (Ts.contains(name)) { - onT(wikiTokenizer); - } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) { - startEntry(title, wikiTokenizer.token()); - } else if (name.equals("trans-bottom")) { - finishEntry(title); - } + if (EnParser.isIgnorableTitle(title)) { + return; } + final WikiTokenizer.Callback callback = new WikiTokenizer.DoNothingCallback() { + @Override + public void onFunction(WikiTokenizer wikiTokenizer, String name, + List functionPositionArgs, + Map functionNamedArgs) { + //System.out.println(wikiTokenizer.token()); + if (Ts.contains(name)) { + onT(wikiTokenizer); + } else if (name.equals("trans-top") || name.equals("checktrans-top") || name.equals("checktrans")) { + startEntry(title, wikiTokenizer.token()); + } else if (name.equals("trans-bottom")) { + finishEntry(title); + } + } - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this); + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + WikiTokenizer.dispatch(wikiTokenizer.listItemWikiText(), false, this); + } + }; + WikiTokenizer.dispatch(text, true, callback); + + if (builders != null) { + LOG.warning("unended translations: " + title); + finishEntry(title); } - }; - WikiTokenizer.dispatch(text, true, callback); - - if (builders != null) { - LOG.warning("unended translations: " + title); - finishEntry(title); - } - } - - final TranslationCallback translationCallback = new TranslationCallback(); - - final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexWikiCallback( - this); - { - for (final String t : Ts) { - appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback); } - } - - private void onT(WikiTokenizer wikiTokenizer) { - if (builders == null) { - LOG.warning("{{t...}} section outside of {{trans-top}}: " + title); - startEntry(title, "QUICKDIC_OUTSIDE"); - } - - final List args = wikiTokenizer.functionPositionArgs(); - final String langCode = ListUtil.get(args, 0); - if (langCode == null) { - LOG.warning("Missing langCode: " + wikiTokenizer.token()); - return; + + final TranslationCallback translationCallback = new TranslationCallback(); + + final AppendAndIndexWikiCallback appendAndIndexWikiCallback = new AppendAndIndexWikiCallback( + this); + { + for (final String t : Ts) { + appendAndIndexWikiCallback.functionCallbacks.put(t, translationCallback); + } } - for (int p = 0; p < 2; ++p) { - if (langCodePatterns[p].matcher(langCode).matches()) { - appendAndIndexWikiCallback.builder = builders[p]; - if (appendAndIndexWikiCallback.builder.length() > 0) { - appendAndIndexWikiCallback.builder.append(", "); + + private void onT(WikiTokenizer wikiTokenizer) { + if (builders == null) { + LOG.warning("{{t...}} section outside of {{trans-top}}: " + title); + startEntry(title, "QUICKDIC_OUTSIDE"); + } + + final List args = wikiTokenizer.functionPositionArgs(); + final String langCode = ListUtil.get(args, 0); + if (langCode == null) { + LOG.warning("Missing langCode: " + wikiTokenizer.token()); + return; + } + for (int p = 0; p < 2; ++p) { + if (langCodePatterns[p].matcher(langCode).matches()) { + appendAndIndexWikiCallback.builder = builders[p]; + if (appendAndIndexWikiCallback.builder.length() > 0) { + appendAndIndexWikiCallback.builder.append(", "); + } + appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p); + appendAndIndexWikiCallback.onFunction(wikiTokenizer, + wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(), + wikiTokenizer.functionNamedArgs()); + } } - appendAndIndexWikiCallback.indexBuilder = indexBuilders.get(p); - appendAndIndexWikiCallback.onFunction(wikiTokenizer, - wikiTokenizer.functionName(), wikiTokenizer.functionPositionArgs(), - wikiTokenizer.functionNamedArgs()); - } } - } void startEntry(final String title, final String func) { - if (pairEntry != null) { - LOG.warning("startEntry() twice: " + title + ", " + func); - finishEntry(title); - } - - pairEntry = new PairEntry(entrySource); - indexedEntry = new IndexedEntry(pairEntry); - builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() }; - appendAndIndexWikiCallback.indexedEntry = indexedEntry; + if (pairEntry != null) { + LOG.warning("startEntry() twice: " + title + ", " + func); + finishEntry(title); + } + + pairEntry = new PairEntry(entrySource); + indexedEntry = new IndexedEntry(pairEntry); + builders = new StringBuilder[] { new StringBuilder(), new StringBuilder() }; + appendAndIndexWikiCallback.indexedEntry = indexedEntry; } - + void finishEntry(final String title) { - if (pairEntry == null) { - LOG.warning("finalizeEntry() twice: " + title); - return; - } - final String lang1 = builders[0].toString(); - final String lang2 = builders[1].toString(); - if (lang1.length() > 0 && lang2.length() > 0) { - pairEntry.pairs.add(new Pair(lang1, lang2)); - indexedEntry.isValid = true; - } - - pairEntry = null; - indexedEntry = null; - builders = null; + if (pairEntry == null) { + LOG.warning("finalizeEntry() twice: " + title); + return; + } + final String lang1 = builders[0].toString(); + final String lang2 = builders[1].toString(); + if (lang1.length() > 0 && lang2.length() > 0) { + pairEntry.pairs.add(new Pair(lang1, lang2)); + indexedEntry.isValid = true; + } + + pairEntry = null; + indexedEntry = null; + builders = null; } - } \ No newline at end of file +} \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java index 7727ad0..2edf3ac 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/FrFunctionCallbacks.java @@ -23,50 +23,50 @@ import java.util.List; import java.util.Map; class FrFunctionCallbacks { - - static void addGenericCallbacks(Map> callbacks) { - callbacks.put("-étym-", new Redispatch("\n==== Étymologie ====\n")); - callbacks.put("-pron-", new Redispatch("\n==== Prononciation ====\n")); - callbacks.put("-voir-", new Redispatch("\n==== Voir aussi ====\n")); - callbacks.put("-drv-", new Redispatch("\n==== Dérivés ====\n")); - callbacks.put("-syn-", new Redispatch("\n==== Synonymes ====\n")); - callbacks.put("-apr-", new Redispatch("\n==== Apparentés étymologiques ====\n")); - callbacks.put("-hyper-", new Redispatch("\n==== Hyperonymes ====\n")); - callbacks.put("-hypo-", new Redispatch("\n==== Hyponymes ====\n")); - callbacks.put("-réf-", new Redispatch("\n==== Références ====\n")); - callbacks.put("-homo-", new Redispatch("\n==== Homophones ====\n")); - callbacks.put("-anagr-", new Redispatch("\n==== Anagrammes ====\n")); - callbacks.put("-voc-", new Redispatch("\n==== Vocabulaire apparenté par le sens ====\n")); - callbacks.put("-exp-", new Redispatch("\n==== Expressions ====\n")); - callbacks.put("-note-", new Redispatch("\n==== Note ====\n")); + static void addGenericCallbacks(Map> callbacks) { + callbacks.put("-étym-", new Redispatch("\n==== Étymologie ====\n")); + callbacks.put("-pron-", new Redispatch("\n==== Prononciation ====\n")); + callbacks.put("-voir-", new Redispatch("\n==== Voir aussi ====\n")); + callbacks.put("-drv-", new Redispatch("\n==== Dérivés ====\n")); + callbacks.put("-syn-", new Redispatch("\n==== Synonymes ====\n")); - callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection()); - } + callbacks.put("-apr-", new Redispatch("\n==== Apparentés étymologiques ====\n")); + callbacks.put("-hyper-", new Redispatch("\n==== Hyperonymes ====\n")); + callbacks.put("-hypo-", new Redispatch("\n==== Hyponymes ====\n")); + callbacks.put("-réf-", new Redispatch("\n==== Références ====\n")); + callbacks.put("-homo-", new Redispatch("\n==== Homophones ====\n")); + callbacks.put("-anagr-", new Redispatch("\n==== Anagrammes ====\n")); + callbacks.put("-voc-", new Redispatch("\n==== Vocabulaire apparenté par le sens ====\n")); + callbacks.put("-exp-", new Redispatch("\n==== Expressions ====\n")); + callbacks.put("-note-", new Redispatch("\n==== Note ====\n")); - - static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); - - - static final class MakeHeadingFromName implements FunctionCallback { - final String header; - public MakeHeadingFromName(String header) { - this.header = header; + callbacks.put("-trad-", new ItFunctionCallbacks.SkipSection()); } - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - if (!namedArgs.isEmpty() || args.size() != 0) { - return false; + + static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + + + static final class MakeHeadingFromName implements FunctionCallback { + final String header; + public MakeHeadingFromName(String header) { + this.header = header; + } + + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + if (!namedArgs.isEmpty() || args.size() != 0) { + return false; + } + //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header)); + appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null); + //appendAndIndexWikiCallback.builder.append(String.format("\n", header)); + return true; } - //appendAndIndexWikiCallback.builder.append(String.format("<%s>", header)); - appendAndIndexWikiCallback.dispatch("\n" + header + name + header, null); - //appendAndIndexWikiCallback.builder.append(String.format("\n", header)); - return true; - } } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/FunctionCallback.java b/src/com/hughes/android/dictionary/parser/wiktionary/FunctionCallback.java index 550dd5d..059497e 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/FunctionCallback.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/FunctionCallback.java @@ -23,10 +23,10 @@ import com.hughes.android.dictionary.parser.wiktionary.AbstractWiktionaryParser. public interface FunctionCallback { boolean onWikiFunction( - final WikiTokenizer tokenizer, - final String name, - final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback); + final WikiTokenizer tokenizer, + final String name, + final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback); } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/ItFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/ItFunctionCallbacks.java index 3b089fd..8278ccd 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/ItFunctionCallbacks.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/ItFunctionCallbacks.java @@ -24,7 +24,7 @@ import java.util.Map; class ItFunctionCallbacks { static void addGenericCallbacks( - Map> callbacks) { + Map> callbacks) { callbacks.put("-hyph-", new Redispatch("\n==== Sillabazione ====\n")); callbacks.put("-pron-", new Redispatch("\n==== Pronuncia ====\n")); callbacks.put("-etim-", new Redispatch("\n==== Etimologia / Derivazione ====\n")); @@ -44,7 +44,7 @@ class ItFunctionCallbacks { static final NameAndArgs NAME_AND_ARGS = new NameAndArgs(); static final class Redispatch implements - FunctionCallback { + FunctionCallback { final String newText; public Redispatch(String newText) { @@ -53,10 +53,10 @@ class ItFunctionCallbacks { @Override public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, - final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { if (!namedArgs.isEmpty() || args.size() != 0) { return false; } @@ -66,18 +66,18 @@ class ItFunctionCallbacks { } static final class SkipSection implements - FunctionCallback { + FunctionCallback { public SkipSection() { } @Override public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, - final List args, - final Map namedArgs, - final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + final List args, + final Map namedArgs, + final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isFunction() + if (wikiTokenizer.isFunction() && wikiTokenizer.functionName().startsWith("-") && wikiTokenizer.functionName().endsWith("-") // Hack to prevent infinite-looping, would be better to check that this func was at the start of the line. diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index e861b9d..0066d3b 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -28,7 +28,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { boolean skipWikiLink(final WikiTokenizer wikiTokenizer); String adjustWikiLink(String wikiLinkDest, final String wikiLinkText); void addFunctionCallbacks( - Map> functionCallbacks); + Map> functionCallbacks); } static final Map isoToLangConfig = new LinkedHashMap(); static { @@ -38,7 +38,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public boolean skipSection(String headingText) { return enSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Synonyms")) { @@ -56,7 +56,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); @@ -82,11 +82,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); - + final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*"); isoToLangConfig.put("ES", new LangConfig() { @Override @@ -130,7 +130,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { // TODO: need Spanish variant } }); @@ -141,7 +141,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public boolean skipSection(String headingText) { return deSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Synonyme")) { @@ -152,7 +152,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); @@ -178,18 +178,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { DeFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); - + final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*"); isoToLangConfig.put("IT", new LangConfig() { @Override public boolean skipSection(String headingText) { return itSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Sinonimi")) { @@ -200,7 +200,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); @@ -226,7 +226,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { ItFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); @@ -238,7 +238,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public boolean skipSection(String headingText) { return frSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Synonymes")) { @@ -249,7 +249,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); @@ -275,7 +275,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { FrFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); @@ -286,10 +286,10 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { final String skipLangIso; final LangConfig langConfig; final String webUrlTemplate; - + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso, - final String webUrlTemplate) { + final String webUrlTemplate) { this.titleIndexBuilder = titleIndexBuilder; this.defIndexBuilder = defIndexBuilder; assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; @@ -297,7 +297,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { this.skipLangIso = skipLangIso; this.webUrlTemplate = webUrlTemplate; } - + IndexedEntry indexedEntry = null; @Override @@ -307,7 +307,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { indexedEntry = new IndexedEntry(htmlEntry); final AppendAndIndexWikiCallback callback = new AppendCallback( - this); + this); langConfig.addFunctionCallbacks(callback.functionCallbacks); callback.builder = new StringBuilder(); @@ -316,11 +316,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { if (webUrlTemplate != null) { final String webUrl = String.format(webUrlTemplate, title); - // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases. - try { - callback.builder.append(String.format("

%s", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl))); - } catch (Exception e) - {} + // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases. + try { + callback.builder.append(String.format("

%s", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl))); + } catch (Exception e) { + } } htmlEntry.html = callback.builder.toString(); indexedEntry.isValid = true; @@ -332,26 +332,26 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { tokenData.htmlEntries.add(htmlEntry); // titleIndexBuilder.addEntryWithString(indexedEntry, title, // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); - + indexedEntry = null; } @Override void removeUselessArgs(Map namedArgs) { } - + @Override public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) { if (lang == null || lang.equals(skipLangIso)) { titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); } } - + public static String escapeHtmlLiteral(final String plainText) { final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); if (StringUtil.isAscii(htmlEscaped)) { return htmlEscaped; - } else { + } else { return StringUtil.escapeUnicodeToPureHtml(plainText); } @@ -399,7 +399,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onFunction(WikiTokenizer wikiTokenizer, String name, - List args, Map namedArgs) { + List args, Map namedArgs) { if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { namedArgs.remove("lang"); } @@ -414,7 +414,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onNewline(WikiTokenizer wikiTokenizer) { } - + EntryTypeName sectionEntryTypeName; IndexBuilder currentIndexBuilder; @@ -451,7 +451,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { final String prefix = wikiTokenizer.listItemPrefix(); while (listPrefixStack.size() < prefix.length()) { builder.append(String.format("<%s>", - WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); + WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); listPrefixStack.add(prefix.charAt(listPrefixStack.size())); } builder.append("

  • "); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java index 7f52642..9dfa00a 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java @@ -23,195 +23,195 @@ import java.util.Set; import java.util.regex.Pattern; public class WiktionaryLangs { - - public static final Map isoCodeToEnWikiName = new LinkedHashMap(); - static { - isoCodeToEnWikiName.put("AF", "Afrikaans"); - isoCodeToEnWikiName.put("SQ", "Albanian"); - isoCodeToEnWikiName.put("AR", "Arabic"); - isoCodeToEnWikiName.put("HY", "Armenian"); - isoCodeToEnWikiName.put("BE", "Belarusian"); - isoCodeToEnWikiName.put("BN", "Bengali"); - isoCodeToEnWikiName.put("BG", "Bulgarian"); - isoCodeToEnWikiName.put("CA", "Catalan"); - isoCodeToEnWikiName.put("SH", "Serbo-Croatian"); - isoCodeToEnWikiName.put("CS", "Czech"); - isoCodeToEnWikiName.put("ZH", "Chinese"); - isoCodeToEnWikiName.put("cmn", "Mandarin"); - isoCodeToEnWikiName.put("yue", "Cantonese"); - isoCodeToEnWikiName.put("DA", "Danish"); - isoCodeToEnWikiName.put("NL", "Dutch"); - isoCodeToEnWikiName.put("EN", "English"); - isoCodeToEnWikiName.put("EO", "Esperanto"); - isoCodeToEnWikiName.put("ET", "Estonian"); - isoCodeToEnWikiName.put("FI", "Finnish"); - isoCodeToEnWikiName.put("FR", "French"); - isoCodeToEnWikiName.put("DE", "German"); - isoCodeToEnWikiName.put("grc", "Ancient Greek"); - isoCodeToEnWikiName.put("EL", "Greek"); - isoCodeToEnWikiName.put("haw", "Hawaiian"); - isoCodeToEnWikiName.put("HE", "Hebrew"); - isoCodeToEnWikiName.put("HI", "Hindi"); - isoCodeToEnWikiName.put("HU", "Hungarian"); - isoCodeToEnWikiName.put("IS", "Icelandic"); - isoCodeToEnWikiName.put("ID", "Indonesian"); - isoCodeToEnWikiName.put("GA", "Irish"); - isoCodeToEnWikiName.put("GD", "Gaelic"); - isoCodeToEnWikiName.put("GV", "Manx"); - isoCodeToEnWikiName.put("IT", "Italian"); - isoCodeToEnWikiName.put("LA", "Latin"); - isoCodeToEnWikiName.put("LV", "Latvian"); - isoCodeToEnWikiName.put("LT", "Lithuanian"); - isoCodeToEnWikiName.put("JA", "Japanese"); - isoCodeToEnWikiName.put("KO", "Korean"); - isoCodeToEnWikiName.put("KU", "Kurdish"); - isoCodeToEnWikiName.put("LO", "Lao"); - isoCodeToEnWikiName.put("ML", "Malayalam"); - isoCodeToEnWikiName.put("MS", "Malay"); - isoCodeToEnWikiName.put("MI", "Maori"); - isoCodeToEnWikiName.put("MN", "Mongolian"); - isoCodeToEnWikiName.put("NE", "Nepali"); - isoCodeToEnWikiName.put("NO", "Norwegian"); - isoCodeToEnWikiName.put("FA", "Persian"); - isoCodeToEnWikiName.put("PL", "Polish"); - isoCodeToEnWikiName.put("PT", "Portuguese"); - isoCodeToEnWikiName.put("PA", "Punjabi"); - isoCodeToEnWikiName.put("RO", "Romanian"); - isoCodeToEnWikiName.put("RU", "Russian"); - isoCodeToEnWikiName.put("SA", "Sanskrit"); - isoCodeToEnWikiName.put("SK", "Slovak"); - isoCodeToEnWikiName.put("SL", "Slovene|Slovenian"); - isoCodeToEnWikiName.put("SO", "Somali"); - isoCodeToEnWikiName.put("ES", "Spanish"); - isoCodeToEnWikiName.put("SW", "Swahili"); - isoCodeToEnWikiName.put("SV", "Swedish"); - isoCodeToEnWikiName.put("TL", "Tagalog"); - isoCodeToEnWikiName.put("TG", "Tajik"); - isoCodeToEnWikiName.put("TA", "Tamil"); - isoCodeToEnWikiName.put("TH", "Thai"); - isoCodeToEnWikiName.put("BO", "Tibetan"); - isoCodeToEnWikiName.put("TR", "Turkish"); - isoCodeToEnWikiName.put("UK", "Ukrainian"); - isoCodeToEnWikiName.put("UR", "Urdu"); - isoCodeToEnWikiName.put("VI", "Vietnamese"); - isoCodeToEnWikiName.put("CI", "Welsh"); - isoCodeToEnWikiName.put("YI", "Yiddish"); - isoCodeToEnWikiName.put("ZU", "Zulu"); - isoCodeToEnWikiName.put("AZ", "Azeri"); - isoCodeToEnWikiName.put("EU", "Basque"); - isoCodeToEnWikiName.put("BR", "Breton"); - isoCodeToEnWikiName.put("MR", "Marathi"); - isoCodeToEnWikiName.put("FO", "Faroese"); - isoCodeToEnWikiName.put("GL", "Galician"); - isoCodeToEnWikiName.put("KA", "Georgian"); - isoCodeToEnWikiName.put("HT", "Haitian Creole"); - isoCodeToEnWikiName.put("LB", "Luxembourgish"); - isoCodeToEnWikiName.put("MK", "Macedonian"); - isoCodeToEnWikiName.put("GV", "Manx"); - - // No longer exists in EN: - // isoCodeToEnWikiName.put("BS", "Bosnian"); - // isoCodeToEnWikiName.put("SR", "Serbian"); - // isoCodeToEnWikiName.put("HR", "Croatian"); - - // Font doesn't work: - //isoCodeToEnWikiName.put("MY", "Burmese"); - - - { - //Set missing = new LinkedHashSet(isoCodeToEnWikiName.keySet()); - //missing.removeAll(Language.isoCodeToResources.keySet()); - //System.out.println(missing); + + public static final Map isoCodeToEnWikiName = new LinkedHashMap(); + static { + isoCodeToEnWikiName.put("AF", "Afrikaans"); + isoCodeToEnWikiName.put("SQ", "Albanian"); + isoCodeToEnWikiName.put("AR", "Arabic"); + isoCodeToEnWikiName.put("HY", "Armenian"); + isoCodeToEnWikiName.put("BE", "Belarusian"); + isoCodeToEnWikiName.put("BN", "Bengali"); + isoCodeToEnWikiName.put("BG", "Bulgarian"); + isoCodeToEnWikiName.put("CA", "Catalan"); + isoCodeToEnWikiName.put("SH", "Serbo-Croatian"); + isoCodeToEnWikiName.put("CS", "Czech"); + isoCodeToEnWikiName.put("ZH", "Chinese"); + isoCodeToEnWikiName.put("cmn", "Mandarin"); + isoCodeToEnWikiName.put("yue", "Cantonese"); + isoCodeToEnWikiName.put("DA", "Danish"); + isoCodeToEnWikiName.put("NL", "Dutch"); + isoCodeToEnWikiName.put("EN", "English"); + isoCodeToEnWikiName.put("EO", "Esperanto"); + isoCodeToEnWikiName.put("ET", "Estonian"); + isoCodeToEnWikiName.put("FI", "Finnish"); + isoCodeToEnWikiName.put("FR", "French"); + isoCodeToEnWikiName.put("DE", "German"); + isoCodeToEnWikiName.put("grc", "Ancient Greek"); + isoCodeToEnWikiName.put("EL", "Greek"); + isoCodeToEnWikiName.put("haw", "Hawaiian"); + isoCodeToEnWikiName.put("HE", "Hebrew"); + isoCodeToEnWikiName.put("HI", "Hindi"); + isoCodeToEnWikiName.put("HU", "Hungarian"); + isoCodeToEnWikiName.put("IS", "Icelandic"); + isoCodeToEnWikiName.put("ID", "Indonesian"); + isoCodeToEnWikiName.put("GA", "Irish"); + isoCodeToEnWikiName.put("GD", "Gaelic"); + isoCodeToEnWikiName.put("GV", "Manx"); + isoCodeToEnWikiName.put("IT", "Italian"); + isoCodeToEnWikiName.put("LA", "Latin"); + isoCodeToEnWikiName.put("LV", "Latvian"); + isoCodeToEnWikiName.put("LT", "Lithuanian"); + isoCodeToEnWikiName.put("JA", "Japanese"); + isoCodeToEnWikiName.put("KO", "Korean"); + isoCodeToEnWikiName.put("KU", "Kurdish"); + isoCodeToEnWikiName.put("LO", "Lao"); + isoCodeToEnWikiName.put("ML", "Malayalam"); + isoCodeToEnWikiName.put("MS", "Malay"); + isoCodeToEnWikiName.put("MI", "Maori"); + isoCodeToEnWikiName.put("MN", "Mongolian"); + isoCodeToEnWikiName.put("NE", "Nepali"); + isoCodeToEnWikiName.put("NO", "Norwegian"); + isoCodeToEnWikiName.put("FA", "Persian"); + isoCodeToEnWikiName.put("PL", "Polish"); + isoCodeToEnWikiName.put("PT", "Portuguese"); + isoCodeToEnWikiName.put("PA", "Punjabi"); + isoCodeToEnWikiName.put("RO", "Romanian"); + isoCodeToEnWikiName.put("RU", "Russian"); + isoCodeToEnWikiName.put("SA", "Sanskrit"); + isoCodeToEnWikiName.put("SK", "Slovak"); + isoCodeToEnWikiName.put("SL", "Slovene|Slovenian"); + isoCodeToEnWikiName.put("SO", "Somali"); + isoCodeToEnWikiName.put("ES", "Spanish"); + isoCodeToEnWikiName.put("SW", "Swahili"); + isoCodeToEnWikiName.put("SV", "Swedish"); + isoCodeToEnWikiName.put("TL", "Tagalog"); + isoCodeToEnWikiName.put("TG", "Tajik"); + isoCodeToEnWikiName.put("TA", "Tamil"); + isoCodeToEnWikiName.put("TH", "Thai"); + isoCodeToEnWikiName.put("BO", "Tibetan"); + isoCodeToEnWikiName.put("TR", "Turkish"); + isoCodeToEnWikiName.put("UK", "Ukrainian"); + isoCodeToEnWikiName.put("UR", "Urdu"); + isoCodeToEnWikiName.put("VI", "Vietnamese"); + isoCodeToEnWikiName.put("CI", "Welsh"); + isoCodeToEnWikiName.put("YI", "Yiddish"); + isoCodeToEnWikiName.put("ZU", "Zulu"); + isoCodeToEnWikiName.put("AZ", "Azeri"); + isoCodeToEnWikiName.put("EU", "Basque"); + isoCodeToEnWikiName.put("BR", "Breton"); + isoCodeToEnWikiName.put("MR", "Marathi"); + isoCodeToEnWikiName.put("FO", "Faroese"); + isoCodeToEnWikiName.put("GL", "Galician"); + isoCodeToEnWikiName.put("KA", "Georgian"); + isoCodeToEnWikiName.put("HT", "Haitian Creole"); + isoCodeToEnWikiName.put("LB", "Luxembourgish"); + isoCodeToEnWikiName.put("MK", "Macedonian"); + isoCodeToEnWikiName.put("GV", "Manx"); + + // No longer exists in EN: + // isoCodeToEnWikiName.put("BS", "Bosnian"); + // isoCodeToEnWikiName.put("SR", "Serbian"); + // isoCodeToEnWikiName.put("HR", "Croatian"); + + // Font doesn't work: + //isoCodeToEnWikiName.put("MY", "Burmese"); + + + { + //Set missing = new LinkedHashSet(isoCodeToEnWikiName.keySet()); + //missing.removeAll(Language.isoCodeToResources.keySet()); + //System.out.println(missing); + } + //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet()); + } + + public static final Map> wikiCodeToIsoCodeToWikiName = new LinkedHashMap>(); + static { + // en + wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName); + + Map isoCodeToWikiName; + + // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName); + isoCodeToWikiName.put("DE", "Deutsch"); + isoCodeToWikiName.put("EN", "Englisch"); + isoCodeToWikiName.put("IT", "Italienisch"); + isoCodeToWikiName.put("PL", "Polnisch"); + isoCodeToWikiName.put("FR", "Französisch"); + isoCodeToWikiName.put("EO", "Esperanto"); + isoCodeToWikiName.put("CA", "Katalanisch"); + isoCodeToWikiName.put("LA", "Latein"); + isoCodeToWikiName.put("CS", "Tschechisch"); + isoCodeToWikiName.put("HU", "Ungarisch"); + isoCodeToWikiName.put("SV", "Schwedisch"); + isoCodeToWikiName.put("ES", "Spanisch"); + + // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName); + isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}")); + isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}")); + isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}")); // Arabic + isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}")); // Bulgarian + isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}")); + //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}")); + isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}")); + isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}")); + isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}")); + isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}")); // Czech + isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}")); // Dutch + //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}")); + //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}")); + isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}")); + isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}")); + isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}")); + isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}")); + isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}")); + isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}")); // Icelandic + isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}")); + isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}")); + + // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName); + isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}"); // scn, nap, cal, lmo + isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}")); + isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}")); + isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}")); + isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}")); + isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}")); + isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}")); + isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}")); + isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}")); + isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}")); + isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}")); + isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}")); + isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}")); + isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}")); + + // There seems to be no consistent pattern and few foreign language entries anyway + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName); + isoCodeToWikiName.put("ES", Pattern.quote("{{ES")); + } + public static String getEnglishName(String langCode) { + String name = isoCodeToEnWikiName.get(langCode); + if (name == null) { + name = isoCodeToEnWikiName.get(langCode.toUpperCase()); + } + if (name == null) { + return null; + } + if (name.indexOf('|') != -1) { + return name.substring(0, name.indexOf('|')); + } + if (name.indexOf('$') != -1) { + return name.substring(0, name.indexOf('$')); + } + return name; // can be null. } - //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet()); - } - - public static final Map> wikiCodeToIsoCodeToWikiName = new LinkedHashMap>(); - static { - // en - wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName); - - Map isoCodeToWikiName; - - // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr - isoCodeToWikiName = new LinkedHashMap(); - wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName); - isoCodeToWikiName.put("DE", "Deutsch"); - isoCodeToWikiName.put("EN", "Englisch"); - isoCodeToWikiName.put("IT", "Italienisch"); - isoCodeToWikiName.put("PL", "Polnisch"); - isoCodeToWikiName.put("FR", "Französisch"); - isoCodeToWikiName.put("EO", "Esperanto"); - isoCodeToWikiName.put("CA", "Katalanisch"); - isoCodeToWikiName.put("LA", "Latein"); - isoCodeToWikiName.put("CS", "Tschechisch"); - isoCodeToWikiName.put("HU", "Ungarisch"); - isoCodeToWikiName.put("SV", "Schwedisch"); - isoCodeToWikiName.put("ES", "Spanisch"); - - // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr - isoCodeToWikiName = new LinkedHashMap(); - wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName); - isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}")); - isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}")); - isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}")); // Arabic - isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}")); // Bulgarian - isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}")); - //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}")); - isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}")); - isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}")); - isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}")); - isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}")); // Czech - isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}")); // Dutch - //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}")); - //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}")); - isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}")); - isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}")); - isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}")); - isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}")); - isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}")); - isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}")); // Icelandic - isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}")); - isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}")); - - // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n - isoCodeToWikiName = new LinkedHashMap(); - wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName); - isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}"); // scn, nap, cal, lmo - isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}")); - isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}")); - isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}")); - isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}")); - isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}")); - isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}")); - isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}")); - isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}")); - isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}")); - isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}")); - isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}")); - isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}")); - isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}")); - - // There seems to be no consistent pattern and few foreign language entries anyway - isoCodeToWikiName = new LinkedHashMap(); - wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName); - isoCodeToWikiName.put("ES", Pattern.quote("{{ES")); - } - public static String getEnglishName(String langCode) { - String name = isoCodeToEnWikiName.get(langCode); - if (name == null) { - name = isoCodeToEnWikiName.get(langCode.toUpperCase()); - } - if (name == null) { - return null; - } - if (name.indexOf('|') != -1) { - return name.substring(0, name.indexOf('|')); - } - if (name.indexOf('$') != -1) { - return name.substring(0, name.indexOf('$')); - } - return name; // can be null. - } - + } -- 2.43.0