From: Thad Hughes Date: Sat, 5 Jan 2013 06:17:34 +0000 (-0800) Subject: Fixed comment for German dictionary. X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=4b7bd1a73f94d1dc94ae3ef0a316f91fce21550d Fixed comment for German dictionary. --- diff --git a/data/inputs/de-en_chemnitz_enwiktionary.info b/data/inputs/de-en_dedication.txt similarity index 96% rename from data/inputs/de-en_chemnitz_enwiktionary.info rename to data/inputs/de-en_dedication.txt index 8875396..89688b9 100644 --- a/data/inputs/de-en_chemnitz_enwiktionary.info +++ b/data/inputs/de-en_dedication.txt @@ -6,4 +6,4 @@ Version: devel 2011-06-21 Source: http://dict.tu-chemnitz.de/ Thanks to Frank Richter. And from: -(EN)Wiktionary +Wiktionary diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java index 444ef82..84efd7c 100644 --- a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java +++ b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java @@ -2,9 +2,7 @@ package com.hughes.android.dictionary.engine; import com.hughes.android.dictionary.DictionaryInfo; import com.hughes.android.dictionary.DictionaryInfo.IndexInfo; -import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; import com.hughes.util.CollectionUtil; -import com.hughes.util.StringUtil; import java.io.File; import java.io.IOException; @@ -18,7 +16,7 @@ import java.util.List; public class CheckDictionariesMain { static final String BASE_URL = "http://quickdic-dictionary.googlecode.com/files/"; - static final String VERSION_CODE = "v005"; + static final String VERSION_CODE = "v006"; public static void main(String[] args) throws IOException { final File dictDir = new File(DictionaryBuilderMain.OUTPUTS); diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index b1ad5dd..f2e1430 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -37,12 +37,11 @@ public class DictionaryBuilderMain extends TestCase { // Build the non EN ones. static final String[][] nonEnPairs = new String[][] { - /* {"EN"}, {"DE"}, - {"IT"}, */ - // This one takes a really long time: - // {"FR"}, + {"IT"}, + // This one takes a really long time, and the result is too big for code.google.com + //{"FR"}, // The 3 I use most: {"IT", "EN" }, @@ -134,28 +133,29 @@ public class DictionaryBuilderMain extends TestCase { {"FA", "HY" }, // Persian, Armenian, by request. {"FA", "SV" }, // Persian, Swedish, by request. {"NL", "PL" }, // Dutch, Polish, by request. + }; static final Map isoToDedication = new LinkedHashMap(); static { - isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); - isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja."); - isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau."); - // German handled in file. - isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge."); - isoToDedication.put("IT", "Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!"); - isoToDedication.put("KO", "Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!"); - isoToDedication.put("PT", "Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder."); - isoToDedication.put("RO", "Romanian dictionary dedicated to Radu Teodorescu."); - isoToDedication.put("RU", "Russian dictionary dedicated to Maxim Aronin--best friend always!."); - isoToDedication.put("SR", "Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey."); - isoToDedication.put("ES", "Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!"); - isoToDedication.put("SV", "Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!"); + isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); + isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja."); + isoToDedication.put("NL", "Wiktionary-based Dutch dictionary dedicated to Mike LeBeau."); + isoToDedication.put("DE", "@data/inputs/de-en_dedication.txt"); + isoToDedication.put("EL", "Wiktionary-based Greek dictionary dedicated to Noah Egge."); + isoToDedication.put("IT", "Wiktionary-based Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!"); + isoToDedication.put("KO", "Wiktionary-based Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!"); + isoToDedication.put("PT", "Wiktionary-based Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder."); + isoToDedication.put("RO", "Wiktionary-based Romanian dictionary dedicated to Radu Teodorescu."); + isoToDedication.put("RU", "Wiktionary-based Russian dictionary dedicated to Maxim Aronin--best friend always!."); + isoToDedication.put("SR", "Wiktionary-based Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey."); + isoToDedication.put("ES", "Wiktionary-based Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!"); + isoToDedication.put("SV", "Wiktionary-based Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!"); } - private static String getDedication(String iso) { - return isoToDedication.containsKey(iso) ? "\n\n" + isoToDedication.get(iso) : ""; + private static String getEnDictionaryInfo(String iso) { + return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso); } static final Map isoToStoplist = new LinkedHashMap(); @@ -260,7 +260,7 @@ public class DictionaryBuilderMain extends TestCase { result.add(String.format("--lang1=%s", lang1)); result.add(String.format("--lang2=%s", lang2)); - result.add(String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.%s", foreignIso, getDedication(foreignIso))); + result.add(String.format("--dictInfo=%s", getEnDictionaryInfo(foreignIso))); // Foreign section. result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso)); @@ -323,6 +323,7 @@ public class DictionaryBuilderMain extends TestCase { final Set> done = new LinkedHashSet>(); + boolean go = true; for (final String[] pair : allPairs) { Arrays.sort(pair); final List pairList = Arrays.asList(pair); @@ -331,8 +332,14 @@ public class DictionaryBuilderMain extends TestCase { } done.add(pairList); - if (!pairList.contains("EN") && !pairList.contains("EL")) { - //continue; +// if (pairList.contains("EN") && pairList.contains("DE")) { +// go = true; +// } else { +// go = false; +// } + + if (!go) { + continue; } DictionaryBuilder.main(getMainArgs(pair).toArray(new String[0])); diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index c8b150e..81783a8 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -175,7 +175,7 @@ public final class WikiTokenizer { callback.onPlainText(tokenizer.token()); } else if (tokenizer.isMarkup()) { callback.onMarkup(tokenizer); - } else if (tokenizer.isWikiLink) { + } else if (tokenizer.isWikiLink()) { callback.onWikiLink(tokenizer); } else if (tokenizer.isNewline()) { callback.onNewline(tokenizer); @@ -264,7 +264,8 @@ public final class WikiTokenizer { if (firstUnescapedPipePos != -1) { return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim()); } - return trimNewlines(wikiText.substring(start + 2, end - 2).trim()); + final int safeEnd = Math.max(start + 2, end - 2); + return trimNewlines(wikiText.substring(start + 2, safeEnd).trim()); } public List functionPositionArgs() { diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java index 8398719..b4999d8 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java @@ -21,7 +21,7 @@ import java.util.List; import junit.framework.TestCase; public class WikiTokenizerTest extends TestCase { - + public void testWikiLink() { String wikiText; @@ -72,7 +72,16 @@ public class WikiTokenizerTest extends TestCase { public void testFunction() { String wikiText; - + + { + WikiTokenizer wt = new WikiTokenizer("'''Προστατευόμενη Ονομασία Προέλευσης''', \"Protected Designation of Origin\" {{"); + while (wt.nextToken() != null) { + if (wt.isFunction()) { + assertEquals("", wt.functionName()); + } + } + } + wikiText = "{{abc}}"; assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token()); assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction()); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java index 6dbe3ec..5ed0bc2 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java @@ -177,8 +177,13 @@ public final class EnForeignParser extends EnParser { } else if (wikiTokenizer.isPlainText()) { // Unindexed! foreignBuilder.append(wikiTokenizer.token()); - - } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) { + } else if (wikiTokenizer.isHtml()) { + if (!wikiTokenizer.token().startsWith("")) { + foreignBuilder.append(wikiTokenizer.token()); + } + } else if (wikiTokenizer.isMarkup() || + wikiTokenizer.isNewline() || + wikiTokenizer.isComment()) { // Do nothing. } else { LOG.warning("Unexpected token: " + wikiTokenizer.token()); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java index 9a7d748..90535cc 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnFunctionCallbacks.java @@ -232,6 +232,7 @@ class EnFunctionCallbacks { final Map namedArgs, final T parser, final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + namedArgs.remove("lang"); if (!namedArgs.isEmpty()) { EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token()); return false; diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java index f87afa1..f8300fb 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java @@ -33,7 +33,6 @@ public class WiktionaryLangs { isoCodeToEnWikiName.put("BE", "Belarusian"); isoCodeToEnWikiName.put("BN", "Bengali"); isoCodeToEnWikiName.put("BG", "Bulgarian"); - isoCodeToEnWikiName.put("MY", "Burmese"); isoCodeToEnWikiName.put("CA", "Catalan"); isoCodeToEnWikiName.put("SH", "Serbo-Croatian"); isoCodeToEnWikiName.put("HR", "Croatian"); @@ -112,6 +111,10 @@ public class WiktionaryLangs { // No longer exists in EN: // isoCodeToEnWikiName.put("BS", "Bosnian"); // isoCodeToEnWikiName.put("SR", "Serbian"); + + // Font doesn't work: + //isoCodeToEnWikiName.put("MY", "Burmese"); + { Set missing = new LinkedHashSet(isoCodeToEnWikiName.keySet()); diff --git a/todo.txt b/todo.txt index 14c7150..deb2316 100644 --- a/todo.txt +++ b/todo.txt @@ -1,35 +1,19 @@ +Handle wiki tables {| .. |-| .. |} de-conj why does presso not show up? -Afferrare in it, italics don't end. {{L -start new intent for web link. - - {{term {{etyl {{l {{de-conj -Spaces in links are done wrong: "perche mai",click "why on earth", see "why%20..." Delete it conjugation of entries. - Compression for PairEntries! delete these entries: # {{conjugation of|abalienare||2|p|pres|ind|lang=it}} # {{conjugation of|abalienare||2|p|imp|lang=it}} # {{form of|[[feminine|Feminine]] plural|abalienato}} - -HtmlEntry - - text inside functions doesn't get escaped properly. - - Skips Uebersetzung section (likewise in other langs), except maybe for the other lange of interest. - - Build single EN/DE/IT/FR dictionaries based on HtmlEntry. - - Parse Italian verb forms from enwiktionary into something useful. - - "See also" link entries for cross-referencing ("form of"--strong, links to token, "mentioned in"--weaker, links to HtmlEntry). - Nice: - - Add links into the HtmlEntry based on wikilinks. - - Link to them from the appropriate places: IndexEntry (first), and individual rows (tricker, built at different times). - Hitory of lookups. make sure word is sticky when you change dictionaries.