From: Thad Hughes Date: Wed, 8 Feb 2012 23:45:40 +0000 (-0800) Subject: Split EN, DE, IT, FR wiktionaries! Fix splitting to use entire header X-Git-Url: https://gitweb.fperrin.net/?a=commitdiff_plain;h=4fbbecb78123c434890d2e9d8bee5d3099f40366;p=DictionaryPC.git Split EN, DE, IT, FR wiktionaries! Fix splitting to use entire header line (hopefully this works ok). --- diff --git a/data/inputs/flag_graphics/americanFlag.jpg b/data/inputs/flag_graphics/americanFlag.jpg deleted file mode 100755 index 8d85b25..0000000 Binary files a/data/inputs/flag_graphics/americanFlag.jpg and /dev/null differ diff --git a/data/inputs/flag_graphics/as-lgflag.gif b/data/inputs/flag_graphics/as-lgflag.gif deleted file mode 100644 index 07cc33b..0000000 Binary files a/data/inputs/flag_graphics/as-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/au-lgflag.gif b/data/inputs/flag_graphics/au-lgflag.gif deleted file mode 100644 index ff3e618..0000000 Binary files a/data/inputs/flag_graphics/au-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/be-lgflag.gif b/data/inputs/flag_graphics/be-lgflag.gif deleted file mode 100644 index bc57368..0000000 Binary files a/data/inputs/flag_graphics/be-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/br-lgflag.gif b/data/inputs/flag_graphics/br-lgflag.gif deleted file mode 100644 index 940432b..0000000 Binary files a/data/inputs/flag_graphics/br-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/ca-lgflag.gif b/data/inputs/flag_graphics/ca-lgflag.gif deleted file mode 100644 index efc4496..0000000 Binary files a/data/inputs/flag_graphics/ca-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/ee-lgflag.gif b/data/inputs/flag_graphics/ee-lgflag.gif deleted file mode 100644 index 97c742b..0000000 Binary files a/data/inputs/flag_graphics/ee-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/ei-lgflag.gif 
b/data/inputs/flag_graphics/ei-lgflag.gif deleted file mode 100644 index 68a9273..0000000 Binary files a/data/inputs/flag_graphics/ei-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/flags.xcf b/data/inputs/flag_graphics/flags.xcf deleted file mode 100755 index b322c28..0000000 Binary files a/data/inputs/flag_graphics/flags.xcf and /dev/null differ diff --git a/data/inputs/flag_graphics/fr-lgflag.gif b/data/inputs/flag_graphics/fr-lgflag.gif deleted file mode 100644 index 9fa5027..0000000 Binary files a/data/inputs/flag_graphics/fr-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/germanFlag.jpg b/data/inputs/flag_graphics/germanFlag.jpg deleted file mode 100755 index b796c87..0000000 Binary files a/data/inputs/flag_graphics/germanFlag.jpg and /dev/null differ diff --git a/data/inputs/flag_graphics/gm-lgflag.gif b/data/inputs/flag_graphics/gm-lgflag.gif deleted file mode 100644 index ef82b20..0000000 Binary files a/data/inputs/flag_graphics/gm-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/gr-lgflag.gif b/data/inputs/flag_graphics/gr-lgflag.gif deleted file mode 100644 index f8f35d0..0000000 Binary files a/data/inputs/flag_graphics/gr-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/it-lgflag.gif b/data/inputs/flag_graphics/it-lgflag.gif deleted file mode 100644 index 18b6f03..0000000 Binary files a/data/inputs/flag_graphics/it-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/ja-lgflag.gif b/data/inputs/flag_graphics/ja-lgflag.gif deleted file mode 100644 index af4419a..0000000 Binary files a/data/inputs/flag_graphics/ja-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/mx-lgflag.gif b/data/inputs/flag_graphics/mx-lgflag.gif deleted file mode 100644 index 2569250..0000000 Binary files a/data/inputs/flag_graphics/mx-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/nl-lgflag.gif 
b/data/inputs/flag_graphics/nl-lgflag.gif deleted file mode 100644 index e6fa805..0000000 Binary files a/data/inputs/flag_graphics/nl-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/no-lgflag.gif b/data/inputs/flag_graphics/no-lgflag.gif deleted file mode 100644 index c29a5eb..0000000 Binary files a/data/inputs/flag_graphics/no-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/po-lgflag.gif b/data/inputs/flag_graphics/po-lgflag.gif deleted file mode 100644 index e7a49d8..0000000 Binary files a/data/inputs/flag_graphics/po-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/rs-lgflag.gif b/data/inputs/flag_graphics/rs-lgflag.gif deleted file mode 100644 index c958629..0000000 Binary files a/data/inputs/flag_graphics/rs-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/sf-lgflag.gif b/data/inputs/flag_graphics/sf-lgflag.gif deleted file mode 100644 index 3750169..0000000 Binary files a/data/inputs/flag_graphics/sf-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/sp-lgflag.gif b/data/inputs/flag_graphics/sp-lgflag.gif deleted file mode 100644 index 7cf2cb7..0000000 Binary files a/data/inputs/flag_graphics/sp-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/sw-lgflag.gif b/data/inputs/flag_graphics/sw-lgflag.gif deleted file mode 100644 index 4fdb247..0000000 Binary files a/data/inputs/flag_graphics/sw-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/sz-lgflag.gif b/data/inputs/flag_graphics/sz-lgflag.gif deleted file mode 100644 index 8184466..0000000 Binary files a/data/inputs/flag_graphics/sz-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/ts-lgflag.gif b/data/inputs/flag_graphics/ts-lgflag.gif deleted file mode 100644 index ae0c7f9..0000000 Binary files a/data/inputs/flag_graphics/ts-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/tw-lgflag.gif 
b/data/inputs/flag_graphics/tw-lgflag.gif deleted file mode 100644 index 4c9c400..0000000 Binary files a/data/inputs/flag_graphics/tw-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/uk-lgflag.gif b/data/inputs/flag_graphics/uk-lgflag.gif deleted file mode 100644 index 17b15b7..0000000 Binary files a/data/inputs/flag_graphics/uk-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/us-lgflag.gif b/data/inputs/flag_graphics/us-lgflag.gif deleted file mode 100644 index 7269199..0000000 Binary files a/data/inputs/flag_graphics/us-lgflag.gif and /dev/null differ diff --git a/data/inputs/flag_graphics/wa-lgflag.gif b/data/inputs/flag_graphics/wa-lgflag.gif deleted file mode 100644 index 63a7799..0000000 Binary files a/data/inputs/flag_graphics/wa-lgflag.gif and /dev/null differ diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 6e640d6..9f33164 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -19,7 +19,7 @@ import java.util.Map; import junit.framework.TestCase; -import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs; +import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs; public class DictionaryBuilderMain extends TestCase { @@ -31,7 +31,7 @@ public class DictionaryBuilderMain extends TestCase { // Builds all the dictionaries it can, outputs list to a text file. 
- final Map isoToWikiName = new LinkedHashMap(EnWiktionaryLangs.isoCodeToWikiName); + final Map isoToWikiName = new LinkedHashMap(WiktionaryLangs.isoCodeToWikiName); isoToWikiName.remove("EN"); isoToWikiName.remove("DE"); diff --git a/src/com/hughes/android/dictionary/engine/LanguageTest.java b/src/com/hughes/android/dictionary/engine/LanguageTest.java index ee0b227..e281e50 100644 --- a/src/com/hughes/android/dictionary/engine/LanguageTest.java +++ b/src/com/hughes/android/dictionary/engine/LanguageTest.java @@ -23,7 +23,7 @@ import java.util.Set; import junit.framework.TestCase; -import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs; +import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs; import com.ibm.icu.text.Transliterator; public class LanguageTest extends TestCase { @@ -170,8 +170,8 @@ public class LanguageTest extends TestCase { } public void testEnWiktionaryNames() { - final Set enLangs = new LinkedHashSet(EnWiktionaryLangs.isoCodeToWikiName.keySet()); - for (final String code : EnWiktionaryLangs.isoCodeToWikiName.keySet()) { + final Set enLangs = new LinkedHashSet(WiktionaryLangs.isoCodeToWikiName.keySet()); + for (final String code : WiktionaryLangs.isoCodeToWikiName.keySet()) { enLangs.add(code.toLowerCase()); } assertEquals(enLangs.toString(), Language.isoCodeToResources.keySet().toString()); diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 628d356..d0423b3 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; @@ -32,72 +33,63 @@ import javax.xml.parsers.SAXParserFactory; import 
org.xml.sax.Attributes; import org.xml.sax.SAXException; -import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs; +import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs; public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { - - private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-pages-articles.xml"; - - static class Section implements java.io.Serializable { - private static final long serialVersionUID = -7676549898325856822L; - final String title; - final String heading; - final String text; - - public Section(final String title, final String heading, final String text) { - this.title = title; - this.heading = heading; - this.text = text; - - //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); - } - } + // This matches the whole line, otherwise regexes don't work well on French: + // {{=uk=}} + static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + + final Map> pathToSelectors = new LinkedHashMap>(); + List currentSelectors = null; - static class Selector { - DataOutputStream out; - Pattern pattern; - - public Selector(final String filename, final String pattern) throws IOException { - this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))); - this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); - } - } - - final List selectors = new ArrayList(); StringBuilder titleBuilder; StringBuilder textBuilder; StringBuilder currentBuilder = null; public static void main(final String[] args) throws SAXException, IOException, ParserConfigurationException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); - - // Configure things. 
- - final List selectors = wiktionarySplitter.selectors; - for (int i = 1; i < args.length; i += 2) { - final Selector selector = new Selector(args[i], args[i+1]); - selectors.add(selector); + wiktionarySplitter.go(); + } + + private WiktionarySplitter() { + List selectors; + for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { + //if (!code.equals("fr")) {continue;} + selectors = new ArrayList(); + pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); + for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { + final String dir = String.format("data/inputs/wikiSplit/%s", code); + new File(dir).mkdirs(); + selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); + } } + } - if (selectors.isEmpty()) { - for (final Map.Entry entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) { - selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue())); + private void go() throws ParserConfigurationException, SAXException, IOException { + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + + // Configure things. + for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { + + currentSelectors = pathToSelectorsEntry.getValue(); + + for (final Selector selector : currentSelectors) { + selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); } - } - - // Do it. - parser.parse(new File(FILE_TO_SPLIT), wiktionarySplitter); - - // Shutdown. - for (final Selector selector : selectors) { - selector.out.close(); + + // Do it. + parser.parse(new File(pathToSelectorsEntry.getKey()), this); + + // Shutdown. 
+ for (final Selector selector : currentSelectors) { + selector.out.close(); + } + } } - static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE); - int pageCount = 0; private void endPage() { final String title = titleBuilder.toString(); @@ -116,12 +108,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { text = text.substring(startMatcher.end()); final String heading = startMatcher.group(); - for (final Selector selector : selectors) { + for (final Selector selector : currentSelectors) { if (selector.pattern.matcher(heading).find()) { // Find end. final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=]+=+", depth), Pattern.MULTILINE); + final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); final Matcher endMatcher = endPattern.matcher(text); final int end; @@ -151,6 +143,36 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } + // ----------------------------------------------------------------------- + + static class Section implements java.io.Serializable { + private static final long serialVersionUID = -7676549898325856822L; + + final String title; + final String heading; + final String text; + + public Section(final String title, final String heading, final String text) { + this.title = title; + this.heading = heading; + this.text = text; + + //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); + } + } + + static class Selector { + final String outFilename; + final Pattern pattern; + + DataOutputStream out; + + public Selector(final String filename, final String pattern) { + this.outFilename = filename; + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + } + } + // ----------------------------------------------------------------------- @Override @@ -191,5 +213,7 @@ public class WiktionarySplitter extends 
org.xml.sax.helpers.DefaultHandler { final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); parser.parse(file, this); } + + } diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java deleted file mode 100644 index 0b24556..0000000 --- a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.hughes.android.dictionary.parser.enwiktionary; - -import java.util.LinkedHashMap; -import java.util.Map; - -public class EnWiktionaryLangs { - - public static final Map isoCodeToWikiName = new LinkedHashMap(); - static { - isoCodeToWikiName.put("AF", "Afrikaans"); - isoCodeToWikiName.put("SQ", "Albanian"); - isoCodeToWikiName.put("AR", "Arabic"); - isoCodeToWikiName.put("HY", "Armenian"); - isoCodeToWikiName.put("BE", "Belarusian"); - isoCodeToWikiName.put("BN", "Bengali"); - isoCodeToWikiName.put("BS", "Bosnian"); - isoCodeToWikiName.put("BG", "Bulgarian"); - isoCodeToWikiName.put("CA", "Catalan"); - isoCodeToWikiName.put("HR", "Croatian"); - isoCodeToWikiName.put("CS", "Czech"); - isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese"); - isoCodeToWikiName.put("DA", "Danish"); - isoCodeToWikiName.put("NL", "Dutch"); - isoCodeToWikiName.put("EN", "English"); - isoCodeToWikiName.put("EO", "Esperanto"); - isoCodeToWikiName.put("ET", "Estonian"); - isoCodeToWikiName.put("FI", "Finnish"); - isoCodeToWikiName.put("FR", "French"); - isoCodeToWikiName.put("DE", "German"); - isoCodeToWikiName.put("EL", "Greek"); - isoCodeToWikiName.put("haw", "Hawaiian"); - isoCodeToWikiName.put("HE", "Hebrew"); - isoCodeToWikiName.put("HI", "Hindi"); - isoCodeToWikiName.put("HU", "Hungarian"); - isoCodeToWikiName.put("IS", "Icelandic"); - isoCodeToWikiName.put("ID", "Indonesian"); - isoCodeToWikiName.put("GA", "Gaelic"); - isoCodeToWikiName.put("IT", "Italian"); - isoCodeToWikiName.put("LA", "Latin"); - 
isoCodeToWikiName.put("LV", "Latvian"); - isoCodeToWikiName.put("LT", "Lithuanian"); - isoCodeToWikiName.put("JA", "Japanese"); - isoCodeToWikiName.put("KO", "Korean"); - isoCodeToWikiName.put("KU", "Kurdish"); - isoCodeToWikiName.put("MS", "Malay"); - isoCodeToWikiName.put("MI", "Maori"); - isoCodeToWikiName.put("MN", "Mongolian"); - isoCodeToWikiName.put("NE", "Nepali"); - isoCodeToWikiName.put("NO", "Norwegian"); - isoCodeToWikiName.put("FA", "Persian"); - isoCodeToWikiName.put("PL", "Polish"); - isoCodeToWikiName.put("PT", "Portuguese"); - isoCodeToWikiName.put("PA", "Punjabi"); - isoCodeToWikiName.put("RO", "Romanian"); - isoCodeToWikiName.put("RU", "Russian"); - isoCodeToWikiName.put("SA", "Sanskrit"); - isoCodeToWikiName.put("SR", "Serbian"); - isoCodeToWikiName.put("SK", "Slovak"); - isoCodeToWikiName.put("SO", "Somali"); - isoCodeToWikiName.put("ES", "Spanish"); - isoCodeToWikiName.put("SW", "Swahili"); - isoCodeToWikiName.put("SV", "Swedish"); - isoCodeToWikiName.put("TL", "Tagalog"); - isoCodeToWikiName.put("TG", "Tajik"); - isoCodeToWikiName.put("TH", "Thai"); - isoCodeToWikiName.put("BO", "Tibetan"); - isoCodeToWikiName.put("TR", "Turkish"); - isoCodeToWikiName.put("UK", "Ukrainian"); - isoCodeToWikiName.put("UR", "Urdu"); - isoCodeToWikiName.put("VI", "Vietnamese"); - isoCodeToWikiName.put("CI", "Welsh"); - isoCodeToWikiName.put("YI", "Yiddish"); - isoCodeToWikiName.put("ZU", "Zulu"); - - - isoCodeToWikiName.put("AZ", "Azeri"); - isoCodeToWikiName.put("EU", "Basque"); - isoCodeToWikiName.put("BR", "Breton"); - isoCodeToWikiName.put("MR", "Burmese"); - isoCodeToWikiName.put("FO", "Faroese"); - isoCodeToWikiName.put("GL", "Galician"); - isoCodeToWikiName.put("KA", "Georgian"); - isoCodeToWikiName.put("HT", "Haitian Creole"); - isoCodeToWikiName.put("LB", "Luxembourgish"); - isoCodeToWikiName.put("MK", "Macedonian"); - - } - - -} diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java 
b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java index 27246cd..babc696 100644 --- a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java @@ -119,10 +119,10 @@ public class EnWiktionaryXmlParser { } } finally { System.out.println("lang Counts: " + appendAndIndexWikiCallback.langCodeToTCount); - appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.keySet()); + appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(WiktionaryLangs.isoCodeToWikiName.keySet()); System.out.println("unused Counts: " + appendAndIndexWikiCallback.langCodeToTCount); System.out.println("lang Counts: " + langNameToTCount); - langNameToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.values()); + langNameToTCount.keySet().removeAll(WiktionaryLangs.isoCodeToWikiName.values()); System.out.println("unknown counts: " + langNameToTCount); } } diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/WiktionaryLangs.java new file mode 100644 index 0000000..77ff567 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/WiktionaryLangs.java @@ -0,0 +1,154 @@ +package com.hughes.android.dictionary.parser.enwiktionary; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.regex.Pattern; + +public class WiktionaryLangs { + + public static final Map isoCodeToWikiName = new LinkedHashMap(); + static { + isoCodeToWikiName.put("AF", "Afrikaans"); + isoCodeToWikiName.put("SQ", "Albanian"); + isoCodeToWikiName.put("AR", "Arabic"); + isoCodeToWikiName.put("HY", "Armenian"); + isoCodeToWikiName.put("BE", "Belarusian"); + isoCodeToWikiName.put("BN", "Bengali"); + isoCodeToWikiName.put("BS", "Bosnian"); + isoCodeToWikiName.put("BG", "Bulgarian"); + isoCodeToWikiName.put("CA", 
"Catalan"); + isoCodeToWikiName.put("HR", "Croatian"); + isoCodeToWikiName.put("CS", "Czech"); + isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese"); + isoCodeToWikiName.put("DA", "Danish"); + isoCodeToWikiName.put("NL", "Dutch"); + isoCodeToWikiName.put("EN", "English"); + isoCodeToWikiName.put("EO", "Esperanto"); + isoCodeToWikiName.put("ET", "Estonian"); + isoCodeToWikiName.put("FI", "Finnish"); + isoCodeToWikiName.put("FR", "French"); + isoCodeToWikiName.put("DE", "German"); + isoCodeToWikiName.put("EL", "Greek"); + isoCodeToWikiName.put("haw", "Hawaiian"); + isoCodeToWikiName.put("HE", "Hebrew"); + isoCodeToWikiName.put("HI", "Hindi"); + isoCodeToWikiName.put("HU", "Hungarian"); + isoCodeToWikiName.put("IS", "Icelandic"); + isoCodeToWikiName.put("ID", "Indonesian"); + isoCodeToWikiName.put("GA", "Gaelic"); + isoCodeToWikiName.put("IT", "Italian"); + isoCodeToWikiName.put("LA", "Latin"); + isoCodeToWikiName.put("LV", "Latvian"); + isoCodeToWikiName.put("LT", "Lithuanian"); + isoCodeToWikiName.put("JA", "Japanese"); + isoCodeToWikiName.put("KO", "Korean"); + isoCodeToWikiName.put("KU", "Kurdish"); + isoCodeToWikiName.put("MS", "Malay"); + isoCodeToWikiName.put("MI", "Maori"); + isoCodeToWikiName.put("MN", "Mongolian"); + isoCodeToWikiName.put("NE", "Nepali"); + isoCodeToWikiName.put("NO", "Norwegian"); + isoCodeToWikiName.put("FA", "Persian"); + isoCodeToWikiName.put("PL", "Polish"); + isoCodeToWikiName.put("PT", "Portuguese"); + isoCodeToWikiName.put("PA", "Punjabi"); + isoCodeToWikiName.put("RO", "Romanian"); + isoCodeToWikiName.put("RU", "Russian"); + isoCodeToWikiName.put("SA", "Sanskrit"); + isoCodeToWikiName.put("SR", "Serbian"); + isoCodeToWikiName.put("SK", "Slovak"); + isoCodeToWikiName.put("SO", "Somali"); + isoCodeToWikiName.put("ES", "Spanish"); + isoCodeToWikiName.put("SW", "Swahili"); + isoCodeToWikiName.put("SV", "Swedish"); + isoCodeToWikiName.put("TL", "Tagalog"); + isoCodeToWikiName.put("TG", "Tajik"); + isoCodeToWikiName.put("TH", 
"Thai"); + isoCodeToWikiName.put("BO", "Tibetan"); + isoCodeToWikiName.put("TR", "Turkish"); + isoCodeToWikiName.put("UK", "Ukrainian"); + isoCodeToWikiName.put("UR", "Urdu"); + isoCodeToWikiName.put("VI", "Vietnamese"); + isoCodeToWikiName.put("CI", "Welsh"); + isoCodeToWikiName.put("YI", "Yiddish"); + isoCodeToWikiName.put("ZU", "Zulu"); + + isoCodeToWikiName.put("AZ", "Azeri"); + isoCodeToWikiName.put("EU", "Basque"); + isoCodeToWikiName.put("BR", "Breton"); + isoCodeToWikiName.put("MR", "Burmese"); + isoCodeToWikiName.put("FO", "Faroese"); + isoCodeToWikiName.put("GL", "Galician"); + isoCodeToWikiName.put("KA", "Georgian"); + isoCodeToWikiName.put("HT", "Haitian Creole"); + isoCodeToWikiName.put("LB", "Luxembourgish"); + isoCodeToWikiName.put("MK", "Macedonian"); + + } + + public static final Map> wikiCodeToIsoCodeToWikiName = new LinkedHashMap>(); + static { + // en + wikiCodeToIsoCodeToWikiName.put("en", isoCodeToWikiName); + + Map isoCodeToWikiName; + + // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName); + isoCodeToWikiName.put("DE", "Deutsch"); + isoCodeToWikiName.put("EN", "Englisch"); + isoCodeToWikiName.put("IT", "Italienisch"); + isoCodeToWikiName.put("PL", "Polnisch"); + isoCodeToWikiName.put("FR", "Französisch"); + isoCodeToWikiName.put("EO", "Esperanto"); + isoCodeToWikiName.put("CA", "Katalanisch"); + isoCodeToWikiName.put("LA", "Lateinisch"); + isoCodeToWikiName.put("CS", "Tschechisch"); + isoCodeToWikiName.put("HU", "Ungarisch"); + isoCodeToWikiName.put("SV", "Schwedisch"); + isoCodeToWikiName.put("ES", "Spanisch"); + + // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName); + isoCodeToWikiName.put("FR", Pattern.quote("{{=fr=}}")); + 
isoCodeToWikiName.put("RU", Pattern.quote("{{=ru=}}")); + isoCodeToWikiName.put("BG", Pattern.quote("{{=bg=}}")); // Bulgarian + isoCodeToWikiName.put("EN", Pattern.quote("{{=en=}}")); + //isoCodeToWikiName.put("", Pattern.quote("{{=sl=}}")); + isoCodeToWikiName.put("LA", Pattern.quote("{{=la=}}")); + isoCodeToWikiName.put("IT", Pattern.quote("{{=it=}}")); + isoCodeToWikiName.put("EO", Pattern.quote("{{=eo=}}")); + isoCodeToWikiName.put("CS", Pattern.quote("{{=cs=}}")); // Czech + isoCodeToWikiName.put("NL", Pattern.quote("{{=nl=}}")); // Dutch + //isoCodeToWikiName.put("", Pattern.quote("{{=mg=}}")); + //isoCodeToWikiName.put("", Pattern.quote("{{=hsb=}}")); + isoCodeToWikiName.put("ZH", Pattern.quote("{{=zh=}}")); + isoCodeToWikiName.put("JA", Pattern.quote("{{=ja=}}")); + isoCodeToWikiName.put("DE", Pattern.quote("{{=de=}}")); + isoCodeToWikiName.put("IS", Pattern.quote("{{=is=}}")); // Icelandic + isoCodeToWikiName.put("ES", Pattern.quote("{{=es=}}")); + isoCodeToWikiName.put("UK", Pattern.quote("{{=uk=}}")); + + // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n + isoCodeToWikiName = new LinkedHashMap(); + wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName); + isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}"); // scn, nap, cal, lmo + isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}")); + isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}")); + isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}")); + isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}")); + isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}")); + isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}")); + isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}")); + isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}")); + isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}")); + isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}")); + isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}")); + isoCodeToWikiName.put("EL", 
Pattern.quote("{{-grc-}}")); + isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}")); + + } + +} diff --git a/to_test.txt b/to_test.txt new file mode 100644 index 0000000..60e9c67 --- /dev/null +++ b/to_test.txt @@ -0,0 +1,5 @@ +On Android 1.6: +Landscape/portrait mode, switching between. +Download dictionary. +Add dictionary during runtime. +Remove dictionary during runtime. diff --git a/todo.txt b/todo.txt index 0cc215b..17bac82 100644 --- a/todo.txt +++ b/todo.txt @@ -1,9 +1,11 @@ +for i in res/raw/*.html; do tidy --input-encoding utf8 --output-file $i $i; done + + For next release: +flag images +test/fix return to last-used dictionary downloads history dialog -fix up dictionary manager: - thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up). - check over UI. check arabic UI fix handle examples like "asdf (asdf)" random word jump @@ -101,3 +103,6 @@ about dict dialog * timeout on the exact search... if it can't confirm, it should just switch and go.... * reload dictionaryInfo sometime... * change path of /sdcard/quickDic/... +fix up dictionary manager: + thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up). + check over UI.