line (hopefully this works ok).
import junit.framework.TestCase;
-import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs;
public class DictionaryBuilderMain extends TestCase {
// Builds all the dictionaries it can, outputs list to a text file.
- final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(EnWiktionaryLangs.isoCodeToWikiName);
+ final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(WiktionaryLangs.isoCodeToWikiName);
isoToWikiName.remove("EN");
isoToWikiName.remove("DE");
import junit.framework.TestCase;
-import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs;
import com.ibm.icu.text.Transliterator;
public class LanguageTest extends TestCase {
}
public void testEnWiktionaryNames() {
- final Set<String> enLangs = new LinkedHashSet<String>(EnWiktionaryLangs.isoCodeToWikiName.keySet());
- for (final String code : EnWiktionaryLangs.isoCodeToWikiName.keySet()) {
+ final Set<String> enLangs = new LinkedHashSet<String>(WiktionaryLangs.isoCodeToWikiName.keySet());
+ for (final String code : WiktionaryLangs.isoCodeToWikiName.keySet()) {
enLangs.add(code.toLowerCase());
}
assertEquals(enLangs.toString(), Language.isoCodeToResources.keySet().toString());
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
-import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs;
public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
-
- private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-pages-articles.xml";
-
- static class Section implements java.io.Serializable {
- private static final long serialVersionUID = -7676549898325856822L;
- final String title;
- final String heading;
- final String text;
-
- public Section(final String title, final String heading, final String text) {
- this.title = title;
- this.heading = heading;
- this.text = text;
-
- //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
- }
- }
+ // This matches the whole line; otherwise the regexes don't work well on French:
+ // {{=uk=}}
+ static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+
+ final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+ List<Selector> currentSelectors = null;
- static class Selector {
- DataOutputStream out;
- Pattern pattern;
-
- public Selector(final String filename, final String pattern) throws IOException {
- this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
- this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
- }
- }
-
- final List<Selector> selectors = new ArrayList<Selector>();
StringBuilder titleBuilder;
StringBuilder textBuilder;
StringBuilder currentBuilder = null;
public static void main(final String[] args) throws SAXException, IOException, ParserConfigurationException {
- final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
-
- // Configure things.
-
- final List<Selector> selectors = wiktionarySplitter.selectors;
- for (int i = 1; i < args.length; i += 2) {
- final Selector selector = new Selector(args[i], args[i+1]);
- selectors.add(selector);
+ wiktionarySplitter.go();
+ }
+
+ /**
+  * Registers, for every wiki code in WiktionaryLangs.wikiCodeToIsoCodeToWikiName, the dump file to read
+  * ("data/inputs/&lt;code&gt;wiktionary-pages-articles.xml") and one Selector per language entry, each writing
+  * to "data/inputs/wikiSplit/&lt;code&gt;/&lt;key&gt;.data". Output directories are created here; the output
+  * streams themselves are only opened later, in go().
+  */
+ private WiktionarySplitter() {
+ for (final Map.Entry<String, Map<String, String>> wiki : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.entrySet()) {
+ final String code = wiki.getKey();
+ //if (!code.equals("fr")) {continue;}
+ final List<Selector> selectors = new ArrayList<WiktionarySplitter.Selector>();
+ pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
+ // The output directory depends only on the wiki code, so build and create it once per wiki
+ // instead of once per language entry.
+ final String dir = String.format("data/inputs/wikiSplit/%s", code);
+ new File(dir).mkdirs();
+ for (final Map.Entry<String, String> entry : wiki.getValue().entrySet()) {
+ selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
+ }
}
+ }
- if (selectors.isEmpty()) {
- for (final Map.Entry<String, String> entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) {
- selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue()));
+ /**
+  * Splits every registered input file in turn: opens the output stream of each of that file's
+  * selectors, SAX-parses the file with this object as the handler, then closes the streams.
+  * The streams are closed in a finally block so they are not leaked when parsing (or opening a
+  * later stream) fails.
+  *
+  * @throws ParserConfigurationException if no SAX parser can be created
+  * @throws SAXException if parsing an input file fails
+  * @throws IOException if an input or output file cannot be opened, read, written or closed
+  */
+ private void go() throws ParserConfigurationException, SAXException, IOException {
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+
+ // Configure things.
+ for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
+
+ currentSelectors = pathToSelectorsEntry.getValue();
+
+ try {
+ for (final Selector selector : currentSelectors) {
+ selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
}
- }
-
- // Do it.
- parser.parse(new File(FILE_TO_SPLIT), wiktionarySplitter);
-
- // Shutdown.
- for (final Selector selector : selectors) {
- selector.out.close();
+
+ // Do it.
+ parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+ } finally {
+ // Shutdown. A selector's stream is still null if opening an earlier one threw.
+ for (final Selector selector : currentSelectors) {
+ if (selector.out != null) {
+ selector.out.close();
+ }
+ }
+ }
+
}
}
- static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE);
-
int pageCount = 0;
private void endPage() {
final String title = titleBuilder.toString();
text = text.substring(startMatcher.end());
final String heading = startMatcher.group();
- for (final Selector selector : selectors) {
+ for (final Selector selector : currentSelectors) {
if (selector.pattern.matcher(heading).find()) {
// Find end.
final int depth = startMatcher.group(1).length();
- final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=]+=+", depth), Pattern.MULTILINE);
+ final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
final Matcher endMatcher = endPattern.matcher(text);
final int end;
}
+ // -----------------------------------------------------------------------
+
+ /**
+  * Serializable value holder for one page title, one heading and one text body.
+  * All three fields are set once by the constructor and never changed.
+  */
+ static class Section implements java.io.Serializable {
+ private static final long serialVersionUID = -7676549898325856822L;
+
+ final String title;
+ final String heading;
+ final String text;
+
+ /**
+  * @param title value stored in {@link #title}
+  * @param heading value stored in {@link #heading}
+  * @param text value stored in {@link #text}
+  */
+ public Section(final String title, final String heading, final String text) {
+ // Debugging aid:
+ //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
+ this.text = text;
+ this.heading = heading;
+ this.title = title;
+ }
+ }
+
+ /**
+  * Pairs a heading pattern with the file that matching sections are written to.
+  * Only the file name is recorded at construction; {@link #out} stays null until
+  * go() opens it.
+  */
+ static class Selector {
+ // Opened and closed by go(); not touched by the constructor.
+ DataOutputStream out;
+
+ final Pattern pattern;
+ final String outFilename;
+
+ /**
+  * @param filename path of the output file for this selector
+  * @param pattern regular expression, compiled case-insensitively
+  */
+ public Selector(final String filename, final String pattern) {
+ this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+ this.outFilename = filename;
+ }
+ }
+
// -----------------------------------------------------------------------
@Override
final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(file, this);
}
+
+
}
+++ /dev/null
-package com.hughes.android.dictionary.parser.enwiktionary;
-
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-public class EnWiktionaryLangs {
-
- public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
- static {
- isoCodeToWikiName.put("AF", "Afrikaans");
- isoCodeToWikiName.put("SQ", "Albanian");
- isoCodeToWikiName.put("AR", "Arabic");
- isoCodeToWikiName.put("HY", "Armenian");
- isoCodeToWikiName.put("BE", "Belarusian");
- isoCodeToWikiName.put("BN", "Bengali");
- isoCodeToWikiName.put("BS", "Bosnian");
- isoCodeToWikiName.put("BG", "Bulgarian");
- isoCodeToWikiName.put("CA", "Catalan");
- isoCodeToWikiName.put("HR", "Croatian");
- isoCodeToWikiName.put("CS", "Czech");
- isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
- isoCodeToWikiName.put("DA", "Danish");
- isoCodeToWikiName.put("NL", "Dutch");
- isoCodeToWikiName.put("EN", "English");
- isoCodeToWikiName.put("EO", "Esperanto");
- isoCodeToWikiName.put("ET", "Estonian");
- isoCodeToWikiName.put("FI", "Finnish");
- isoCodeToWikiName.put("FR", "French");
- isoCodeToWikiName.put("DE", "German");
- isoCodeToWikiName.put("EL", "Greek");
- isoCodeToWikiName.put("haw", "Hawaiian");
- isoCodeToWikiName.put("HE", "Hebrew");
- isoCodeToWikiName.put("HI", "Hindi");
- isoCodeToWikiName.put("HU", "Hungarian");
- isoCodeToWikiName.put("IS", "Icelandic");
- isoCodeToWikiName.put("ID", "Indonesian");
- isoCodeToWikiName.put("GA", "Gaelic");
- isoCodeToWikiName.put("IT", "Italian");
- isoCodeToWikiName.put("LA", "Latin");
- isoCodeToWikiName.put("LV", "Latvian");
- isoCodeToWikiName.put("LT", "Lithuanian");
- isoCodeToWikiName.put("JA", "Japanese");
- isoCodeToWikiName.put("KO", "Korean");
- isoCodeToWikiName.put("KU", "Kurdish");
- isoCodeToWikiName.put("MS", "Malay");
- isoCodeToWikiName.put("MI", "Maori");
- isoCodeToWikiName.put("MN", "Mongolian");
- isoCodeToWikiName.put("NE", "Nepali");
- isoCodeToWikiName.put("NO", "Norwegian");
- isoCodeToWikiName.put("FA", "Persian");
- isoCodeToWikiName.put("PL", "Polish");
- isoCodeToWikiName.put("PT", "Portuguese");
- isoCodeToWikiName.put("PA", "Punjabi");
- isoCodeToWikiName.put("RO", "Romanian");
- isoCodeToWikiName.put("RU", "Russian");
- isoCodeToWikiName.put("SA", "Sanskrit");
- isoCodeToWikiName.put("SR", "Serbian");
- isoCodeToWikiName.put("SK", "Slovak");
- isoCodeToWikiName.put("SO", "Somali");
- isoCodeToWikiName.put("ES", "Spanish");
- isoCodeToWikiName.put("SW", "Swahili");
- isoCodeToWikiName.put("SV", "Swedish");
- isoCodeToWikiName.put("TL", "Tagalog");
- isoCodeToWikiName.put("TG", "Tajik");
- isoCodeToWikiName.put("TH", "Thai");
- isoCodeToWikiName.put("BO", "Tibetan");
- isoCodeToWikiName.put("TR", "Turkish");
- isoCodeToWikiName.put("UK", "Ukrainian");
- isoCodeToWikiName.put("UR", "Urdu");
- isoCodeToWikiName.put("VI", "Vietnamese");
- isoCodeToWikiName.put("CI", "Welsh");
- isoCodeToWikiName.put("YI", "Yiddish");
- isoCodeToWikiName.put("ZU", "Zulu");
-
-
- isoCodeToWikiName.put("AZ", "Azeri");
- isoCodeToWikiName.put("EU", "Basque");
- isoCodeToWikiName.put("BR", "Breton");
- isoCodeToWikiName.put("MR", "Burmese");
- isoCodeToWikiName.put("FO", "Faroese");
- isoCodeToWikiName.put("GL", "Galician");
- isoCodeToWikiName.put("KA", "Georgian");
- isoCodeToWikiName.put("HT", "Haitian Creole");
- isoCodeToWikiName.put("LB", "Luxembourgish");
- isoCodeToWikiName.put("MK", "Macedonian");
-
- }
-
-
-}
}
} finally {
System.out.println("lang Counts: " + appendAndIndexWikiCallback.langCodeToTCount);
- appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.keySet());
+ appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(WiktionaryLangs.isoCodeToWikiName.keySet());
System.out.println("unused Counts: " + appendAndIndexWikiCallback.langCodeToTCount);
System.out.println("lang Counts: " + langNameToTCount);
- langNameToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.values());
+ langNameToTCount.keySet().removeAll(WiktionaryLangs.isoCodeToWikiName.values());
System.out.println("unknown counts: " + langNameToTCount);
}
}
--- /dev/null
+package com.hughes.android.dictionary.parser.enwiktionary;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+public class WiktionaryLangs {
+
+ // Language code -> language name as it appears in English Wiktionary headings.
+ // Keys are upper-case except "haw". Values are handed to WiktionarySplitter.Selector, which
+ // compiles them as case-insensitive regexes, so '|' means alternation (see "ZH").
+ // Insertion order is preserved (LinkedHashMap). LanguageTest compares this key set with
+ // Language.isoCodeToResources, so keys must not be changed here alone.
+ public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
+ static {
+ isoCodeToWikiName.put("AF", "Afrikaans");
+ isoCodeToWikiName.put("SQ", "Albanian");
+ isoCodeToWikiName.put("AR", "Arabic");
+ isoCodeToWikiName.put("HY", "Armenian");
+ isoCodeToWikiName.put("BE", "Belarusian");
+ isoCodeToWikiName.put("BN", "Bengali");
+ isoCodeToWikiName.put("BS", "Bosnian");
+ isoCodeToWikiName.put("BG", "Bulgarian");
+ isoCodeToWikiName.put("CA", "Catalan");
+ isoCodeToWikiName.put("HR", "Croatian");
+ isoCodeToWikiName.put("CS", "Czech");
+ isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
+ isoCodeToWikiName.put("DA", "Danish");
+ isoCodeToWikiName.put("NL", "Dutch");
+ isoCodeToWikiName.put("EN", "English");
+ isoCodeToWikiName.put("EO", "Esperanto");
+ isoCodeToWikiName.put("ET", "Estonian");
+ isoCodeToWikiName.put("FI", "Finnish");
+ isoCodeToWikiName.put("FR", "French");
+ isoCodeToWikiName.put("DE", "German");
+ isoCodeToWikiName.put("EL", "Greek");
+ isoCodeToWikiName.put("haw", "Hawaiian");
+ isoCodeToWikiName.put("HE", "Hebrew");
+ isoCodeToWikiName.put("HI", "Hindi");
+ isoCodeToWikiName.put("HU", "Hungarian");
+ isoCodeToWikiName.put("IS", "Icelandic");
+ isoCodeToWikiName.put("ID", "Indonesian");
+ // NOTE(review): ISO 639-1 "ga" is Irish; the unanchored pattern "Gaelic" would presumably also
+ // match a "Scottish Gaelic" heading - confirm which language is intended.
+ isoCodeToWikiName.put("GA", "Gaelic");
+ isoCodeToWikiName.put("IT", "Italian");
+ isoCodeToWikiName.put("LA", "Latin");
+ isoCodeToWikiName.put("LV", "Latvian");
+ isoCodeToWikiName.put("LT", "Lithuanian");
+ isoCodeToWikiName.put("JA", "Japanese");
+ isoCodeToWikiName.put("KO", "Korean");
+ isoCodeToWikiName.put("KU", "Kurdish");
+ isoCodeToWikiName.put("MS", "Malay");
+ isoCodeToWikiName.put("MI", "Maori");
+ isoCodeToWikiName.put("MN", "Mongolian");
+ isoCodeToWikiName.put("NE", "Nepali");
+ isoCodeToWikiName.put("NO", "Norwegian");
+ isoCodeToWikiName.put("FA", "Persian");
+ isoCodeToWikiName.put("PL", "Polish");
+ isoCodeToWikiName.put("PT", "Portuguese");
+ isoCodeToWikiName.put("PA", "Punjabi");
+ isoCodeToWikiName.put("RO", "Romanian");
+ isoCodeToWikiName.put("RU", "Russian");
+ isoCodeToWikiName.put("SA", "Sanskrit");
+ isoCodeToWikiName.put("SR", "Serbian");
+ isoCodeToWikiName.put("SK", "Slovak");
+ isoCodeToWikiName.put("SO", "Somali");
+ isoCodeToWikiName.put("ES", "Spanish");
+ isoCodeToWikiName.put("SW", "Swahili");
+ isoCodeToWikiName.put("SV", "Swedish");
+ isoCodeToWikiName.put("TL", "Tagalog");
+ isoCodeToWikiName.put("TG", "Tajik");
+ isoCodeToWikiName.put("TH", "Thai");
+ isoCodeToWikiName.put("BO", "Tibetan");
+ isoCodeToWikiName.put("TR", "Turkish");
+ isoCodeToWikiName.put("UK", "Ukrainian");
+ isoCodeToWikiName.put("UR", "Urdu");
+ isoCodeToWikiName.put("VI", "Vietnamese");
+ // NOTE(review): the ISO 639-1 code for Welsh is "cy", not "ci" - confirm before relying on this key.
+ isoCodeToWikiName.put("CI", "Welsh");
+ isoCodeToWikiName.put("YI", "Yiddish");
+ isoCodeToWikiName.put("ZU", "Zulu");
+
+ isoCodeToWikiName.put("AZ", "Azeri");
+ isoCodeToWikiName.put("EU", "Basque");
+ isoCodeToWikiName.put("BR", "Breton");
+ // NOTE(review): ISO 639-1 "mr" is Marathi; Burmese is "my" - confirm before relying on this key.
+ isoCodeToWikiName.put("MR", "Burmese");
+ isoCodeToWikiName.put("FO", "Faroese");
+ isoCodeToWikiName.put("GL", "Galician");
+ isoCodeToWikiName.put("KA", "Georgian");
+ isoCodeToWikiName.put("HT", "Haitian Creole");
+ isoCodeToWikiName.put("LB", "Luxembourgish");
+ isoCodeToWikiName.put("MK", "Macedonian");
+
+ }
+
+ /**
+  * Selector tables per Wiktionary edition: wiki code ("en", "de", "fr", "it") -> (language code -> heading regex).
+  * The inner values are compiled as case-insensitive regular expressions by WiktionarySplitter.Selector,
+  * so literal template markers such as {{=fr=}} are wrapped in Pattern.quote. Both levels keep
+  * insertion order (LinkedHashMap).
+  */
+ public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
+ static {
+ // en: reuses the static isoCodeToWikiName table as-is.
+ wikiCodeToIsoCodeToWikiName.put("en", isoCodeToWikiName);
+
+ // Named differently from the static field isoCodeToWikiName so that it does not shadow it.
+ Map<String,String> isoToName;
+
+ // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
+ isoToName = new LinkedHashMap<String, String>();
+ wikiCodeToIsoCodeToWikiName.put("de", isoToName);
+ isoToName.put("DE", "Deutsch");
+ isoToName.put("EN", "Englisch");
+ isoToName.put("IT", "Italienisch");
+ isoToName.put("PL", "Polnisch");
+ isoToName.put("FR", "Französisch");
+ isoToName.put("EO", "Esperanto");
+ isoToName.put("CA", "Katalanisch");
+ isoToName.put("LA", "Lateinisch");
+ isoToName.put("CS", "Tschechisch");
+ isoToName.put("HU", "Ungarisch");
+ isoToName.put("SV", "Schwedisch");
+ isoToName.put("ES", "Spanisch");
+
+ // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
+ isoToName = new LinkedHashMap<String, String>();
+ wikiCodeToIsoCodeToWikiName.put("fr", isoToName);
+ isoToName.put("FR", Pattern.quote("{{=fr=}}"));
+ isoToName.put("RU", Pattern.quote("{{=ru=}}"));
+ isoToName.put("BG", Pattern.quote("{{=bg=}}")); // Bulgarian
+ isoToName.put("EN", Pattern.quote("{{=en=}}"));
+ //isoToName.put("", Pattern.quote("{{=sl=}}"));
+ isoToName.put("LA", Pattern.quote("{{=la=}}"));
+ isoToName.put("IT", Pattern.quote("{{=it=}}"));
+ isoToName.put("EO", Pattern.quote("{{=eo=}}"));
+ isoToName.put("CS", Pattern.quote("{{=cs=}}")); // Czech
+ isoToName.put("NL", Pattern.quote("{{=nl=}}")); // Dutch
+ //isoToName.put("", Pattern.quote("{{=mg=}}"));
+ //isoToName.put("", Pattern.quote("{{=hsb=}}"));
+ isoToName.put("ZH", Pattern.quote("{{=zh=}}"));
+ isoToName.put("JA", Pattern.quote("{{=ja=}}"));
+ isoToName.put("DE", Pattern.quote("{{=de=}}"));
+ isoToName.put("IS", Pattern.quote("{{=is=}}")); // Icelandic
+ isoToName.put("ES", Pattern.quote("{{=es=}}"));
+ isoToName.put("UK", Pattern.quote("{{=uk=}}"));
+
+ // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
+ isoToName = new LinkedHashMap<String, String>();
+ wikiCodeToIsoCodeToWikiName.put("it", isoToName);
+ isoToName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}"); // scn, nap, cal, lmo
+ isoToName.put("EN", Pattern.quote("{{-en-}}"));
+ isoToName.put("FR", Pattern.quote("{{-fr-}}"));
+ isoToName.put("DE", Pattern.quote("{{-de-}}"));
+ isoToName.put("ES", Pattern.quote("{{-es-}}"));
+ isoToName.put("JA", Pattern.quote("{{-ja-}}"));
+ isoToName.put("PL", Pattern.quote("{{-pl-}}"));
+ isoToName.put("NL", Pattern.quote("{{-nl-}}"));
+ isoToName.put("LV", Pattern.quote("{{-lv-}}"));
+ // Was keyed "LV", which overwrote the Latvian pattern above and left Latin without an entry.
+ isoToName.put("LA", Pattern.quote("{{-la-}}"));
+ isoToName.put("HU", Pattern.quote("{{-hu-}}"));
+ // (A second, identical put of "PL" was dropped here; it had no effect on the map.)
+ // NOTE(review): "grc" is presumably Ancient Greek, filed here under "EL" - confirm this is intended.
+ isoToName.put("EL", Pattern.quote("{{-grc-}}"));
+ isoToName.put("SV", Pattern.quote("{{-sv-}}"));
+
+ }
+
+}
--- /dev/null
+On Android 1.6:
+Landscape/portrait mode, switching between.
+Download dictionary.
+Add dictionary during runtime.
+Remove dictionary during runtime.
+for i in res/raw/*.html; do tidy --input-encoding utf8 --output-file $i $i; done
+
+
For next release:
+flag images
+test/fix return to last-used dictionary
downloads
history dialog
-fix up dictionary manager:
- thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up).
- check over UI.
check arabic UI fix
handle examples like "asdf (asdf)"
random word jump
* timeout on the exact search... if it can't confirm, it should just switch and go....
* reload dictionaryInfo sometime...
* change path of /sdcard/quickDic/...
+fix up dictionary manager:
+ thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up).
+ check over UI.