gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
Split EN, DE, IT, FR wiktionaries! Fix splitting to use entire header
author Thad Hughes <thad.hughes@gmail.com>
Wed, 8 Feb 2012 23:45:40 +0000 (15:45 -0800)
committer Thad Hughes <thad.hughes@gmail.com>
Wed, 8 Feb 2012 23:45:40 +0000 (15:45 -0800)
line (hopefully this works ok).

37 files changed:
data/inputs/flag_graphics/americanFlag.jpg [deleted file]
data/inputs/flag_graphics/as-lgflag.gif [deleted file]
data/inputs/flag_graphics/au-lgflag.gif [deleted file]
data/inputs/flag_graphics/be-lgflag.gif [deleted file]
data/inputs/flag_graphics/br-lgflag.gif [deleted file]
data/inputs/flag_graphics/ca-lgflag.gif [deleted file]
data/inputs/flag_graphics/ee-lgflag.gif [deleted file]
data/inputs/flag_graphics/ei-lgflag.gif [deleted file]
data/inputs/flag_graphics/flags.xcf [deleted file]
data/inputs/flag_graphics/fr-lgflag.gif [deleted file]
data/inputs/flag_graphics/germanFlag.jpg [deleted file]
data/inputs/flag_graphics/gm-lgflag.gif [deleted file]
data/inputs/flag_graphics/gr-lgflag.gif [deleted file]
data/inputs/flag_graphics/it-lgflag.gif [deleted file]
data/inputs/flag_graphics/ja-lgflag.gif [deleted file]
data/inputs/flag_graphics/mx-lgflag.gif [deleted file]
data/inputs/flag_graphics/nl-lgflag.gif [deleted file]
data/inputs/flag_graphics/no-lgflag.gif [deleted file]
data/inputs/flag_graphics/po-lgflag.gif [deleted file]
data/inputs/flag_graphics/rs-lgflag.gif [deleted file]
data/inputs/flag_graphics/sf-lgflag.gif [deleted file]
data/inputs/flag_graphics/sp-lgflag.gif [deleted file]
data/inputs/flag_graphics/sw-lgflag.gif [deleted file]
data/inputs/flag_graphics/sz-lgflag.gif [deleted file]
data/inputs/flag_graphics/ts-lgflag.gif [deleted file]
data/inputs/flag_graphics/tw-lgflag.gif [deleted file]
data/inputs/flag_graphics/uk-lgflag.gif [deleted file]
data/inputs/flag_graphics/us-lgflag.gif [deleted file]
data/inputs/flag_graphics/wa-lgflag.gif [deleted file]
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/LanguageTest.java
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java [deleted file]
src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryXmlParser.java
src/com/hughes/android/dictionary/parser/enwiktionary/WiktionaryLangs.java [new file with mode: 0644]
to_test.txt [new file with mode: 0644]
todo.txt

diff --git a/data/inputs/flag_graphics/americanFlag.jpg b/data/inputs/flag_graphics/americanFlag.jpg
deleted file mode 100755 (executable)
index 8d85b25..0000000
Binary files a/data/inputs/flag_graphics/americanFlag.jpg and /dev/null differ
diff --git a/data/inputs/flag_graphics/as-lgflag.gif b/data/inputs/flag_graphics/as-lgflag.gif
deleted file mode 100644 (file)
index 07cc33b..0000000
Binary files a/data/inputs/flag_graphics/as-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/au-lgflag.gif b/data/inputs/flag_graphics/au-lgflag.gif
deleted file mode 100644 (file)
index ff3e618..0000000
Binary files a/data/inputs/flag_graphics/au-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/be-lgflag.gif b/data/inputs/flag_graphics/be-lgflag.gif
deleted file mode 100644 (file)
index bc57368..0000000
Binary files a/data/inputs/flag_graphics/be-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/br-lgflag.gif b/data/inputs/flag_graphics/br-lgflag.gif
deleted file mode 100644 (file)
index 940432b..0000000
Binary files a/data/inputs/flag_graphics/br-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/ca-lgflag.gif b/data/inputs/flag_graphics/ca-lgflag.gif
deleted file mode 100644 (file)
index efc4496..0000000
Binary files a/data/inputs/flag_graphics/ca-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/ee-lgflag.gif b/data/inputs/flag_graphics/ee-lgflag.gif
deleted file mode 100644 (file)
index 97c742b..0000000
Binary files a/data/inputs/flag_graphics/ee-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/ei-lgflag.gif b/data/inputs/flag_graphics/ei-lgflag.gif
deleted file mode 100644 (file)
index 68a9273..0000000
Binary files a/data/inputs/flag_graphics/ei-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/flags.xcf b/data/inputs/flag_graphics/flags.xcf
deleted file mode 100755 (executable)
index b322c28..0000000
Binary files a/data/inputs/flag_graphics/flags.xcf and /dev/null differ
diff --git a/data/inputs/flag_graphics/fr-lgflag.gif b/data/inputs/flag_graphics/fr-lgflag.gif
deleted file mode 100644 (file)
index 9fa5027..0000000
Binary files a/data/inputs/flag_graphics/fr-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/germanFlag.jpg b/data/inputs/flag_graphics/germanFlag.jpg
deleted file mode 100755 (executable)
index b796c87..0000000
Binary files a/data/inputs/flag_graphics/germanFlag.jpg and /dev/null differ
diff --git a/data/inputs/flag_graphics/gm-lgflag.gif b/data/inputs/flag_graphics/gm-lgflag.gif
deleted file mode 100644 (file)
index ef82b20..0000000
Binary files a/data/inputs/flag_graphics/gm-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/gr-lgflag.gif b/data/inputs/flag_graphics/gr-lgflag.gif
deleted file mode 100644 (file)
index f8f35d0..0000000
Binary files a/data/inputs/flag_graphics/gr-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/it-lgflag.gif b/data/inputs/flag_graphics/it-lgflag.gif
deleted file mode 100644 (file)
index 18b6f03..0000000
Binary files a/data/inputs/flag_graphics/it-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/ja-lgflag.gif b/data/inputs/flag_graphics/ja-lgflag.gif
deleted file mode 100644 (file)
index af4419a..0000000
Binary files a/data/inputs/flag_graphics/ja-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/mx-lgflag.gif b/data/inputs/flag_graphics/mx-lgflag.gif
deleted file mode 100644 (file)
index 2569250..0000000
Binary files a/data/inputs/flag_graphics/mx-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/nl-lgflag.gif b/data/inputs/flag_graphics/nl-lgflag.gif
deleted file mode 100644 (file)
index e6fa805..0000000
Binary files a/data/inputs/flag_graphics/nl-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/no-lgflag.gif b/data/inputs/flag_graphics/no-lgflag.gif
deleted file mode 100644 (file)
index c29a5eb..0000000
Binary files a/data/inputs/flag_graphics/no-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/po-lgflag.gif b/data/inputs/flag_graphics/po-lgflag.gif
deleted file mode 100644 (file)
index e7a49d8..0000000
Binary files a/data/inputs/flag_graphics/po-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/rs-lgflag.gif b/data/inputs/flag_graphics/rs-lgflag.gif
deleted file mode 100644 (file)
index c958629..0000000
Binary files a/data/inputs/flag_graphics/rs-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/sf-lgflag.gif b/data/inputs/flag_graphics/sf-lgflag.gif
deleted file mode 100644 (file)
index 3750169..0000000
Binary files a/data/inputs/flag_graphics/sf-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/sp-lgflag.gif b/data/inputs/flag_graphics/sp-lgflag.gif
deleted file mode 100644 (file)
index 7cf2cb7..0000000
Binary files a/data/inputs/flag_graphics/sp-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/sw-lgflag.gif b/data/inputs/flag_graphics/sw-lgflag.gif
deleted file mode 100644 (file)
index 4fdb247..0000000
Binary files a/data/inputs/flag_graphics/sw-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/sz-lgflag.gif b/data/inputs/flag_graphics/sz-lgflag.gif
deleted file mode 100644 (file)
index 8184466..0000000
Binary files a/data/inputs/flag_graphics/sz-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/ts-lgflag.gif b/data/inputs/flag_graphics/ts-lgflag.gif
deleted file mode 100644 (file)
index ae0c7f9..0000000
Binary files a/data/inputs/flag_graphics/ts-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/tw-lgflag.gif b/data/inputs/flag_graphics/tw-lgflag.gif
deleted file mode 100644 (file)
index 4c9c400..0000000
Binary files a/data/inputs/flag_graphics/tw-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/uk-lgflag.gif b/data/inputs/flag_graphics/uk-lgflag.gif
deleted file mode 100644 (file)
index 17b15b7..0000000
Binary files a/data/inputs/flag_graphics/uk-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/us-lgflag.gif b/data/inputs/flag_graphics/us-lgflag.gif
deleted file mode 100644 (file)
index 7269199..0000000
Binary files a/data/inputs/flag_graphics/us-lgflag.gif and /dev/null differ
diff --git a/data/inputs/flag_graphics/wa-lgflag.gif b/data/inputs/flag_graphics/wa-lgflag.gif
deleted file mode 100644 (file)
index 63a7799..0000000
Binary files a/data/inputs/flag_graphics/wa-lgflag.gif and /dev/null differ
index 6e640d6da2a480c99dbeaaf203677be2da415a78..9f33164dde87925f6c8caca39d0b50c1fecf7739 100644 (file)
@@ -19,7 +19,7 @@ import java.util.Map;
 
 import junit.framework.TestCase;
 
-import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs;
 
 public class DictionaryBuilderMain extends TestCase {
   
@@ -31,7 +31,7 @@ public class DictionaryBuilderMain extends TestCase {
     
     // Builds all the dictionaries it can, outputs list to a text file.
     
-    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(EnWiktionaryLangs.isoCodeToWikiName);
+    final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(WiktionaryLangs.isoCodeToWikiName);
     isoToWikiName.remove("EN");
     isoToWikiName.remove("DE");
 
index ee0b227e22bf764ba52e212fe9efa38d62a47e8a..e281e50acb7c4cc0aee5128a8ce5dbeb51d0db35 100644 (file)
@@ -23,7 +23,7 @@ import java.util.Set;
 
 import junit.framework.TestCase;
 
-import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs;
 import com.ibm.icu.text.Transliterator;
 
 public class LanguageTest extends TestCase {
@@ -170,8 +170,8 @@ public class LanguageTest extends TestCase {
   }
 
   public void testEnWiktionaryNames() {
-    final Set<String> enLangs = new LinkedHashSet<String>(EnWiktionaryLangs.isoCodeToWikiName.keySet());
-    for (final String code : EnWiktionaryLangs.isoCodeToWikiName.keySet()) {
+    final Set<String> enLangs = new LinkedHashSet<String>(WiktionaryLangs.isoCodeToWikiName.keySet());
+    for (final String code : WiktionaryLangs.isoCodeToWikiName.keySet()) {
       enLangs.add(code.toLowerCase());
     }
     assertEquals(enLangs.toString(), Language.isoCodeToResources.keySet().toString());
index 628d3567e7a1ad030a5e828599e8458088501db0..d0423b3bd95b4880117429c06ec9938be588de13 100644 (file)
@@ -20,6 +20,7 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
@@ -32,72 +33,63 @@ import javax.xml.parsers.SAXParserFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
-import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+import com.hughes.android.dictionary.parser.enwiktionary.WiktionaryLangs;
 
 public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
-  
-  private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-pages-articles.xml";
-  
-  static class Section implements java.io.Serializable {
-    private static final long serialVersionUID = -7676549898325856822L;
 
-    final String title;
-    final String heading;
-    final String text;
-    
-    public Section(final String title, final String heading, final String text) {
-      this.title = title;
-      this.heading = heading;
-      this.text = text;
-      
-      //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
-    }
-  }
+  // This matches the whole line, otherwise regexes don't work well on French:
+  // {{=uk=}}
+  static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+  
+  final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+  List<Selector> currentSelectors = null;
   
-  static class Selector {
-    DataOutputStream out;
-    Pattern pattern;
-    
-    public Selector(final String filename, final String pattern) throws IOException {
-      this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
-      this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
-    }
-  }
-
-  final List<Selector> selectors = new ArrayList<Selector>();
   StringBuilder titleBuilder;
   StringBuilder textBuilder;
   StringBuilder currentBuilder = null;
 
   public static void main(final String[] args) throws SAXException, IOException, ParserConfigurationException {
-    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
     final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
-    
-    // Configure things.
-    
-    final List<Selector> selectors = wiktionarySplitter.selectors;
-    for (int i = 1; i < args.length; i += 2) {
-      final Selector selector = new Selector(args[i], args[i+1]);
-      selectors.add(selector);
+    wiktionarySplitter.go();
+  }
+  
+  private WiktionarySplitter() {
+    List<Selector> selectors;
+    for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
+      //if (!code.equals("fr")) {continue;}
+      selectors = new ArrayList<WiktionarySplitter.Selector>();
+      pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
+      for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
+        final String dir = String.format("data/inputs/wikiSplit/%s", code);
+        new File(dir).mkdirs();
+        selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
+      }
     }
+  }
 
-    if (selectors.isEmpty()) {
-      for (final Map.Entry<String, String> entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) {
-        selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue()));
+  private void go() throws ParserConfigurationException, SAXException, IOException {
+    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+
+    // Configure things.
+    for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
+      
+      currentSelectors = pathToSelectorsEntry.getValue();
+      
+      for (final Selector selector : currentSelectors) {
+        selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
       }
-    }
-    
-    // Do it.
-    parser.parse(new File(FILE_TO_SPLIT), wiktionarySplitter);
-    
-    // Shutdown.
-    for (final Selector selector : selectors) {
-      selector.out.close();
+  
+      // Do it.
+      parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+      
+      // Shutdown.
+      for (final Selector selector : currentSelectors) {
+        selector.out.close();
+      }
+      
     }
   }
 
-  static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE);
-  
   int pageCount = 0;
   private void endPage() {
     final String title = titleBuilder.toString();
@@ -116,12 +108,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
       text = text.substring(startMatcher.end());
       
       final String heading = startMatcher.group();
-      for (final Selector selector : selectors) {
+      for (final Selector selector : currentSelectors) {
         if (selector.pattern.matcher(heading).find()) {
           
           // Find end.
           final int depth = startMatcher.group(1).length();
-          final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=]+=+", depth), Pattern.MULTILINE);
+          final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
           
           final Matcher endMatcher = endPattern.matcher(text);
           final int end;
@@ -151,6 +143,36 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     
   }
 
+  // -----------------------------------------------------------------------
+
+  static class Section implements java.io.Serializable {
+    private static final long serialVersionUID = -7676549898325856822L;
+
+    final String title;
+    final String heading;
+    final String text;
+    
+    public Section(final String title, final String heading, final String text) {
+      this.title = title;
+      this.heading = heading;
+      this.text = text;
+      
+      //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
+    }
+  }
+  
+  static class Selector {
+    final String outFilename;
+    final Pattern pattern;
+
+    DataOutputStream out;
+
+    public Selector(final String filename, final String pattern) {
+      this.outFilename = filename;
+      this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+    }
+  }
+
   // -----------------------------------------------------------------------
   
     @Override
@@ -191,5 +213,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
       final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
       parser.parse(file, this);
     }
+
+    
     
 }
diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java
deleted file mode 100644 (file)
index 0b24556..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-package com.hughes.android.dictionary.parser.enwiktionary;
-
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-public class EnWiktionaryLangs {
-  
-  public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
-  static {
-    isoCodeToWikiName.put("AF", "Afrikaans");
-    isoCodeToWikiName.put("SQ", "Albanian");
-    isoCodeToWikiName.put("AR", "Arabic");
-    isoCodeToWikiName.put("HY", "Armenian");
-    isoCodeToWikiName.put("BE", "Belarusian");
-    isoCodeToWikiName.put("BN", "Bengali");
-    isoCodeToWikiName.put("BS", "Bosnian");
-    isoCodeToWikiName.put("BG", "Bulgarian");
-    isoCodeToWikiName.put("CA", "Catalan");
-    isoCodeToWikiName.put("HR", "Croatian");
-    isoCodeToWikiName.put("CS", "Czech");
-    isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
-    isoCodeToWikiName.put("DA", "Danish");
-    isoCodeToWikiName.put("NL", "Dutch");
-    isoCodeToWikiName.put("EN", "English");
-    isoCodeToWikiName.put("EO", "Esperanto");
-    isoCodeToWikiName.put("ET", "Estonian");
-    isoCodeToWikiName.put("FI", "Finnish");
-    isoCodeToWikiName.put("FR", "French");
-    isoCodeToWikiName.put("DE", "German");
-    isoCodeToWikiName.put("EL", "Greek");
-    isoCodeToWikiName.put("haw", "Hawaiian");
-    isoCodeToWikiName.put("HE", "Hebrew");
-    isoCodeToWikiName.put("HI", "Hindi");
-    isoCodeToWikiName.put("HU", "Hungarian");
-    isoCodeToWikiName.put("IS", "Icelandic");
-    isoCodeToWikiName.put("ID", "Indonesian");
-    isoCodeToWikiName.put("GA", "Gaelic");
-    isoCodeToWikiName.put("IT", "Italian");
-    isoCodeToWikiName.put("LA", "Latin");
-    isoCodeToWikiName.put("LV", "Latvian");
-    isoCodeToWikiName.put("LT", "Lithuanian");
-    isoCodeToWikiName.put("JA", "Japanese");
-    isoCodeToWikiName.put("KO", "Korean");
-    isoCodeToWikiName.put("KU", "Kurdish");
-    isoCodeToWikiName.put("MS", "Malay");
-    isoCodeToWikiName.put("MI", "Maori");
-    isoCodeToWikiName.put("MN", "Mongolian");
-    isoCodeToWikiName.put("NE", "Nepali");
-    isoCodeToWikiName.put("NO", "Norwegian");
-    isoCodeToWikiName.put("FA", "Persian");
-    isoCodeToWikiName.put("PL", "Polish");
-    isoCodeToWikiName.put("PT", "Portuguese");
-    isoCodeToWikiName.put("PA", "Punjabi");
-    isoCodeToWikiName.put("RO", "Romanian");
-    isoCodeToWikiName.put("RU", "Russian");
-    isoCodeToWikiName.put("SA", "Sanskrit");
-    isoCodeToWikiName.put("SR", "Serbian");
-    isoCodeToWikiName.put("SK", "Slovak");
-    isoCodeToWikiName.put("SO", "Somali");
-    isoCodeToWikiName.put("ES", "Spanish");
-    isoCodeToWikiName.put("SW", "Swahili");
-    isoCodeToWikiName.put("SV", "Swedish");
-    isoCodeToWikiName.put("TL", "Tagalog");
-    isoCodeToWikiName.put("TG", "Tajik");
-    isoCodeToWikiName.put("TH", "Thai");
-    isoCodeToWikiName.put("BO", "Tibetan");
-    isoCodeToWikiName.put("TR", "Turkish");
-    isoCodeToWikiName.put("UK", "Ukrainian");
-    isoCodeToWikiName.put("UR", "Urdu");
-    isoCodeToWikiName.put("VI", "Vietnamese");
-    isoCodeToWikiName.put("CI", "Welsh");
-    isoCodeToWikiName.put("YI", "Yiddish");
-    isoCodeToWikiName.put("ZU", "Zulu");
-
-    
-    isoCodeToWikiName.put("AZ", "Azeri");
-    isoCodeToWikiName.put("EU", "Basque");
-    isoCodeToWikiName.put("BR", "Breton");
-    isoCodeToWikiName.put("MR", "Burmese");
-    isoCodeToWikiName.put("FO", "Faroese");
-    isoCodeToWikiName.put("GL", "Galician");
-    isoCodeToWikiName.put("KA", "Georgian");
-    isoCodeToWikiName.put("HT", "Haitian Creole");
-    isoCodeToWikiName.put("LB", "Luxembourgish");
-    isoCodeToWikiName.put("MK", "Macedonian");
-    
-  }
-
-
-}
index 27246cd42f1fab8d8385a83984f71b63fa50e94a..babc696577233bd3e6d0d3a267b547f47e0643e7 100644 (file)
@@ -119,10 +119,10 @@ public class EnWiktionaryXmlParser {
     }
     } finally {
       System.out.println("lang Counts: " + appendAndIndexWikiCallback.langCodeToTCount);
-      appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.keySet());
+      appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(WiktionaryLangs.isoCodeToWikiName.keySet());
       System.out.println("unused Counts: " + appendAndIndexWikiCallback.langCodeToTCount);
       System.out.println("lang Counts: " + langNameToTCount);
-      langNameToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.values());
+      langNameToTCount.keySet().removeAll(WiktionaryLangs.isoCodeToWikiName.values());
       System.out.println("unknown counts: " + langNameToTCount);
     }
   }
diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/WiktionaryLangs.java
new file mode 100644 (file)
index 0000000..77ff567
--- /dev/null
@@ -0,0 +1,154 @@
+package com.hughes.android.dictionary.parser.enwiktionary;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+public class WiktionaryLangs {
+  
+  public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
+  static {
+    isoCodeToWikiName.put("AF", "Afrikaans");
+    isoCodeToWikiName.put("SQ", "Albanian");
+    isoCodeToWikiName.put("AR", "Arabic");
+    isoCodeToWikiName.put("HY", "Armenian");
+    isoCodeToWikiName.put("BE", "Belarusian");
+    isoCodeToWikiName.put("BN", "Bengali");
+    isoCodeToWikiName.put("BS", "Bosnian");
+    isoCodeToWikiName.put("BG", "Bulgarian");
+    isoCodeToWikiName.put("CA", "Catalan");
+    isoCodeToWikiName.put("HR", "Croatian");
+    isoCodeToWikiName.put("CS", "Czech");
+    isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
+    isoCodeToWikiName.put("DA", "Danish");
+    isoCodeToWikiName.put("NL", "Dutch");
+    isoCodeToWikiName.put("EN", "English");
+    isoCodeToWikiName.put("EO", "Esperanto");
+    isoCodeToWikiName.put("ET", "Estonian");
+    isoCodeToWikiName.put("FI", "Finnish");
+    isoCodeToWikiName.put("FR", "French");
+    isoCodeToWikiName.put("DE", "German");
+    isoCodeToWikiName.put("EL", "Greek");
+    isoCodeToWikiName.put("haw", "Hawaiian");
+    isoCodeToWikiName.put("HE", "Hebrew");
+    isoCodeToWikiName.put("HI", "Hindi");
+    isoCodeToWikiName.put("HU", "Hungarian");
+    isoCodeToWikiName.put("IS", "Icelandic");
+    isoCodeToWikiName.put("ID", "Indonesian");
+    isoCodeToWikiName.put("GA", "Gaelic");
+    isoCodeToWikiName.put("IT", "Italian");
+    isoCodeToWikiName.put("LA", "Latin");
+    isoCodeToWikiName.put("LV", "Latvian");
+    isoCodeToWikiName.put("LT", "Lithuanian");
+    isoCodeToWikiName.put("JA", "Japanese");
+    isoCodeToWikiName.put("KO", "Korean");
+    isoCodeToWikiName.put("KU", "Kurdish");
+    isoCodeToWikiName.put("MS", "Malay");
+    isoCodeToWikiName.put("MI", "Maori");
+    isoCodeToWikiName.put("MN", "Mongolian");
+    isoCodeToWikiName.put("NE", "Nepali");
+    isoCodeToWikiName.put("NO", "Norwegian");
+    isoCodeToWikiName.put("FA", "Persian");
+    isoCodeToWikiName.put("PL", "Polish");
+    isoCodeToWikiName.put("PT", "Portuguese");
+    isoCodeToWikiName.put("PA", "Punjabi");
+    isoCodeToWikiName.put("RO", "Romanian");
+    isoCodeToWikiName.put("RU", "Russian");
+    isoCodeToWikiName.put("SA", "Sanskrit");
+    isoCodeToWikiName.put("SR", "Serbian");
+    isoCodeToWikiName.put("SK", "Slovak");
+    isoCodeToWikiName.put("SO", "Somali");
+    isoCodeToWikiName.put("ES", "Spanish");
+    isoCodeToWikiName.put("SW", "Swahili");
+    isoCodeToWikiName.put("SV", "Swedish");
+    isoCodeToWikiName.put("TL", "Tagalog");
+    isoCodeToWikiName.put("TG", "Tajik");
+    isoCodeToWikiName.put("TH", "Thai");
+    isoCodeToWikiName.put("BO", "Tibetan");
+    isoCodeToWikiName.put("TR", "Turkish");
+    isoCodeToWikiName.put("UK", "Ukrainian");
+    isoCodeToWikiName.put("UR", "Urdu");
+    isoCodeToWikiName.put("VI", "Vietnamese");
+    isoCodeToWikiName.put("CI", "Welsh");
+    isoCodeToWikiName.put("YI", "Yiddish");
+    isoCodeToWikiName.put("ZU", "Zulu");
+    
+    isoCodeToWikiName.put("AZ", "Azeri");
+    isoCodeToWikiName.put("EU", "Basque");
+    isoCodeToWikiName.put("BR", "Breton");
+    isoCodeToWikiName.put("MR", "Burmese");
+    isoCodeToWikiName.put("FO", "Faroese");
+    isoCodeToWikiName.put("GL", "Galician");
+    isoCodeToWikiName.put("KA", "Georgian");
+    isoCodeToWikiName.put("HT", "Haitian Creole");
+    isoCodeToWikiName.put("LB", "Luxembourgish");
+    isoCodeToWikiName.put("MK", "Macedonian");
+    
+  }
+
+  public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
+  static {
+    // en
+    wikiCodeToIsoCodeToWikiName.put("en", isoCodeToWikiName);
+    
+    Map<String,String> isoCodeToWikiName;
+    
+    // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
+    isoCodeToWikiName = new LinkedHashMap<String, String>();
+    wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
+    isoCodeToWikiName.put("DE", "Deutsch");
+    isoCodeToWikiName.put("EN", "Englisch");
+    isoCodeToWikiName.put("IT", "Italienisch");
+    isoCodeToWikiName.put("PL", "Polnisch");
+    isoCodeToWikiName.put("FR", "Französisch");
+    isoCodeToWikiName.put("EO", "Esperanto");
+    isoCodeToWikiName.put("CA", "Katalanisch");
+    isoCodeToWikiName.put("LA", "Lateinisch");
+    isoCodeToWikiName.put("CS", "Tschechisch");
+    isoCodeToWikiName.put("HU", "Ungarisch");
+    isoCodeToWikiName.put("SV", "Schwedisch");
+    isoCodeToWikiName.put("ES", "Spanisch");
+
+    // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
+    isoCodeToWikiName = new LinkedHashMap<String, String>();
+    wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
+    isoCodeToWikiName.put("FR", Pattern.quote("{{=fr=}}"));
+    isoCodeToWikiName.put("RU", Pattern.quote("{{=ru=}}"));
+    isoCodeToWikiName.put("BG", Pattern.quote("{{=bg=}}"));  // Bulgarian
+    isoCodeToWikiName.put("EN", Pattern.quote("{{=en=}}"));
+    //isoCodeToWikiName.put("", Pattern.quote("{{=sl=}}"));
+    isoCodeToWikiName.put("LA", Pattern.quote("{{=la=}}"));
+    isoCodeToWikiName.put("IT", Pattern.quote("{{=it=}}"));
+    isoCodeToWikiName.put("EO", Pattern.quote("{{=eo=}}"));
+    isoCodeToWikiName.put("CS", Pattern.quote("{{=cs=}}"));  // Czech
+    isoCodeToWikiName.put("NL", Pattern.quote("{{=nl=}}"));  // Dutch
+    //isoCodeToWikiName.put("", Pattern.quote("{{=mg=}}"));
+    //isoCodeToWikiName.put("", Pattern.quote("{{=hsb=}}"));
+    isoCodeToWikiName.put("ZH", Pattern.quote("{{=zh=}}"));
+    isoCodeToWikiName.put("JA", Pattern.quote("{{=ja=}}"));
+    isoCodeToWikiName.put("DE", Pattern.quote("{{=de=}}"));
+    isoCodeToWikiName.put("IS", Pattern.quote("{{=is=}}"));  // Icelandic
+    isoCodeToWikiName.put("ES", Pattern.quote("{{=es=}}"));
+    isoCodeToWikiName.put("UK", Pattern.quote("{{=uk=}}"));
+
+    // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
+    isoCodeToWikiName = new LinkedHashMap<String, String>();
+    wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
+    isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
+    isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
+    isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}"));
+    isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}"));
+    isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
+    isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}"));
+    isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
+    isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}"));
+    isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}"));
+    isoCodeToWikiName.put("LV", Pattern.quote("{{-la-}}"));
+    isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
+    isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
+    isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
+    isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
+
+  }
+  
+}
diff --git a/to_test.txt b/to_test.txt
new file mode 100644 (file)
index 0000000..60e9c67
--- /dev/null
@@ -0,0 +1,5 @@
+On Android 1.6:
+Landscape/portrait mode, switching between.
+Download dictionary.
+Add dictionary during runtime.
+Remove dictionary during runtime.
index 0cc215b48758fd03ed80cca31f1d847ad3531964..17bac82c5a247f14e9e995357961187d20f3b1dc 100644 (file)
--- a/todo.txt
+++ b/todo.txt
@@ -1,9 +1,11 @@
+for i in res/raw/*.html; do tidy --input-encoding utf8  --output-file $i $i; done
+
+
 For next release:
+flag images
+test/fix return to last-used dictionary
 downloads
 history dialog
-fix up dictionary manager:
-  thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up).
-  check over UI.
 check arabic UI fix
 handle examples like "asdf (asdf)"
 random word jump
@@ -101,3 +103,6 @@ about dict dialog
 * timeout on the exact search...  if it can't confirm, it should just switch and go....
 * reload dictionaryInfo sometime...
 * change path of /sdcard/quickDic/...
+fix up dictionary manager:
+  thread that handles unzipping, downloading for the life of the application (so screen changes don't screw it up).
+  check over UI.