gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
Add support for generating IT-RU dictionary.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WiktionaryLangs.java
index 65f14427617973202ed9875f49e56a04c04ddd6e..7f52642821bc4b8b2bbafac65be248a0642fa504 100644 (file)
 
 package com.hughes.android.dictionary.parser.wiktionary;
 
+import com.hughes.android.dictionary.engine.Language;
+
 import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.Map;
+import java.util.Set;
 import java.util.regex.Pattern;
 
 public class WiktionaryLangs {
@@ -28,14 +32,13 @@ public class WiktionaryLangs {
     isoCodeToEnWikiName.put("HY", "Armenian");
     isoCodeToEnWikiName.put("BE", "Belarusian");
     isoCodeToEnWikiName.put("BN", "Bengali");
-    isoCodeToEnWikiName.put("BS", "Bosnian");
     isoCodeToEnWikiName.put("BG", "Bulgarian");
-    isoCodeToEnWikiName.put("MY", "Burmese");
-    isoCodeToEnWikiName.put("yue", "Cantonese");
     isoCodeToEnWikiName.put("CA", "Catalan");
-    isoCodeToEnWikiName.put("HR", "Croatian");
+    isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
     isoCodeToEnWikiName.put("CS", "Czech");
-    isoCodeToEnWikiName.put("ZH", "Chinese|Mandarin");
+    isoCodeToEnWikiName.put("ZH", "Chinese");
+    isoCodeToEnWikiName.put("cmn", "Mandarin");
+    isoCodeToEnWikiName.put("yue", "Cantonese");
     isoCodeToEnWikiName.put("DA", "Danish");
     isoCodeToEnWikiName.put("NL", "Dutch");
     isoCodeToEnWikiName.put("EN", "English");
@@ -44,8 +47,8 @@ public class WiktionaryLangs {
     isoCodeToEnWikiName.put("FI", "Finnish");
     isoCodeToEnWikiName.put("FR", "French");
     isoCodeToEnWikiName.put("DE", "German");
-    isoCodeToEnWikiName.put("EL", "Greek");
     isoCodeToEnWikiName.put("grc", "Ancient Greek");
+    isoCodeToEnWikiName.put("EL", "Greek");
     isoCodeToEnWikiName.put("haw", "Hawaiian");
     isoCodeToEnWikiName.put("HE", "Hebrew");
     isoCodeToEnWikiName.put("HI", "Hindi");
@@ -54,6 +57,7 @@ public class WiktionaryLangs {
     isoCodeToEnWikiName.put("ID", "Indonesian");
     isoCodeToEnWikiName.put("GA", "Irish");
     isoCodeToEnWikiName.put("GD", "Gaelic");
+    isoCodeToEnWikiName.put("GV", "Manx");
     isoCodeToEnWikiName.put("IT", "Italian");
     isoCodeToEnWikiName.put("LA", "Latin");
     isoCodeToEnWikiName.put("LV", "Latvian");
@@ -62,8 +66,8 @@ public class WiktionaryLangs {
     isoCodeToEnWikiName.put("KO", "Korean");
     isoCodeToEnWikiName.put("KU", "Kurdish");
     isoCodeToEnWikiName.put("LO", "Lao");
-    isoCodeToEnWikiName.put("MS", "Malay");
     isoCodeToEnWikiName.put("ML", "Malayalam");
+    isoCodeToEnWikiName.put("MS", "Malay");
     isoCodeToEnWikiName.put("MI", "Maori");
     isoCodeToEnWikiName.put("MN", "Mongolian");
     isoCodeToEnWikiName.put("NE", "Nepali");
@@ -75,7 +79,6 @@ public class WiktionaryLangs {
     isoCodeToEnWikiName.put("RO", "Romanian");
     isoCodeToEnWikiName.put("RU", "Russian");
     isoCodeToEnWikiName.put("SA", "Sanskrit");
-    isoCodeToEnWikiName.put("SR", "Serbian");
     isoCodeToEnWikiName.put("SK", "Slovak");
     isoCodeToEnWikiName.put("SL", "Slovene|Slovenian");
     isoCodeToEnWikiName.put("SO", "Somali");
@@ -94,18 +97,33 @@ public class WiktionaryLangs {
     isoCodeToEnWikiName.put("CI", "Welsh");
     isoCodeToEnWikiName.put("YI", "Yiddish");
     isoCodeToEnWikiName.put("ZU", "Zulu");
-    
     isoCodeToEnWikiName.put("AZ", "Azeri");
     isoCodeToEnWikiName.put("EU", "Basque");
     isoCodeToEnWikiName.put("BR", "Breton");
-    isoCodeToEnWikiName.put("MR", "Burmese");
+    isoCodeToEnWikiName.put("MR", "Marathi");
     isoCodeToEnWikiName.put("FO", "Faroese");
     isoCodeToEnWikiName.put("GL", "Galician");
     isoCodeToEnWikiName.put("KA", "Georgian");
     isoCodeToEnWikiName.put("HT", "Haitian Creole");
     isoCodeToEnWikiName.put("LB", "Luxembourgish");
     isoCodeToEnWikiName.put("MK", "Macedonian");
+    isoCodeToEnWikiName.put("GV", "Manx");
+    
+    // No longer exist as separate languages in the English Wiktionary (see "SH", Serbo-Croatian):
+    // isoCodeToEnWikiName.put("BS", "Bosnian");
+    // isoCodeToEnWikiName.put("SR", "Serbian");
+    // isoCodeToEnWikiName.put("HR", "Croatian");
     
+    // Font doesn't work:
+    //isoCodeToEnWikiName.put("MY", "Burmese");
+
+
+    {
+        //Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
+        //missing.removeAll(Language.isoCodeToResources.keySet());
+        //System.out.println(missing);
+    }
+    //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
   }
 
   public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
@@ -125,33 +143,36 @@ public class WiktionaryLangs {
     isoCodeToWikiName.put("FR", "Französisch");
     isoCodeToWikiName.put("EO", "Esperanto");
     isoCodeToWikiName.put("CA", "Katalanisch");
-    isoCodeToWikiName.put("LA", "Lateinisch");
+    isoCodeToWikiName.put("LA", "Latein");
     isoCodeToWikiName.put("CS", "Tschechisch");
     isoCodeToWikiName.put("HU", "Ungarisch");
     isoCodeToWikiName.put("SV", "Schwedisch");
     isoCodeToWikiName.put("ES", "Spanisch");
 
-    // egrep -o '\{\{=[a-zA-Z]+=\}\}' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
+    // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
     isoCodeToWikiName = new LinkedHashMap<String, String>();
     wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
-    isoCodeToWikiName.put("FR", Pattern.quote("{{=fr=}}"));
-    isoCodeToWikiName.put("RU", Pattern.quote("{{=ru=}}"));
-    isoCodeToWikiName.put("BG", Pattern.quote("{{=bg=}}"));  // Bulgarian
-    isoCodeToWikiName.put("EN", Pattern.quote("{{=en=}}"));
-    //isoCodeToWikiName.put("", Pattern.quote("{{=sl=}}"));
-    isoCodeToWikiName.put("LA", Pattern.quote("{{=la=}}"));
-    isoCodeToWikiName.put("IT", Pattern.quote("{{=it=}}"));
-    isoCodeToWikiName.put("EO", Pattern.quote("{{=eo=}}"));
-    isoCodeToWikiName.put("CS", Pattern.quote("{{=cs=}}"));  // Czech
-    isoCodeToWikiName.put("NL", Pattern.quote("{{=nl=}}"));  // Dutch
-    //isoCodeToWikiName.put("", Pattern.quote("{{=mg=}}"));
-    //isoCodeToWikiName.put("", Pattern.quote("{{=hsb=}}"));
-    isoCodeToWikiName.put("ZH", Pattern.quote("{{=zh=}}"));
-    isoCodeToWikiName.put("JA", Pattern.quote("{{=ja=}}"));
-    isoCodeToWikiName.put("DE", Pattern.quote("{{=de=}}"));
-    isoCodeToWikiName.put("IS", Pattern.quote("{{=is=}}"));  // Icelandic
-    isoCodeToWikiName.put("ES", Pattern.quote("{{=es=}}"));
-    isoCodeToWikiName.put("UK", Pattern.quote("{{=uk=}}"));
+    isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
+    isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
+    isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}"));  // Arabic
+    isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}"));  // Bulgarian
+    isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}"));
+    //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}"));
+    isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}"));
+    isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}"));
+    isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}"));
+    isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}"));  // Czech
+    isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}"));  // Dutch
+    //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}"));
+    //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}"));
+    isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}"));
+    isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}"));
+    isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}"));
+    isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}"));
+    isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}"));
+    isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}"));  // Icelandic
+    isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}"));
+    isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}"));
 
     // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
     isoCodeToWikiName = new LinkedHashMap<String, String>();
@@ -169,7 +190,28 @@ public class WiktionaryLangs {
     isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
     isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
     isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
+    isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}"));
 
+    // The "es" wiki seems to have no consistent language-header pattern, and few foreign-language entries anyway.
+    isoCodeToWikiName = new LinkedHashMap<String, String>();
+    wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
+    isoCodeToWikiName.put("ES", Pattern.quote("{{ES"));
+  }
+  public static String getEnglishName(String langCode) {
+      String name = isoCodeToEnWikiName.get(langCode);
+      if (name == null) {
+          name = isoCodeToEnWikiName.get(langCode.toUpperCase());
+      }
+      if (name == null) {
+          return null;
+      }
+      if (name.indexOf('|') != -1) {
+          return name.substring(0, name.indexOf('|'));
+      }
+      if (name.indexOf('$') != -1) {
+          return name.substring(0, name.indexOf('$'));
+      }
+      return name;  // Non-null here; an unknown code already returned null above.
   }
   
 }