]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
c3d4a230c6e485b961cc05731ada10476c6d2e3f
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WiktionaryLangs.java
1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.parser.wiktionary;
16
17 import com.hughes.android.dictionary.engine.Language;
18
19 import java.util.LinkedHashMap;
20 import java.util.LinkedHashSet;
21 import java.util.Map;
22 import java.util.Set;
23 import java.util.regex.Pattern;
24
/**
 * Static lookup tables that map language codes to the names (or heading
 * markers) under which each language appears in the various Wiktionary
 * editions.
 *
 * <p>Keys are upper-case two-letter codes (e.g. {@code "DE"}) or lower-case
 * three-letter codes (e.g. {@code "cmn"}). Values are either a plain name, an
 * alternation such as {@code "Slovene|Slovenian"}, or a regular expression
 * (many are produced with {@link Pattern#quote(String)}); presumably the
 * parsers that consume these tables treat every value as a regex.
 *
 * <p>Both maps are mutable and publicly exposed; they are populated once in the
 * static initializers below.
 */
public class WiktionaryLangs {

  /**
   * Language code to the language's name as used on the English Wiktionary.
   * Insertion order is preserved.
   */
  public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<String,String>();
  static {
    isoCodeToEnWikiName.put("AF", "Afrikaans");
    isoCodeToEnWikiName.put("SQ", "Albanian");
    isoCodeToEnWikiName.put("AR", "Arabic");
    isoCodeToEnWikiName.put("HY", "Armenian");
    isoCodeToEnWikiName.put("BE", "Belarusian");
    isoCodeToEnWikiName.put("BN", "Bengali");
    isoCodeToEnWikiName.put("BG", "Bulgarian");
    isoCodeToEnWikiName.put("CA", "Catalan");
    isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
    isoCodeToEnWikiName.put("CS", "Czech");
    isoCodeToEnWikiName.put("ZH", "Chinese");
    isoCodeToEnWikiName.put("cmn", "Mandarin");
    isoCodeToEnWikiName.put("yue", "Cantonese");
    isoCodeToEnWikiName.put("DA", "Danish");
    isoCodeToEnWikiName.put("NL", "Dutch");
    isoCodeToEnWikiName.put("EN", "English");
    isoCodeToEnWikiName.put("EO", "Esperanto");
    isoCodeToEnWikiName.put("ET", "Estonian");
    isoCodeToEnWikiName.put("FI", "Finnish");
    isoCodeToEnWikiName.put("FR", "French");
    isoCodeToEnWikiName.put("DE", "German");
    isoCodeToEnWikiName.put("grc", "Ancient Greek");
    isoCodeToEnWikiName.put("EL", "Greek");
    isoCodeToEnWikiName.put("haw", "Hawaiian");
    isoCodeToEnWikiName.put("HE", "Hebrew");
    isoCodeToEnWikiName.put("HI", "Hindi");
    isoCodeToEnWikiName.put("HU", "Hungarian");
    isoCodeToEnWikiName.put("IS", "Icelandic");
    isoCodeToEnWikiName.put("ID", "Indonesian");
    isoCodeToEnWikiName.put("GA", "Irish");
    isoCodeToEnWikiName.put("GD", "Gaelic");
    isoCodeToEnWikiName.put("GV", "Manx");
    isoCodeToEnWikiName.put("IT", "Italian");
    isoCodeToEnWikiName.put("LA", "Latin");
    isoCodeToEnWikiName.put("LV", "Latvian");
    isoCodeToEnWikiName.put("LT", "Lithuanian");
    isoCodeToEnWikiName.put("JA", "Japanese");
    isoCodeToEnWikiName.put("KO", "Korean");
    isoCodeToEnWikiName.put("KU", "Kurdish");
    isoCodeToEnWikiName.put("LO", "Lao");
    isoCodeToEnWikiName.put("ML", "Malayalam");
    isoCodeToEnWikiName.put("MS", "Malay");
    isoCodeToEnWikiName.put("MI", "Maori");
    isoCodeToEnWikiName.put("MN", "Mongolian");
    isoCodeToEnWikiName.put("NE", "Nepali");
    isoCodeToEnWikiName.put("NO", "Norwegian");
    isoCodeToEnWikiName.put("FA", "Persian");
    isoCodeToEnWikiName.put("PL", "Polish");
    isoCodeToEnWikiName.put("PT", "Portuguese");
    isoCodeToEnWikiName.put("PA", "Punjabi");
    isoCodeToEnWikiName.put("RO", "Romanian");
    isoCodeToEnWikiName.put("RU", "Russian");
    isoCodeToEnWikiName.put("SA", "Sanskrit");
    isoCodeToEnWikiName.put("SK", "Slovak");
    // Two accepted spellings; getEnglishName() returns the first one.
    isoCodeToEnWikiName.put("SL", "Slovene|Slovenian");
    isoCodeToEnWikiName.put("SO", "Somali");
    isoCodeToEnWikiName.put("ES", "Spanish");
    isoCodeToEnWikiName.put("SW", "Swahili");
    isoCodeToEnWikiName.put("SV", "Swedish");
    isoCodeToEnWikiName.put("TL", "Tagalog");
    isoCodeToEnWikiName.put("TG", "Tajik");
    isoCodeToEnWikiName.put("TA", "Tamil");
    isoCodeToEnWikiName.put("TH", "Thai");
    isoCodeToEnWikiName.put("BO", "Tibetan");
    isoCodeToEnWikiName.put("TR", "Turkish");
    isoCodeToEnWikiName.put("UK", "Ukrainian");
    isoCodeToEnWikiName.put("UR", "Urdu");
    isoCodeToEnWikiName.put("VI", "Vietnamese");
    // NOTE(review): the ISO 639-1 code for Welsh is "cy"; "CI" looks like a
    // typo. Left unchanged because other code may look this key up - confirm
    // against Language.isoCodeToResources before renaming.
    isoCodeToEnWikiName.put("CI", "Welsh");
    isoCodeToEnWikiName.put("YI", "Yiddish");
    isoCodeToEnWikiName.put("ZU", "Zulu");
    isoCodeToEnWikiName.put("AZ", "Azeri");
    isoCodeToEnWikiName.put("EU", "Basque");
    isoCodeToEnWikiName.put("BR", "Breton");
    isoCodeToEnWikiName.put("MR", "Marathi");
    isoCodeToEnWikiName.put("FO", "Faroese");
    isoCodeToEnWikiName.put("GL", "Galician");
    isoCodeToEnWikiName.put("KA", "Georgian");
    isoCodeToEnWikiName.put("HT", "Haitian Creole");
    isoCodeToEnWikiName.put("LB", "Luxembourgish");
    isoCodeToEnWikiName.put("MK", "Macedonian");

    // No longer exists in EN:
    // isoCodeToEnWikiName.put("BS", "Bosnian");
    // isoCodeToEnWikiName.put("SR", "Serbian");
    // isoCodeToEnWikiName.put("HR", "Croatian");

    // Font doesn't work:
    //isoCodeToEnWikiName.put("MY", "Burmese");

    // Disabled sanity check: every code above should also be known to Language.
    //Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
    //missing.removeAll(Language.isoCodeToResources.keySet());
    //System.out.println(missing);
    //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
  }

  /**
   * Wiktionary edition code ("en", "de", "fr", "it", "es") to that edition's
   * own language-code-to-name table. The "en" entry is the very same map
   * instance as {@link #isoCodeToEnWikiName}.
   */
  public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
  static {
    // en
    wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName);

    Map<String,String> isoCodeToWikiName;

    // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
    isoCodeToWikiName = new LinkedHashMap<String, String>();
    wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
    isoCodeToWikiName.put("DE", "Deutsch");
    isoCodeToWikiName.put("EN", "Englisch");
    isoCodeToWikiName.put("IT", "Italienisch");
    isoCodeToWikiName.put("PL", "Polnisch");
    isoCodeToWikiName.put("FR", "Französisch");
    isoCodeToWikiName.put("EO", "Esperanto");
    isoCodeToWikiName.put("CA", "Katalanisch");
    isoCodeToWikiName.put("LA", "Latein");
    isoCodeToWikiName.put("CS", "Tschechisch");
    isoCodeToWikiName.put("HU", "Ungarisch");
    isoCodeToWikiName.put("SV", "Schwedisch");
    isoCodeToWikiName.put("ES", "Spanisch");

    // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
    isoCodeToWikiName = new LinkedHashMap<String, String>();
    wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
    isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
    isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
    isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}"));  // Arabic
    isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}"));  // Bulgarian
    isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}"));
    //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}"));
    isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}"));
    isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}"));
    isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}"));
    isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}"));  // Czech
    isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}"));  // Dutch
    //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}"));
    //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}"));
    isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}"));
    isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}"));
    isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}"));
    isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}"));
    isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}"));
    isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}"));  // Icelandic
    isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}"));
    isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}"));

    // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
    isoCodeToWikiName = new LinkedHashMap<String, String>();
    wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
    // Hand-written regex (not quoted): the scn, nap, cal and lmo headings are
    // folded into the IT entry.
    isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
    isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
    isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}"));
    isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}"));
    isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
    isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}"));
    isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
    isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}"));
    isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}"));
    isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}"));
    isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
    // NOTE(review): key "EL" is plain "Greek" in the English table, while
    // "grc" is "Ancient Greek" there; pairing EL with {{-grc-}} may be
    // intentional or a mix-up - confirm before changing.
    isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
    isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));

    // There seems to be no consistent pattern and few foreign language entries anyway
    isoCodeToWikiName = new LinkedHashMap<String, String>();
    wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
    isoCodeToWikiName.put("ES", Pattern.quote("{{ES"));
  }

  /**
   * Returns the plain English name for a language code.
   *
   * <p>The code is looked up in {@link #isoCodeToEnWikiName} first exactly as
   * given, then upper-cased, then lower-cased, so both {@code "de"} and
   * {@code "CMN"} resolve. Case conversion uses {@code Locale.ROOT} so the
   * result does not depend on the JVM's default locale (under a Turkish
   * locale, {@code "it".toUpperCase()} would not yield {@code "IT"}).
   *
   * <p>If the stored value contains {@code '|'} only the text before the
   * first {@code '|'} is returned; otherwise, if it contains {@code '$'}, only
   * the text before the first {@code '$'} is returned.
   *
   * @param langCode language code such as {@code "DE"} or {@code "cmn"}; may
   *     be {@code null}
   * @return the English name, or {@code null} if {@code langCode} is
   *     {@code null} or unknown
   */
  public static String getEnglishName(String langCode) {
    if (langCode == null) {
      return null;
    }
    String name = isoCodeToEnWikiName.get(langCode);
    if (name == null) {
      // Fully qualified to avoid touching the file's import list.
      name = isoCodeToEnWikiName.get(langCode.toUpperCase(java.util.Locale.ROOT));
    }
    if (name == null) {
      // Three-letter codes are stored in lower case.
      name = isoCodeToEnWikiName.get(langCode.toLowerCase(java.util.Locale.ROOT));
    }
    if (name == null) {
      return null;
    }
    final int barIndex = name.indexOf('|');
    if (barIndex != -1) {
      return name.substring(0, barIndex);
    }
    final int dollarIndex = name.indexOf('$');
    if (dollarIndex != -1) {
      return name.substring(0, dollarIndex);
    }
    return name;  // never null here
  }

}