1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.wiktionary;
import com.hughes.android.dictionary.engine.Language;

import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Static lookup tables relating language codes to the language names (or
 * section-heading templates) used by the individual Wiktionary editions.
 *
 * <p>Both tables are {@link LinkedHashMap}s, so iteration follows the order in
 * which entries are added below. That order is significant: see the note next
 * to the "Low German" entry. Several values contain regular-expression syntax
 * (an alternation such as {@code Slovene|Slovenian}, or the result of
 * {@link Pattern#quote(String)}), so they appear to be consumed as regex
 * fragments by the parsers -- NOTE(review): confirm against the callers.
 */
public class WiktionaryLangs {

    /**
     * Language code -> language name as written on the English Wiktionary.
     * Keys are mostly upper-case two-letter codes, plus some lower-case codes
     * ("cmn", "nds", "cu", ...); lookups via {@link #getEnglishName(String)}
     * try the code as given first and then its upper-case form.
     */
    public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<String,String>();
    static {
        isoCodeToEnWikiName.put("AF", "Afrikaans");
        isoCodeToEnWikiName.put("SQ", "Albanian");
        isoCodeToEnWikiName.put("AR", "Arabic");
        isoCodeToEnWikiName.put("HY", "Armenian");
        isoCodeToEnWikiName.put("BE", "Belarusian");
        isoCodeToEnWikiName.put("BN", "Bengali");
        isoCodeToEnWikiName.put("BG", "Bulgarian");
        isoCodeToEnWikiName.put("CA", "Catalan");
        isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
        isoCodeToEnWikiName.put("CS", "Czech");
        isoCodeToEnWikiName.put("ZH", "Chinese");
        isoCodeToEnWikiName.put("cmn", "Mandarin");
        isoCodeToEnWikiName.put("yue", "Cantonese");
        isoCodeToEnWikiName.put("DA", "Danish");
        isoCodeToEnWikiName.put("NL", "Dutch");
        isoCodeToEnWikiName.put("EN", "English");
        isoCodeToEnWikiName.put("EO", "Esperanto");
        isoCodeToEnWikiName.put("ET", "Estonian");
        isoCodeToEnWikiName.put("FI", "Finnish");
        isoCodeToEnWikiName.put("FR", "French");
        // Note: must be before German since matcher
        // simply takes first match instead of best.
        isoCodeToEnWikiName.put("nds", "Low German");
        isoCodeToEnWikiName.put("pdc", "Pennsylvania German");
        isoCodeToEnWikiName.put("DE", "German");
        isoCodeToEnWikiName.put("grc", "Ancient Greek");
        isoCodeToEnWikiName.put("EL", "Greek");
        isoCodeToEnWikiName.put("haw", "Hawaiian");
        isoCodeToEnWikiName.put("HE", "Hebrew");
        isoCodeToEnWikiName.put("HI", "Hindi");
        isoCodeToEnWikiName.put("HU", "Hungarian");
        isoCodeToEnWikiName.put("IS", "Icelandic");
        isoCodeToEnWikiName.put("ID", "Indonesian");
        isoCodeToEnWikiName.put("GA", "Irish");
        isoCodeToEnWikiName.put("GD", "Gaelic");
        isoCodeToEnWikiName.put("GV", "Manx");
        isoCodeToEnWikiName.put("IT", "Italian");
        isoCodeToEnWikiName.put("LA", "Latin");
        isoCodeToEnWikiName.put("LV", "Latvian");
        isoCodeToEnWikiName.put("LT", "Lithuanian");
        isoCodeToEnWikiName.put("JA", "Japanese");
        isoCodeToEnWikiName.put("KO", "Korean");
        isoCodeToEnWikiName.put("KU", "Kurdish");
        isoCodeToEnWikiName.put("LO", "Lao");
        isoCodeToEnWikiName.put("ML", "Malayalam");
        isoCodeToEnWikiName.put("MS", "Malay");
        isoCodeToEnWikiName.put("MI", "Maori");
        isoCodeToEnWikiName.put("MN", "Mongolian");
        isoCodeToEnWikiName.put("NE", "Nepali");
        isoCodeToEnWikiName.put("NO", "Norwegian");
        isoCodeToEnWikiName.put("FA", "Persian");
        isoCodeToEnWikiName.put("PL", "Polish");
        isoCodeToEnWikiName.put("PT", "Portuguese");
        isoCodeToEnWikiName.put("PA", "Punjabi");
        isoCodeToEnWikiName.put("RO", "Romanian");
        isoCodeToEnWikiName.put("RU", "Russian");
        isoCodeToEnWikiName.put("SA", "Sanskrit");
        isoCodeToEnWikiName.put("SK", "Slovak");
        // Two spellings are accepted; getEnglishName() reports only the first.
        isoCodeToEnWikiName.put("SL", "Slovene|Slovenian");
        isoCodeToEnWikiName.put("SO", "Somali");
        isoCodeToEnWikiName.put("ES", "Spanish");
        isoCodeToEnWikiName.put("SW", "Swahili");
        isoCodeToEnWikiName.put("SV", "Swedish");
        isoCodeToEnWikiName.put("TL", "Tagalog");
        isoCodeToEnWikiName.put("TG", "Tajik");
        isoCodeToEnWikiName.put("TA", "Tamil");
        isoCodeToEnWikiName.put("TH", "Thai");
        isoCodeToEnWikiName.put("BO", "Tibetan");
        isoCodeToEnWikiName.put("TR", "Turkish");
        isoCodeToEnWikiName.put("UK", "Ukrainian");
        isoCodeToEnWikiName.put("UR", "Urdu");
        isoCodeToEnWikiName.put("VI", "Vietnamese");
        // NOTE(review): ISO 639-1 assigns "cy" to Welsh. The "CI" key is left
        // unchanged here because other code may look it up -- confirm.
        isoCodeToEnWikiName.put("CI", "Welsh");
        isoCodeToEnWikiName.put("YI", "Yiddish");
        isoCodeToEnWikiName.put("ZU", "Zulu");
        isoCodeToEnWikiName.put("AZ", "Azeri");
        isoCodeToEnWikiName.put("EU", "Basque");
        isoCodeToEnWikiName.put("BR", "Breton");
        isoCodeToEnWikiName.put("MR", "Marathi");
        isoCodeToEnWikiName.put("FO", "Faroese");
        isoCodeToEnWikiName.put("GL", "Galician");
        isoCodeToEnWikiName.put("KA", "Georgian");
        isoCodeToEnWikiName.put("HT", "Haitian Creole");
        isoCodeToEnWikiName.put("LB", "Luxembourgish");
        isoCodeToEnWikiName.put("MK", "Macedonian");
        // "GV" (Manx) is already registered above; a second put of the same
        // key/value would change neither the value nor the iteration order.
        isoCodeToEnWikiName.put("scn", "Sicilian");
        isoCodeToEnWikiName.put("cu", "Old Church Slavonic");
        isoCodeToEnWikiName.put("rom", "Romani");

        // No longer exists in EN:
        // isoCodeToEnWikiName.put("BS", "Bosnian");
        // isoCodeToEnWikiName.put("SR", "Serbian");
        // isoCodeToEnWikiName.put("HR", "Croatian");

        // Font doesn't work:
        //isoCodeToEnWikiName.put("MY", "Burmese");

        // Disabled consistency check against Language.isoCodeToResources:
        //Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
        //missing.removeAll(Language.isoCodeToResources.keySet());
        //System.out.println(missing);

        //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
    }

    /**
     * Wiktionary edition code ("en", "de", "fr", "it", "es", "pt") -> table of
     * language code -> name or heading pattern used by that edition. The "en"
     * entry is the very same map instance as {@link #isoCodeToEnWikiName}.
     */
    public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
    static {
        // Re-pointed at a fresh map for each edition registered below.
        Map<String,String> isoCodeToWikiName;

        wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName);

        // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
        isoCodeToWikiName = new LinkedHashMap<String, String>();
        wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
        isoCodeToWikiName.put("nds", "Niederdeutsch");
        isoCodeToWikiName.put("DE", "Deutsch");
        isoCodeToWikiName.put("EN", "Englisch");
        isoCodeToWikiName.put("IT", "Italienisch");
        isoCodeToWikiName.put("PL", "Polnisch");
        isoCodeToWikiName.put("FR", "Französisch");
        isoCodeToWikiName.put("EO", "Esperanto");
        isoCodeToWikiName.put("CA", "Katalanisch");
        isoCodeToWikiName.put("LA", "Latein");
        isoCodeToWikiName.put("CS", "Tschechisch");
        isoCodeToWikiName.put("HU", "Ungarisch");
        isoCodeToWikiName.put("SV", "Schwedisch");
        isoCodeToWikiName.put("ES", "Spanisch");
        isoCodeToWikiName.put("RO", "Rumänisch");

        // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
        isoCodeToWikiName = new LinkedHashMap<String, String>();
        wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
        isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
        isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
        isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}"));  // Arabic
        isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}"));  // Bulgarian
        isoCodeToWikiName.put("EL", Pattern.quote("{{langue|el}}"));
        isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}"));
        //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}"));
        isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}"));
        isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}"));
        isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}"));
        isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}"));  // Czech
        isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}"));  // Dutch
        //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}"));
        //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}"));
        isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}"));
        isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}"));
        isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}"));
        isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}"));
        isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}"));
        isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}"));  // Icelandic
        isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}"));
        isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}"));
        isoCodeToWikiName.put("PT", Pattern.quote("{{langue|pt}}"));
        isoCodeToWikiName.put("SV", Pattern.quote("{{langue|sv}}"));

        // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
        isoCodeToWikiName = new LinkedHashMap<String, String>();
        wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
        // Hand-written regex (not Pattern.quote) so several headers map to IT.
        isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
        isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
        isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}"));
        isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}"));
        isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
        isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}"));
        isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
        isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}"));
        isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}"));
        isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}"));
        isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
        // NOTE(review): the EL key is paired with the {{-grc-}} header here,
        // while the English table keys Ancient Greek as "grc" and the French
        // table pairs EL with "el" -- confirm this mapping is intentional.
        isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
        isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
        isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}"));

        // egrep -o '== *\{\{lengua\|[a-zA-Z]+\}\} *==' eswiktionary-pages-articles.xml | sort | uniq -c | sort -nr
        isoCodeToWikiName = new LinkedHashMap<String, String>();
        wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
        isoCodeToWikiName.put("AR", Pattern.quote("{{lengua|ar}}"));
        isoCodeToWikiName.put("ES", Pattern.quote("{{lengua|es}}"));
        isoCodeToWikiName.put("EN", Pattern.quote("{{lengua|en}}"));
        isoCodeToWikiName.put("FR", Pattern.quote("{{lengua|fr}}"));
        isoCodeToWikiName.put("IT", Pattern.quote("{{lengua|it}}"));

        // Pattern seems to match Italian one
        isoCodeToWikiName = new LinkedHashMap<String, String>();
        wikiCodeToIsoCodeToWikiName.put("pt", isoCodeToWikiName);
        isoCodeToWikiName.put("PT", Pattern.quote("{{-pt-}}"));
        isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
        isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
    }

    /**
     * Returns the plain English name for a language code.
     *
     * <p>The code is looked up in {@link #isoCodeToEnWikiName} exactly as
     * given and, failing that, in upper case. If the stored name contains a
     * {@code '|'} the text before the first {@code '|'} is returned; otherwise
     * if it contains a {@code '$'} the text before the first {@code '$'} is
     * returned; otherwise the stored name is returned unchanged.
     *
     * @param langCode language code such as {@code "DE"}, {@code "de"} or
     *        {@code "cmn"}; may be {@code null}
     * @return the English name, or {@code null} if {@code langCode} is
     *         {@code null} or not present in the table
     */
    public static String getEnglishName(String langCode) {
        if (langCode == null) {
            return null;
        }
        String name = isoCodeToEnWikiName.get(langCode);
        if (name == null) {
            // Locale.ROOT keeps the case mapping independent of the default
            // locale; under a Turkish locale "it" would otherwise upper-case
            // to "\u0130T" and miss the "IT" key.
            name = isoCodeToEnWikiName.get(langCode.toUpperCase(Locale.ROOT));
        }
        if (name == null) {
            return null;
        }
        // '|' takes precedence over '$', matching the original check order.
        int cut = name.indexOf('|');
        if (cut == -1) {
            cut = name.indexOf('$');
        }
        return cut == -1 ? name : name.substring(0, cut);
    }
}