]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java
c4935341d999e13b687b9c0878f4ff9bcdfaa743
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WiktionaryLangs.java
1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.parser.wiktionary;
16
17 import java.util.LinkedHashMap;
18 import java.util.Map;
19 import java.util.regex.Pattern;
20
21 public class WiktionaryLangs {
22
23     public static final Map<String,String> isoCodeToEnWikiName = new LinkedHashMap<String,String>();
24     static {
25         isoCodeToEnWikiName.put("AF", "Afrikaans");
26         isoCodeToEnWikiName.put("SQ", "Albanian");
27         isoCodeToEnWikiName.put("AR", "Arabic");
28         isoCodeToEnWikiName.put("HY", "Armenian");
29         isoCodeToEnWikiName.put("BE", "Belarusian");
30         isoCodeToEnWikiName.put("BN", "Bengali");
31         isoCodeToEnWikiName.put("BG", "Bulgarian");
32         isoCodeToEnWikiName.put("CA", "Catalan");
33         isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
34         isoCodeToEnWikiName.put("CS", "Czech");
35         isoCodeToEnWikiName.put("ZH", "Chinese");
36         isoCodeToEnWikiName.put("cmn", "Mandarin");
37         isoCodeToEnWikiName.put("yue", "Cantonese");
38         isoCodeToEnWikiName.put("DA", "Danish");
39         isoCodeToEnWikiName.put("NL", "Dutch");
40         isoCodeToEnWikiName.put("EN", "English");
41         isoCodeToEnWikiName.put("EO", "Esperanto");
42         isoCodeToEnWikiName.put("ET", "Estonian");
43         isoCodeToEnWikiName.put("FI", "Finnish");
44         isoCodeToEnWikiName.put("FR", "French");
45         // Note: must be before German since matcher
46         // simply takes first match instead of best.
47         isoCodeToEnWikiName.put("nds", "Low German");
48         isoCodeToEnWikiName.put("pdc", "Pennsylvania German");
49         isoCodeToEnWikiName.put("DE", "German");
50         isoCodeToEnWikiName.put("grc", "Ancient Greek");
51         isoCodeToEnWikiName.put("EL", "Greek");
52         isoCodeToEnWikiName.put("haw", "Hawaiian");
53         isoCodeToEnWikiName.put("HE", "Hebrew");
54         isoCodeToEnWikiName.put("HI", "Hindi");
55         isoCodeToEnWikiName.put("HU", "Hungarian");
56         isoCodeToEnWikiName.put("IS", "Icelandic");
57         isoCodeToEnWikiName.put("ID", "Indonesian");
58         isoCodeToEnWikiName.put("GA", "Irish");
59         isoCodeToEnWikiName.put("GD", "Gaelic");
60         isoCodeToEnWikiName.put("GV", "Manx");
61         isoCodeToEnWikiName.put("IT", "Italian");
62         isoCodeToEnWikiName.put("LA", "Latin");
63         isoCodeToEnWikiName.put("LV", "Latvian");
64         isoCodeToEnWikiName.put("LT", "Lithuanian");
65         isoCodeToEnWikiName.put("JA", "Japanese");
66         isoCodeToEnWikiName.put("KO", "Korean");
67         isoCodeToEnWikiName.put("KU", "Kurdish");
68         isoCodeToEnWikiName.put("LO", "Lao");
69         isoCodeToEnWikiName.put("ML", "Malayalam");
70         isoCodeToEnWikiName.put("MS", "Malay");
71         isoCodeToEnWikiName.put("MI", "Maori");
72         isoCodeToEnWikiName.put("MN", "Mongolian");
73         isoCodeToEnWikiName.put("NE", "Nepali");
74         isoCodeToEnWikiName.put("NO", "Norwegian");
75         isoCodeToEnWikiName.put("FA", "Persian");
76         isoCodeToEnWikiName.put("PL", "Polish");
77         isoCodeToEnWikiName.put("PT", "Portuguese");
78         isoCodeToEnWikiName.put("PA", "Punjabi");
79         isoCodeToEnWikiName.put("RO", "Romanian");
80         isoCodeToEnWikiName.put("RU", "Russian");
81         isoCodeToEnWikiName.put("SA", "Sanskrit");
82         isoCodeToEnWikiName.put("SK", "Slovak");
83         isoCodeToEnWikiName.put("SL", "Slovene|Slovenian");
84         isoCodeToEnWikiName.put("SO", "Somali");
85         isoCodeToEnWikiName.put("ES", "Spanish");
86         isoCodeToEnWikiName.put("SW", "Swahili");
87         isoCodeToEnWikiName.put("SV", "Swedish");
88         isoCodeToEnWikiName.put("TL", "Tagalog");
89         isoCodeToEnWikiName.put("TG", "Tajik");
90         isoCodeToEnWikiName.put("TA", "Tamil");
91         isoCodeToEnWikiName.put("TH", "Thai");
92         isoCodeToEnWikiName.put("BO", "Tibetan");
93         isoCodeToEnWikiName.put("TR", "Turkish");
94         isoCodeToEnWikiName.put("UK", "Ukrainian");
95         isoCodeToEnWikiName.put("UR", "Urdu");
96         isoCodeToEnWikiName.put("VI", "Vietnamese");
97         isoCodeToEnWikiName.put("CI", "Welsh");
98         isoCodeToEnWikiName.put("YI", "Yiddish");
99         isoCodeToEnWikiName.put("ZU", "Zulu");
100         isoCodeToEnWikiName.put("AZ", "Azeri");
101         isoCodeToEnWikiName.put("EU", "Basque");
102         isoCodeToEnWikiName.put("BR", "Breton");
103         isoCodeToEnWikiName.put("MR", "Marathi");
104         isoCodeToEnWikiName.put("FO", "Faroese");
105         isoCodeToEnWikiName.put("GL", "Galician");
106         isoCodeToEnWikiName.put("KA", "Georgian");
107         isoCodeToEnWikiName.put("HT", "Haitian Creole");
108         isoCodeToEnWikiName.put("LB", "Luxembourgish");
109         isoCodeToEnWikiName.put("MK", "Macedonian");
110         isoCodeToEnWikiName.put("GV", "Manx");
111         isoCodeToEnWikiName.put("scn", "Sicilian");
112         isoCodeToEnWikiName.put("cu", "Old Church Slavonic");
113         isoCodeToEnWikiName.put("rom", "Romani");
114
115         // No longer exists in EN:
116         // isoCodeToEnWikiName.put("BS", "Bosnian");
117         // isoCodeToEnWikiName.put("SR", "Serbian");
118         // isoCodeToEnWikiName.put("HR", "Croatian");
119
120         // Font doesn't work:
121         //isoCodeToEnWikiName.put("MY", "Burmese");
122
123
124         {
125             //Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
126             //missing.removeAll(Language.isoCodeToResources.keySet());
127             //System.out.println(missing);
128         }
129         //assert Language.isoCodeToResources.keySet().containsAll(isoCodeToEnWikiName.keySet());
130     }
131
132     public static final Map<String,Map<String,String>> wikiCodeToIsoCodeToWikiName = new LinkedHashMap<String, Map<String,String>>();
133     static {
134         Map<String,String> isoCodeToWikiName;
135
136         // en
137         wikiCodeToIsoCodeToWikiName.put("en", isoCodeToEnWikiName);
138
139         // egrep -o '\{\{Wortart[^}]+\}\}' dewiktionary-pages-articles.xml | cut -d \| -f3 | sort | uniq -c | sort -nr
140         isoCodeToWikiName = new LinkedHashMap<String, String>();
141         wikiCodeToIsoCodeToWikiName.put("de", isoCodeToWikiName);
142         isoCodeToWikiName.put("nds", "Niederdeutsch");
143         isoCodeToWikiName.put("DE", "Deutsch");
144         isoCodeToWikiName.put("EN", "Englisch");
145         isoCodeToWikiName.put("IT", "Italienisch");
146         isoCodeToWikiName.put("PL", "Polnisch");
147         isoCodeToWikiName.put("FR", "Französisch");
148         isoCodeToWikiName.put("EO", "Esperanto");
149         isoCodeToWikiName.put("CA", "Katalanisch");
150         isoCodeToWikiName.put("LA", "Latein");
151         isoCodeToWikiName.put("CS", "Tschechisch");
152         isoCodeToWikiName.put("HU", "Ungarisch");
153         isoCodeToWikiName.put("SV", "Schwedisch");
154         isoCodeToWikiName.put("ES", "Spanisch");
155         isoCodeToWikiName.put("RO", "Rumänisch");
156
157         // egrep -o '== *\{\{langue\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
158         isoCodeToWikiName = new LinkedHashMap<String, String>();
159         wikiCodeToIsoCodeToWikiName.put("fr", isoCodeToWikiName);
160         isoCodeToWikiName.put("FR", Pattern.quote("{{langue|fr}}"));
161         isoCodeToWikiName.put("RU", Pattern.quote("{{langue|ru}}"));
162         isoCodeToWikiName.put("AR", Pattern.quote("{{langue|ar}}"));  // Arabic
163         isoCodeToWikiName.put("BG", Pattern.quote("{{langue|bg}}"));  // Bulgarian
164         isoCodeToWikiName.put("EL", Pattern.quote("{{langue|el}}"));
165         isoCodeToWikiName.put("EN", Pattern.quote("{{langue|en}}"));
166         //isoCodeToWikiName.put("", Pattern.quote("{{langue|sl}}"));
167         isoCodeToWikiName.put("LA", Pattern.quote("{{langue|la}}"));
168         isoCodeToWikiName.put("IT", Pattern.quote("{{langue|it}}"));
169         isoCodeToWikiName.put("EO", Pattern.quote("{{langue|eo}}"));
170         isoCodeToWikiName.put("CS", Pattern.quote("{{langue|cs}}"));  // Czech
171         isoCodeToWikiName.put("NL", Pattern.quote("{{langue|nl}}"));  // Dutch
172         //isoCodeToWikiName.put("", Pattern.quote("{{langue|mg}}"));
173         //isoCodeToWikiName.put("", Pattern.quote("{{langue|hsb}}"));
174         isoCodeToWikiName.put("ZH", Pattern.quote("{{langue|zh}}"));
175         isoCodeToWikiName.put("cmn", Pattern.quote("{{langue|cmn}}"));
176         isoCodeToWikiName.put("yue", Pattern.quote("{{langue|yue}}"));
177         isoCodeToWikiName.put("JA", Pattern.quote("{{langue|ja}}"));
178         isoCodeToWikiName.put("DE", Pattern.quote("{{langue|de}}"));
179         isoCodeToWikiName.put("IS", Pattern.quote("{{langue|is}}"));  // Icelandic
180         isoCodeToWikiName.put("ES", Pattern.quote("{{langue|es}}"));
181         isoCodeToWikiName.put("UK", Pattern.quote("{{langue|uk}}"));
182         isoCodeToWikiName.put("PT", Pattern.quote("{{langue|pt}}"));
183
184         // egrep -o '= *\{\{-[a-z]+-\}\} *=' itwiktionary-pages-articles.xml | sort | uniq -c | sort -n
185         isoCodeToWikiName = new LinkedHashMap<String, String>();
186         wikiCodeToIsoCodeToWikiName.put("it", isoCodeToWikiName);
187         isoCodeToWikiName.put("IT", "\\{\\{-(it|scn|nap|cal|lmo)-\\}\\}");  // scn, nap, cal, lmo
188         isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
189         isoCodeToWikiName.put("FR", Pattern.quote("{{-fr-}}"));
190         isoCodeToWikiName.put("DE", Pattern.quote("{{-de-}}"));
191         isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
192         isoCodeToWikiName.put("JA", Pattern.quote("{{-ja-}}"));
193         isoCodeToWikiName.put("PL", Pattern.quote("{{-pl-}}"));
194         isoCodeToWikiName.put("NL", Pattern.quote("{{-nl-}}"));
195         isoCodeToWikiName.put("LV", Pattern.quote("{{-lv-}}"));
196         isoCodeToWikiName.put("LA", Pattern.quote("{{-la-}}"));
197         isoCodeToWikiName.put("HU", Pattern.quote("{{-hu-}}"));
198         isoCodeToWikiName.put("EL", Pattern.quote("{{-grc-}}"));
199         isoCodeToWikiName.put("SV", Pattern.quote("{{-sv-}}"));
200         isoCodeToWikiName.put("RU", Pattern.quote("{{-ru-}}"));
201
202         // egrep -o '== *\{\{lengua\|[a-zA-Z]+\}\} *==' frwiktionary-pages-articles.xml | sort | uniq -c | sort -nr
203         isoCodeToWikiName = new LinkedHashMap<String, String>();
204         wikiCodeToIsoCodeToWikiName.put("es", isoCodeToWikiName);
205         isoCodeToWikiName.put("AR", Pattern.quote("{{lengua|ar}}"));
206         isoCodeToWikiName.put("ES", Pattern.quote("{{lengua|es}}"));
207         isoCodeToWikiName.put("EN", Pattern.quote("{{lengua|en}}"));
208         isoCodeToWikiName.put("FR", Pattern.quote("{{lengua|fr}}"));
209         isoCodeToWikiName.put("IT", Pattern.quote("{{lengua|it}}"));
210
211         // Pattern seems to match Italian one
212         isoCodeToWikiName = new LinkedHashMap<String, String>();
213         wikiCodeToIsoCodeToWikiName.put("pt", isoCodeToWikiName);
214         isoCodeToWikiName.put("PT", Pattern.quote("{{-pt-}}"));
215         isoCodeToWikiName.put("EN", Pattern.quote("{{-en-}}"));
216         isoCodeToWikiName.put("ES", Pattern.quote("{{-es-}}"));
217     }
218     public static String getEnglishName(String langCode) {
219         String name = isoCodeToEnWikiName.get(langCode);
220         if (name == null) {
221             name = isoCodeToEnWikiName.get(langCode.toUpperCase());
222         }
223         if (name == null) {
224             return null;
225         }
226         if (name.indexOf('|') != -1) {
227             return name.substring(0, name.indexOf('|'));
228         }
229         if (name.indexOf('$') != -1) {
230             return name.substring(0, name.indexOf('$'));
231         }
232         return name;  // can be null.
233     }
234
235 }