2 ****************************************************************************************
\r
3 * Copyright (C) 2009-2010, Google, Inc.; International Business Machines Corporation *
\r
4 * and others. All Rights Reserved. *
\r
5 ****************************************************************************************
\r
7 package com.ibm.icu.util;
\r
9 import java.util.HashMap;
\r
10 import java.util.Iterator;
\r
11 import java.util.LinkedHashMap;
\r
12 import java.util.LinkedHashSet;
\r
13 import java.util.Map;
\r
14 import java.util.regex.Matcher;
\r
15 import java.util.regex.Pattern;
\r
17 import com.ibm.icu.impl.Row;
\r
18 import com.ibm.icu.impl.Row.R2;
\r
19 import com.ibm.icu.impl.Row.R3;
\r
22 * Provides a way to match the languages (locales) supported by a product to the
\r
23 * languages (locales) acceptable to a user, and get the best match. For
\r
27 * LocaleMatcher matcher = new LocaleMatcher("fr, en-GB, en");
\r
30 * matcher.getBestMatch(ULocale.US).equals(ULocale.ENGLISH)
\r
33 * It takes into account when languages are close to one another, such as fil
\r
34 * and tl, and when language regional variants are close, like en-GB and en-AU.
\r
35 * It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test
\r
37 * <p>All classes implementing this interface should be immutable. Often a
\r
38 * product will just need one static instance, built with the languages
\r
39 * that it supports. However, it may want multiple instances with different
\r
40 * default languages based on additional information, such as the domain.
\r
42 * @author markdavis@google.com
\r
44 * @provisional This API might change or be removed in a future release.
\r
46 public class LocaleMatcher {
\r
47 private static final boolean DEBUG = false;
\r
50 * Threshold for falling back to the default (first) language. May make this
\r
51 * a parameter in the future.
\r
53 private static final double DEFAULT_THRESHOLD = 0.5;
\r
56 * The default language, in case the threshold is not met.
\r
58 private final ULocale defaultLanguage;
\r
/**
 * Builds a matcher over the given weighted list of supported languages,
 * using the built-in default matching data. The highest-weighted language
 * serves as the default: whenever no supported language matches a request
 * more closely than the internal threshold, that default language is
 * returned. Typically the default is English, but it could be different
 * based on additional information, such as the domain.
 *
 * @param languagePriorityList weighted list
 * @provisional This API might change or be removed in a future release.
 */
public LocaleMatcher(LocalePriorityList languagePriorityList) {
    this(languagePriorityList, defaultWritten);
}
\r
/**
 * Builds a matcher from the String form of a language priority list. The
 * string is parsed into a {@code LocalePriorityList} and handed to the
 * list-based constructor; the highest-weighted language is the default.
 *
 * @param languagePriorityListString String form of LanguagePriorityList
 * @provisional This API might change or be removed in a future release.
 */
public LocaleMatcher(String languagePriorityListString) {
    this(LocalePriorityList.add(languagePriorityListString).build());
}
\r
88 * Internal testing function; may expose API later.
\r
89 * @param languagePriorityList LocalePriorityList to match
\r
90 * @param matcherData Internal matching data
\r
92 * @deprecated This API is ICU internal only.
\r
94 public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
\r
95 this.matcherData = matcherData;
\r
96 for (final ULocale language : languagePriorityList) {
\r
97 add(language, languagePriorityList.getWeight(language));
\r
99 Iterator<ULocale> it = languagePriorityList.iterator();
\r
100 defaultLanguage = it.hasNext() ? it.next() : null;
\r
105 * Returns a fraction between 0 and 1, where 1 means that the languages are a
\r
106 * perfect match, and 0 means that they are completely different. Note that
\r
107 * the precise values may change over time; no code should be made dependent
\r
108 * on the values remaining constant.
\r
109 * @param desired Desired locale
\r
110 * @param desiredMax Maximized locale (using likely subtags)
\r
111 * @param supported Supported locale
\r
112 * @param supportedMax Maximized locale (using likely subtags)
\r
113 * @return value between 0 and 1, inclusive.
\r
115 * @provisional This API might change or be removed in a future release.
\r
117 public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) {
\r
118 return matcherData.match(desired, desiredMax, supported, supportedMax);
\r
123 * Canonicalize a locale (language). Note that for now, it is canonicalizing
\r
124 * according to CLDR conventions (he vs iw, etc), since that is what is needed
\r
125 * for likelySubtags.
\r
126 * @param ulocale language/locale code
\r
127 * @return ULocale with remapped subtags.
\r
129 * @provisional This API might change or be removed in a future release.
\r
131 public ULocale canonicalize(ULocale ulocale) {
\r
132 // TODO Get the data from CLDR, use Java conventions.
\r
133 String lang = ulocale.getLanguage();
\r
134 String lang2 = canonicalMap.get(lang);
\r
135 String script = ulocale.getScript();
\r
136 String script2 = canonicalMap.get(script);
\r
137 String region = ulocale.getCountry();
\r
138 String region2 = canonicalMap.get(region);
\r
139 if (lang2 != null || script2 != null || region2 != null) {
\r
140 return new ULocale(
\r
141 lang2 == null ? lang : lang2,
\r
142 script2 == null ? script : script2,
\r
143 region2 == null ? region : region2
\r
150 * Get the best match for a LanguagePriorityList
\r
152 * @param languageList list to match
\r
153 * @return best matching language code
\r
155 * @provisional This API might change or be removed in a future release.
\r
157 public ULocale getBestMatch(LocalePriorityList languageList) {
\r
158 double bestWeight = 0;
\r
159 ULocale bestTableMatch = null;
\r
160 for (final ULocale language : languageList) {
\r
161 final Row.R2<ULocale, Double> matchRow = getBestMatchInternal(language);
\r
162 final double weight = matchRow.get1() * languageList.getWeight(language);
\r
163 if (weight > bestWeight) {
\r
164 bestWeight = weight;
\r
165 bestTableMatch = matchRow.get0();
\r
168 if (bestWeight < DEFAULT_THRESHOLD) {
\r
169 bestTableMatch = defaultLanguage;
\r
171 return bestTableMatch;
\r
175 * Convenience method: Get the best match for a LanguagePriorityList
\r
177 * @param languageList String form of language priority list
\r
178 * @return best matching language code
\r
180 * @provisional This API might change or be removed in a future release.
\r
182 public ULocale getBestMatch(String languageList) {
\r
183 return getBestMatch(LocalePriorityList.add(languageList).build());
\r
187 * Get the best match for an individual language code.
\r
189 * @param ulocale locale/language code to match
\r
190 * @return best matching language code
\r
192 * @provisional This API might change or be removed in a future release.
\r
194 public ULocale getBestMatch(ULocale ulocale) {
\r
195 return getBestMatchInternal(ulocale).get0();
\r
201 * @provisional This API might change or be removed in a future release.
\r
204 public String toString() {
\r
205 return "{" + defaultLanguage + ", "
\r
206 + maximizedLanguageToWeight + "}";
\r
208 // ================= Privates =====================
\r
211 * Get the best match for an individual language code.
\r
213 * @param languageCode
\r
214 * @return best matching language code and weight (as per
\r
215 * {@link #match(ULocale, ULocale)})
\r
217 private Row.R2<ULocale, Double> getBestMatchInternal(ULocale languageCode) {
\r
218 languageCode = canonicalize(languageCode);
\r
219 final ULocale maximized = addLikelySubtags(languageCode);
\r
221 System.out.println("\n" + languageCode + ";\t" + maximized);
\r
223 double bestWeight = 0;
\r
224 ULocale bestTableMatch = null;
\r
225 for (final ULocale tableKey : maximizedLanguageToWeight.keySet()) {
\r
226 R2<ULocale, Double> row = maximizedLanguageToWeight.get(tableKey);
\r
227 final double match = match(languageCode, maximized, tableKey, row.get0());
\r
229 System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match);
\r
231 final double weight = match * row.get1();
\r
232 if (weight > bestWeight) {
\r
233 bestWeight = weight;
\r
234 bestTableMatch = tableKey;
\r
237 if (bestWeight < DEFAULT_THRESHOLD) {
\r
238 bestTableMatch = defaultLanguage;
\r
240 return Row.R2.of(bestTableMatch, bestWeight);
\r
243 private void add(ULocale language, Double weight) {
\r
244 language = canonicalize(language);
\r
245 R2<ULocale, Double> row = Row.of(addLikelySubtags(language), weight);
\r
246 maximizedLanguageToWeight.put(language, row);
\r
249 Map<ULocale,Row.R2<ULocale, Double>> maximizedLanguageToWeight = new LinkedHashMap<ULocale, R2<ULocale, Double>>();
\r
252 // =============== Special Mapping Information ==============
\r
255 * We need to add another method to addLikelySubtags that doesn't return
\r
256 * null, but instead substitutes Zzzz and ZZ if unknown. There are also
\r
257 * a few cases where addLikelySubtags needs to have expanded data, to handle
\r
258 * all deprecated codes, and to update to CLDR 1.6.
\r
259 * @param languageCode
\r
260 * @return "fixed" addLikelySubtags
\r
262 // TODO(markdavis): update the above when CLDR 1.6 is final.
\r
263 private ULocale addLikelySubtags(ULocale languageCode) {
\r
264 final ULocale result = ULocale.addLikelySubtags(languageCode);
\r
265 // should have method on getLikelySubtags for this
\r
266 if (result == null || result.equals(languageCode)) {
\r
267 final String language = languageCode.getLanguage();
\r
268 final String script = languageCode.getScript();
\r
269 final String region = languageCode.getCountry();
\r
270 return new ULocale((language.length()==0 ? "und"
\r
273 + (script.length()==0 ? "Zzzz" : script)
\r
275 + (region.length()==0 ? "ZZ" : region));
\r
280 private static class LocalePatternMatcher {
\r
281 // a value of null means a wildcard; matches any.
\r
282 private String lang;
\r
283 private String script;
\r
284 private String region;
\r
285 private Level level;
\r
286 static Pattern pattern = Pattern.compile(
\r
287 "([a-zA-Z]{1,8}|\\*)" +
\r
288 "(?:-([a-zA-Z]{4}|\\*))?" +
\r
289 "(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?");
\r
291 public LocalePatternMatcher(String toMatch) {
\r
292 Matcher matcher = pattern.matcher(toMatch);
\r
293 if (!matcher.matches()) {
\r
294 throw new IllegalArgumentException("Bad pattern: " + toMatch);
\r
296 lang = matcher.group(1);
\r
297 script = matcher.group(2);
\r
298 region = matcher.group(3);
\r
299 level = region != null ? Level.region : script != null ? Level.script : Level.language;
\r
301 if (lang.equals("*")) {
\r
304 if (script != null && script.equals("*")) {
\r
307 if (region != null && region.equals("*")) {
\r
312 boolean matches(ULocale ulocale) {
\r
313 if (lang != null && !lang.equals(ulocale.getLanguage())) {
\r
316 if (script != null && !script.equals(ulocale.getScript())) {
\r
319 if (region != null && !region.equals(ulocale.getCountry())) {
\r
325 public Level getLevel() {
\r
329 public String getLanguage() {
\r
330 return (lang == null ? "*" : lang);
\r
333 public String getScript() {
\r
334 return (script == null ? "*" : script);
\r
337 public String getRegion() {
\r
338 return (region == null ? "*" : region);
\r
341 public String toString() {
\r
342 String result = getLanguage();
\r
343 if (level != Level.language) {
\r
344 result += "-" + getScript();
\r
345 if (level != Level.script) {
\r
346 result += "-" + getRegion();
\r
/** Subtag position a locale pattern or score table applies to. */
enum Level {
    /** Language subtag only. */
    language,
    /** Language plus script subtags. */
    script,
    /** Language, script and region subtags. */
    region
}
\r
355 private static class ScoreData implements Freezable<ScoreData> {
\r
356 LinkedHashSet<Row.R3<LocalePatternMatcher,LocalePatternMatcher,Double>> scores = new LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>();
\r
357 final double worst;
\r
360 public ScoreData(Level level) {
\r
361 this.level = level;
\r
362 this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0;
\r
365 void addDataToScores(String desired, String supported, R3<LocalePatternMatcher,LocalePatternMatcher,Double> data) {
\r
366 // Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desired);
\r
367 // if (lang_result == null) {
\r
368 // scores.put(desired, lang_result = new HashMap());
\r
370 // Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supported);
\r
371 // if (result == null) {
\r
372 // lang_result.put(supported, result = new LinkedHashSet());
\r
374 // result.add(data);
\r
378 double getScore(ULocale desiredLocale, ULocale dMax, String desiredRaw, String desiredMax,
\r
379 ULocale supportedLocale, ULocale sMax, String supportedRaw, String supportedMax) {
\r
386 * rd = rd(dm,sm) // line 4
\r
388 * rd *= 0.75 // lines 3,8
\r
390 * rd *= 0.5 // lines 7
\r
395 * rd = 0.25*StdRDiff // lines 2,5
\r
398 boolean desiredChange = desiredRaw.equals(desiredMax);
\r
399 boolean supportedChange = supportedRaw.equals(supportedMax);
\r
401 if (!desiredMax.equals(supportedMax)) {
\r
402 // Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desiredMax);
\r
403 // if (lang_result == null) {
\r
404 // distance = worst;
\r
406 // Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supportedMax);
\r
408 // if (result == null) {
\r
409 // distance = worst;
\r
411 distance = getRawScore(dMax, sMax);
\r
413 if (desiredChange == supportedChange) {
\r
415 } else if (desiredChange) {
\r
418 } else if (desiredChange == supportedChange) { // maxes are equal, changes are equal
\r
420 } else { // maxes are equal, changes are different
\r
421 distance = 0.25*worst;
\r
426 private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
\r
428 System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale);
\r
430 for (R3<LocalePatternMatcher,LocalePatternMatcher,Double> datum : scores) { // : result
\r
431 if (datum.get0().matches(desiredLocale)
\r
432 && datum.get1().matches(supportedLocale)) {
\r
434 System.out.println("\t\t\tFOUND\t" + datum);
\r
436 return datum.get2();
\r
440 System.out.println("\t\t\tNOTFOUND\t" + worst);
\r
445 public String toString() {
\r
446 return level + ", " + scores;
\r
449 @SuppressWarnings("unchecked")
\r
450 public ScoreData cloneAsThawed() {
\r
452 ScoreData result = (ScoreData) clone();
\r
453 result.scores = (LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>) result.scores.clone();
\r
454 result.frozen = false;
\r
456 } catch (CloneNotSupportedException e) {
\r
457 throw new IllegalArgumentException(e); // will never happen
\r
462 private boolean frozen = false;
\r
464 public ScoreData freeze() {
\r
468 public boolean isFrozen() {
\r
474 * Only for testing and use by tools. Interface may change!!
\r
476 * @deprecated This API is ICU internal only.
\r
478 public static class LanguageMatcherData implements Freezable<LanguageMatcherData> {
\r
479 ScoreData languageScores = new ScoreData(Level.language);
\r
480 ScoreData scriptScores = new ScoreData(Level.script);
\r
481 ScoreData regionScores = new ScoreData(Level.region);
\r
485 * @deprecated This API is ICU internal only.
\r
487 public LanguageMatcherData() {
\r
492 * @deprecated This API is ICU internal only.
\r
494 public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
\r
496 diff += languageScores.getScore(a, aMax, a.getLanguage(), aMax.getLanguage(), b, bMax, b.getLanguage(), bMax.getLanguage());
\r
497 diff += scriptScores.getScore(a, aMax, a.getScript(), aMax.getScript(), b, bMax, b.getScript(), bMax.getScript());
\r
498 diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry());
\r
500 if (!a.getVariant().equals(b.getVariant())) {
\r
505 } else if (diff > 1.0d) {
\r
513 * Add an exceptional distance between languages, typically because regional
\r
514 * dialects were given their own language codes. At this point the code is
\r
515 * symmetric. We don't bother producing an equivalence class because there are
\r
516 * so few cases; this function depends on the other permutations being
\r
517 * added specifically.
\r
519 * @deprecated This API is ICU internal only.
\r
521 private LanguageMatcherData addDistance(String desired, String supported, int percent) {
\r
522 return addDistance(desired, supported, percent, false, null);
\r
526 * @deprecated This API is ICU internal only.
\r
528 public LanguageMatcherData addDistance(String desired, String supported, int percent, String comment) {
\r
529 return addDistance(desired, supported, percent, false, comment);
\r
533 * @deprecated This API is ICU internal only.
\r
535 public LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway) {
\r
536 return addDistance(desired, supported, percent, oneway, null);
\r
539 private LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway, String comment) {
\r
541 System.out.println("\t<languageMatch desired=\"" + desired + "\"" +
\r
542 " supported=\"" + supported + "\"" +
\r
543 " percent=\"" + percent + "\""
\r
544 + (oneway ? " oneway=\"true\"" : "")
\r
546 + (comment == null ? "" : "\t<!-- " + comment + " -->"));
\r
547 // // .addDistance("nn", "nb", 4, true)
\r
548 // System.out.println(".addDistance(\"" + desired + "\"" +
\r
549 // ", \"" + supported + "\"" +
\r
550 // ", " + percent + ""
\r
551 // + (oneway ? "" : ", true")
\r
552 // + (comment == null ? "" : ", \"" + comment + "\"")
\r
557 double score = 1-percent/100.0; // convert from percentage
\r
558 LocalePatternMatcher desiredMatcher = new LocalePatternMatcher(desired);
\r
559 Level desiredLen = desiredMatcher.getLevel();
\r
560 LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
\r
561 Level supportedLen = supportedMatcher.getLevel();
\r
562 if (desiredLen != supportedLen) {
\r
563 throw new IllegalArgumentException();
\r
565 R3<LocalePatternMatcher,LocalePatternMatcher,Double> data = Row.of(desiredMatcher, supportedMatcher, score);
\r
566 R3<LocalePatternMatcher,LocalePatternMatcher,Double> data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score);
\r
567 switch (desiredLen) {
\r
569 String dlanguage = desiredMatcher.getLanguage();
\r
570 String slanguage = supportedMatcher.getLanguage();
\r
571 languageScores.addDataToScores(dlanguage, slanguage, data);
\r
573 languageScores.addDataToScores(slanguage, dlanguage, data2);
\r
577 String dscript = desiredMatcher.getScript();
\r
578 String sscript = supportedMatcher.getScript();
\r
579 scriptScores.addDataToScores(dscript, sscript, data);
\r
581 scriptScores.addDataToScores(sscript, dscript, data2);
\r
585 String dregion = desiredMatcher.getRegion();
\r
586 String sregion = supportedMatcher.getRegion();
\r
587 regionScores.addDataToScores(dregion, sregion, data);
\r
589 regionScores.addDataToScores(sregion, dregion, data2);
\r
599 * @deprecated This API is ICU internal only.
\r
601 public LanguageMatcherData cloneAsThawed() {
\r
602 LanguageMatcherData result;
\r
604 result = (LanguageMatcherData) clone();
\r
605 result.languageScores = languageScores.cloneAsThawed();
\r
606 result.scriptScores = scriptScores.cloneAsThawed();
\r
607 result.regionScores = regionScores.cloneAsThawed();
\r
608 result.frozen = false;
\r
610 } catch (CloneNotSupportedException e) {
\r
611 throw new IllegalArgumentException(e); // will never happen
\r
615 private boolean frozen = false;
\r
620 * @deprecated This API is ICU internal only.
\r
622 public LanguageMatcherData freeze() {
\r
629 * @deprecated This API is ICU internal only.
\r
631 public boolean isFrozen() {
\r
636 LanguageMatcherData matcherData;
\r
638 private static LanguageMatcherData defaultWritten = new LanguageMatcherData()
\r
639 // TODO get data from CLDR
\r
640 .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
\r
641 .addDistance("nn", "nb", 96)
\r
642 .addDistance("nn", "no", 96)
\r
643 .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
\r
644 .addDistance("da", "nb", 90)
\r
645 .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
\r
646 .addDistance("sh", "br", 96)
\r
647 .addDistance("sr", "br", 96)
\r
648 .addDistance("sh", "hr", 96)
\r
649 .addDistance("sr", "hr", 96)
\r
650 .addDistance("sh", "sr", 96)
\r
651 .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
\r
652 .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
\r
653 .addDistance("*-Hant", "*-Hans", 75, true)
\r
654 .addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.")
\r
655 .addDistance("en-*-US", "en-*-*", 97)
\r
656 .addDistance("en-*-CA", "en-*-*", 98)
\r
657 .addDistance("en-*-*", "en-*-*", 99)
\r
658 .addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.")
\r
659 .addDistance("es-*-ES", "es-*-*", 93)
\r
660 .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
\r
661 .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
\r
662 .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
\r
665 private static HashMap<String,String> canonicalMap = new HashMap<String, String>();
\r
668 // TODO get data from CLDR
\r
669 canonicalMap.put("iw", "he");
\r
670 canonicalMap.put("mo", "ro");
\r
671 canonicalMap.put("tl", "fil");
\r