2 ****************************************************************************************
3 * Copyright (C) 2009-2010, Google, Inc.; International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 ****************************************************************************************
7 package com.ibm.icu.util;
9 import java.util.HashMap;
10 import java.util.Iterator;
11 import java.util.LinkedHashMap;
12 import java.util.LinkedHashSet;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
17 import com.ibm.icu.impl.Row;
18 import com.ibm.icu.impl.Row.R2;
19 import com.ibm.icu.impl.Row.R3;
22 * Provides a way to match the languages (locales) supported by a product to the
23 * languages (locales) acceptable to a user, and get the best match. For
27 * LanguageMatcher matcher = new StandardLanguageMatcher("fr, en-GB, en");
30 * matcher.getBestMatch(LanguageCode.US).first == LanguageCode.ENGLISH
33 * It takes into account when languages are close to one another, such as fil
34 * and tl, and when language regional variants are close, like en-GB and en-AU.
35 * It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test
37 * <p>All classes implementing this interface should be immutable. Often a
38 * product will just need one static instance, built with the languages
39 * that it supports. However, it may want multiple instances with different
40 * default languages based on additional information, such as the domain.
42 * @author markdavis@google.com
45 public class LocaleMatcher {
46 private static final boolean DEBUG = false;
49 * Threshold for falling back to the default (first) language. May make this
50 * a parameter in the future.
52 private static final double DEFAULT_THRESHOLD = 0.5;
55 * The default language, in case the threshold is not met.
57 private final ULocale defaultLanguage;
/**
 * Creates a new language matcher using the built-in matching data
 * ({@code defaultWritten}). The highest-weighted language is the default:
 * if no other language matches more closely than a given threshold, that
 * default language is chosen. Typically the default is English, but it
 * could be different based on additional information, such as the domain.
 *
 * @param languagePriorityList weighted list
 */
public LocaleMatcher(LocalePriorityList languagePriorityList) {
    this(languagePriorityList, defaultWritten);
}
/**
 * Creates a new language matcher from a String form. The highest-weighted
 * language is the default.
 *
 * @param languagePriorityListString String form of LanguagePriorityList
 */
public LocaleMatcher(String languagePriorityListString) {
    this(LocalePriorityList.add(languagePriorityListString).build());
}
/**
 * Internal testing function; may expose API later.
 * <p>Registers every locale of the priority list (with its weight) and
 * remembers the first locale returned by the list's iterator as the default
 * language. If the list is empty, the default language is {@code null}.
 *
 * @param languagePriorityList LocalePriorityList to match
 * @param matcherData Internal matching data
 * @deprecated This API is ICU internal only.
 */
public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
    // matcherData must be set first: add() -> canonicalize()/addLikelySubtags() run during construction.
    this.matcherData = matcherData;
    for (final ULocale language : languagePriorityList) {
        add(language, languagePriorityList.getWeight(language));
    }
    Iterator<ULocale> it = languagePriorityList.iterator();
    defaultLanguage = it.hasNext() ? it.next() : null;
}
102 * Returns a fraction between 0 and 1, where 1 means that the languages are a
103 * perfect match, and 0 means that they are completely different. Note that
104 * the precise values may change over time; no code should be made dependent
105 * on the values remaining constant.
106 * @param desired Desired locale
107 * @param desiredMax Maximized locale (using likely subtags)
108 * @param supported Supported locale
109 * @param supportedMax Maximized locale (using likely subtags)
110 * @return value between 0 and 1, inclusive.
113 public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) {
114 return matcherData.match(desired, desiredMax, supported, supportedMax);
119 * Canonicalize a locale (language). Note that for now, it is canonicalizing
120 * according to CLDR conventions (he vs iw, etc), since that is what is needed
122 * @param ulocale language/locale code
123 * @return ULocale with remapped subtags.
126 public ULocale canonicalize(ULocale ulocale) {
127 // TODO Get the data from CLDR, use Java conventions.
128 String lang = ulocale.getLanguage();
129 String lang2 = canonicalMap.get(lang);
130 String script = ulocale.getScript();
131 String script2 = canonicalMap.get(script);
132 String region = ulocale.getCountry();
133 String region2 = canonicalMap.get(region);
134 if (lang2 != null || script2 != null || region2 != null) {
136 lang2 == null ? lang : lang2,
137 script2 == null ? script : script2,
138 region2 == null ? region : region2
145 * Get the best match for a LanguagePriorityList
147 * @param languageList list to match
148 * @return best matching language code
151 public ULocale getBestMatch(LocalePriorityList languageList) {
152 double bestWeight = 0;
153 ULocale bestTableMatch = null;
154 for (final ULocale language : languageList) {
155 final Row.R2<ULocale, Double> matchRow = getBestMatchInternal(language);
156 final double weight = matchRow.get1() * languageList.getWeight(language);
157 if (weight > bestWeight) {
159 bestTableMatch = matchRow.get0();
162 if (bestWeight < DEFAULT_THRESHOLD) {
163 bestTableMatch = defaultLanguage;
165 return bestTableMatch;
169 * Convenience method: Get the best match for a LanguagePriorityList
171 * @param languageList String form of language priority list
172 * @return best matching language code
175 public ULocale getBestMatch(String languageList) {
176 return getBestMatch(LocalePriorityList.add(languageList).build());
180 * Get the best match for an individual language code.
182 * @param ulocale locale/language code to match
183 * @return best matching language code
186 public ULocale getBestMatch(ULocale ulocale) {
187 return getBestMatchInternal(ulocale).get0();
195 public String toString() {
196 return "{" + defaultLanguage + ", "
197 + maximizedLanguageToWeight + "}";
199 // ================= Privates =====================
202 * Get the best match for an individual language code.
204 * @param languageCode
205 * @return best matching language code and weight (as per
206 * {@link #match(ULocale, ULocale)})
208 private Row.R2<ULocale, Double> getBestMatchInternal(ULocale languageCode) {
209 languageCode = canonicalize(languageCode);
210 final ULocale maximized = addLikelySubtags(languageCode);
212 System.out.println("\n" + languageCode + ";\t" + maximized);
214 double bestWeight = 0;
215 ULocale bestTableMatch = null;
216 for (final ULocale tableKey : maximizedLanguageToWeight.keySet()) {
217 R2<ULocale, Double> row = maximizedLanguageToWeight.get(tableKey);
218 final double match = match(languageCode, maximized, tableKey, row.get0());
220 System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match);
222 final double weight = match * row.get1();
223 if (weight > bestWeight) {
225 bestTableMatch = tableKey;
228 if (bestWeight < DEFAULT_THRESHOLD) {
229 bestTableMatch = defaultLanguage;
231 return Row.R2.of(bestTableMatch, bestWeight);
234 private void add(ULocale language, Double weight) {
235 language = canonicalize(language);
236 R2<ULocale, Double> row = Row.of(addLikelySubtags(language), weight);
237 maximizedLanguageToWeight.put(language, row);
240 Map<ULocale,Row.R2<ULocale, Double>> maximizedLanguageToWeight = new LinkedHashMap<ULocale, R2<ULocale, Double>>();
243 // =============== Special Mapping Information ==============
246 * We need to add another method to addLikelySubtags that doesn't return
247 * null, but instead substitutes Zzzz and ZZ if unknown. There are also
248 * a few cases where addLikelySubtags needs to have expanded data, to handle
249 * all deprecated codes, and to update to CLDR 1.6.
250 * @param languageCode
251 * @return "fixed" addLikelySubtags
253 // TODO(markdavis): update the above when CLDR 1.6 is final.
254 private ULocale addLikelySubtags(ULocale languageCode) {
255 final ULocale result = ULocale.addLikelySubtags(languageCode);
256 // should have method on getLikelySubtags for this
257 if (result == null || result.equals(languageCode)) {
258 final String language = languageCode.getLanguage();
259 final String script = languageCode.getScript();
260 final String region = languageCode.getCountry();
261 return new ULocale((language.length()==0 ? "und"
264 + (script.length()==0 ? "Zzzz" : script)
266 + (region.length()==0 ? "ZZ" : region));
271 private static class LocalePatternMatcher {
272 // a value of null means a wildcard; matches any.
274 private String script;
275 private String region;
277 static Pattern pattern = Pattern.compile(
278 "([a-zA-Z]{1,8}|\\*)" +
279 "(?:-([a-zA-Z]{4}|\\*))?" +
280 "(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?");
282 public LocalePatternMatcher(String toMatch) {
283 Matcher matcher = pattern.matcher(toMatch);
284 if (!matcher.matches()) {
285 throw new IllegalArgumentException("Bad pattern: " + toMatch);
287 lang = matcher.group(1);
288 script = matcher.group(2);
289 region = matcher.group(3);
290 level = region != null ? Level.region : script != null ? Level.script : Level.language;
292 if (lang.equals("*")) {
295 if (script != null && script.equals("*")) {
298 if (region != null && region.equals("*")) {
303 boolean matches(ULocale ulocale) {
304 if (lang != null && !lang.equals(ulocale.getLanguage())) {
307 if (script != null && !script.equals(ulocale.getScript())) {
310 if (region != null && !region.equals(ulocale.getCountry())) {
316 public Level getLevel() {
320 public String getLanguage() {
321 return (lang == null ? "*" : lang);
324 public String getScript() {
325 return (script == null ? "*" : script);
328 public String getRegion() {
329 return (region == null ? "*" : region);
332 public String toString() {
333 String result = getLanguage();
334 if (level != Level.language) {
335 result += "-" + getScript();
336 if (level != Level.script) {
337 result += "-" + getRegion();
344 enum Level {language, script, region}
346 private static class ScoreData implements Freezable<ScoreData> {
347 LinkedHashSet<Row.R3<LocalePatternMatcher,LocalePatternMatcher,Double>> scores = new LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>();
351 public ScoreData(Level level) {
353 this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0;
356 void addDataToScores(String desired, String supported, R3<LocalePatternMatcher,LocalePatternMatcher,Double> data) {
357 // Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desired);
358 // if (lang_result == null) {
359 // scores.put(desired, lang_result = new HashMap());
361 // Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supported);
362 // if (result == null) {
363 // lang_result.put(supported, result = new LinkedHashSet());
369 double getScore(ULocale desiredLocale, ULocale dMax, String desiredRaw, String desiredMax,
370 ULocale supportedLocale, ULocale sMax, String supportedRaw, String supportedMax) {
377 * rd = rd(dm,sm) // line 4
379 * rd *= 0.75 // lines 3,8
381 * rd *= 0.5 // lines 7
386 * rd = 0.25*StdRDiff // lines 2,5
389 boolean desiredChange = desiredRaw.equals(desiredMax);
390 boolean supportedChange = supportedRaw.equals(supportedMax);
392 if (!desiredMax.equals(supportedMax)) {
393 // Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desiredMax);
394 // if (lang_result == null) {
397 // Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supportedMax);
399 // if (result == null) {
402 distance = getRawScore(dMax, sMax);
404 if (desiredChange == supportedChange) {
406 } else if (desiredChange) {
409 } else if (desiredChange == supportedChange) { // maxes are equal, changes are equal
411 } else { // maxes are equal, changes are different
412 distance = 0.25*worst;
417 private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
419 System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale);
421 for (R3<LocalePatternMatcher,LocalePatternMatcher,Double> datum : scores) { // : result
422 if (datum.get0().matches(desiredLocale)
423 && datum.get1().matches(supportedLocale)) {
425 System.out.println("\t\t\tFOUND\t" + datum);
431 System.out.println("\t\t\tNOTFOUND\t" + worst);
436 public String toString() {
437 return level + ", " + scores;
440 @SuppressWarnings("unchecked")
441 public ScoreData cloneAsThawed() {
443 ScoreData result = (ScoreData) clone();
444 result.scores = (LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>) result.scores.clone();
445 result.frozen = false;
447 } catch (CloneNotSupportedException e) {
448 throw new IllegalArgumentException(e); // will never happen
453 private boolean frozen = false;
455 public ScoreData freeze() {
459 public boolean isFrozen() {
465 * Only for testing and use by tools. Interface may change!!
467 * @deprecated This API is ICU internal only.
469 public static class LanguageMatcherData implements Freezable<LanguageMatcherData> {
470 ScoreData languageScores = new ScoreData(Level.language);
471 ScoreData scriptScores = new ScoreData(Level.script);
472 ScoreData regionScores = new ScoreData(Level.region);
476 * @deprecated This API is ICU internal only.
478 public LanguageMatcherData() {
483 * @deprecated This API is ICU internal only.
485 public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
487 diff += languageScores.getScore(a, aMax, a.getLanguage(), aMax.getLanguage(), b, bMax, b.getLanguage(), bMax.getLanguage());
488 diff += scriptScores.getScore(a, aMax, a.getScript(), aMax.getScript(), b, bMax, b.getScript(), bMax.getScript());
489 diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry());
491 if (!a.getVariant().equals(b.getVariant())) {
496 } else if (diff > 1.0d) {
504 * Add an exceptional distance between languages, typically because regional
505 * dialects were given their own language codes. At this point the code is
506 * symmetric. We don't bother producing an equivalence class because there are
507 * so few cases; this function depends on the other permutations being
508 * added specifically.
510 * @deprecated This API is ICU internal only.
512 private LanguageMatcherData addDistance(String desired, String supported, int percent) {
513 return addDistance(desired, supported, percent, false, null);
517 * @deprecated This API is ICU internal only.
519 public LanguageMatcherData addDistance(String desired, String supported, int percent, String comment) {
520 return addDistance(desired, supported, percent, false, comment);
524 * @deprecated This API is ICU internal only.
526 public LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway) {
527 return addDistance(desired, supported, percent, oneway, null);
530 private LanguageMatcherData addDistance(String desired, String supported, int percent, boolean oneway, String comment) {
532 System.out.println("\t<languageMatch desired=\"" + desired + "\"" +
533 " supported=\"" + supported + "\"" +
534 " percent=\"" + percent + "\""
535 + (oneway ? " oneway=\"true\"" : "")
537 + (comment == null ? "" : "\t<!-- " + comment + " -->"));
538 // // .addDistance("nn", "nb", 4, true)
539 // System.out.println(".addDistance(\"" + desired + "\"" +
540 // ", \"" + supported + "\"" +
541 // ", " + percent + ""
542 // + (oneway ? "" : ", true")
543 // + (comment == null ? "" : ", \"" + comment + "\"")
548 double score = 1-percent/100.0; // convert from percentage
549 LocalePatternMatcher desiredMatcher = new LocalePatternMatcher(desired);
550 Level desiredLen = desiredMatcher.getLevel();
551 LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
552 Level supportedLen = supportedMatcher.getLevel();
553 if (desiredLen != supportedLen) {
554 throw new IllegalArgumentException();
556 R3<LocalePatternMatcher,LocalePatternMatcher,Double> data = Row.of(desiredMatcher, supportedMatcher, score);
557 R3<LocalePatternMatcher,LocalePatternMatcher,Double> data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score);
558 switch (desiredLen) {
560 String dlanguage = desiredMatcher.getLanguage();
561 String slanguage = supportedMatcher.getLanguage();
562 languageScores.addDataToScores(dlanguage, slanguage, data);
564 languageScores.addDataToScores(slanguage, dlanguage, data2);
568 String dscript = desiredMatcher.getScript();
569 String sscript = supportedMatcher.getScript();
570 scriptScores.addDataToScores(dscript, sscript, data);
572 scriptScores.addDataToScores(sscript, dscript, data2);
576 String dregion = desiredMatcher.getRegion();
577 String sregion = supportedMatcher.getRegion();
578 regionScores.addDataToScores(dregion, sregion, data);
580 regionScores.addDataToScores(sregion, dregion, data2);
590 * @deprecated This API is ICU internal only.
592 public LanguageMatcherData cloneAsThawed() {
593 LanguageMatcherData result;
595 result = (LanguageMatcherData) clone();
596 result.languageScores = languageScores.cloneAsThawed();
597 result.scriptScores = scriptScores.cloneAsThawed();
598 result.regionScores = regionScores.cloneAsThawed();
599 result.frozen = false;
601 } catch (CloneNotSupportedException e) {
602 throw new IllegalArgumentException(e); // will never happen
606 private boolean frozen = false;
611 * @deprecated This API is ICU internal only.
613 public LanguageMatcherData freeze() {
620 * @deprecated This API is ICU internal only.
622 public boolean isFrozen() {
627 LanguageMatcherData matcherData;
629 private static LanguageMatcherData defaultWritten = new LanguageMatcherData()
630 // TODO get data from CLDR
631 .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
632 .addDistance("nn", "nb", 96)
633 .addDistance("nn", "no", 96)
634 .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
635 .addDistance("da", "nb", 90)
636 .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
637 .addDistance("sh", "br", 96)
638 .addDistance("sr", "br", 96)
639 .addDistance("sh", "hr", 96)
640 .addDistance("sr", "hr", 96)
641 .addDistance("sh", "sr", 96)
642 .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
643 .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
644 .addDistance("*-Hant", "*-Hans", 75, true)
645 .addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.")
646 .addDistance("en-*-US", "en-*-*", 97)
647 .addDistance("en-*-CA", "en-*-*", 98)
648 .addDistance("en-*-*", "en-*-*", 99)
649 .addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.")
650 .addDistance("es-*-ES", "es-*-*", 93)
651 .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
652 .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
653 .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
656 private static HashMap<String,String> canonicalMap = new HashMap<String, String>();
659 // TODO get data from CLDR
660 canonicalMap.put("iw", "he");
661 canonicalMap.put("mo", "ro");
662 canonicalMap.put("tl", "fil");