jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/impl/LocaleIDParser.java

   1 /*
   2 ******************************************************************************
   3 * Copyright (C) 2003-2011, International Business Machines Corporation and   *
   4 * others. All Rights Reserved.                                               *
   5 ******************************************************************************
   6 */
   7
   8 package com.ibm.icu.impl;
   9
  10 import java.util.Collections;
  11 import java.util.Comparator;
  12 import java.util.Iterator;
  13 import java.util.Map;
  14 import java.util.TreeMap;
  15
  16 import com.ibm.icu.impl.locale.AsciiUtil;
  17
  18 /**
  19  * Utility class to parse and normalize locale ids (including POSIX style)
  20  */
  21 public final class LocaleIDParser {
  22
  23     /**
  24      * Char array representing the locale ID.
  25      */
  26     private char[] id;
  27
  28     /**
  29      * Current position in {@link #id} (while parsing).
  30      */
  31     private int index;
  32
  33     /**
  34      * Temporary buffer for parsed sections of data.
  35      */
  36     private StringBuilder buffer;
  37
  38     // um, don't handle POSIX ids unless we request it.  why not?  well... because.
  39     private boolean canonicalize;
  40     private boolean hadCountry;
  41
  42     // used when canonicalizing
  43     Map<String, String> keywords;
  44     String baseName;
  45
  46     /**
  47      * Parsing constants.
  48      */
  49     private static final char KEYWORD_SEPARATOR     = '@';
  50     private static final char HYPHEN                = '-';
  51     private static final char KEYWORD_ASSIGN        = '=';
  52     private static final char COMMA                 = ',';
  53     private static final char ITEM_SEPARATOR        = ';';
  54     private static final char DOT                   = '.';
  55     private static final char UNDERSCORE            = '_';
  56
  57     public LocaleIDParser(String localeID) {
  58         this(localeID, false);
  59     }
  60
  61     public LocaleIDParser(String localeID, boolean canonicalize) {
  62         id = localeID.toCharArray();
  63         index = 0;
  64         buffer = new StringBuilder(id.length + 5);
  65         this.canonicalize = canonicalize;
  66     }
  67
  68     private void reset() {
  69         index = 0;
  70         buffer = new StringBuilder(id.length + 5);
  71     }
  72
  73     // utilities for working on text in the buffer
  74
  75     /**
  76      * Append c to the buffer.
  77      */
  78     private void append(char c) {
  79         buffer.append(c);
  80     }
  81
  82     private void addSeparator() {
  83         append(UNDERSCORE);
  84     }
  85
  86     /**
  87      * Returns the text in the buffer from start to blen as a String.
  88      */
  89     private String getString(int start) {
  90         return buffer.substring(start);
  91     }
  92
  93     /**
  94      * Set the length of the buffer to pos, then append the string.
  95      */
  96     private void set(int pos, String s) {
  97         buffer.delete(pos, buffer.length());
  98         buffer.insert(pos, s);
  99     }
 100
 101     /**
 102      * Append the string to the buffer.
 103      */
 104     private void append(String s) {
 105         buffer.append(s);
 106     }
 107
 108     // utilities for parsing text out of the id
 109
 110     /**
 111      * Character to indicate no more text is available in the id.
 112      */
 113     private static final char DONE = '\uffff';
 114
 115     /**
 116      * Returns the character at index in the id, and advance index.  The returned character
 117      * is DONE if index was at the limit of the buffer.  The index is advanced regardless
 118      * so that decrementing the index will always 'unget' the last character returned.
 119      */
 120     private char next() {
 121         if (index == id.length) {
 122             index++;
 123             return DONE;
 124         }
 125
 126         return id[index++];
 127     }
 128
 129     /**
 130      * Advance index until the next terminator or id separator, and leave it there.
 131      */
 132     private void skipUntilTerminatorOrIDSeparator() {
 133         while (!isTerminatorOrIDSeparator(next()));
 134         --index;
 135     }
 136
 137     /**
 138      * Returns true if the character at index in the id is a terminator.
 139      */
 140     private boolean atTerminator() {
 141         return index >= id.length || isTerminator(id[index]);
 142     }
 143
 144     /**
 145      * Returns true if the character is a terminator (keyword separator, dot, or DONE).
 146      * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
 147      */
 148     private boolean isTerminator(char c) {
 149         // always terminate at DOT, even if not handling POSIX.  It's an error...
 150         return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
 151     }
 152
 153     /**
 154      * Returns true if the character is a terminator or id separator.
 155      */
 156     private boolean isTerminatorOrIDSeparator(char c) {
 157         return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
 158     }
 159
 160     /**
 161      * Returns true if the start of the buffer has an experimental or private language
 162      * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
 163      */
 164     private boolean haveExperimentalLanguagePrefix() {
 165         if (id.length > 2) {
 166             char c = id[1];
 167             if (c == HYPHEN || c == UNDERSCORE) {
 168                 c = id[0];
 169                 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
 170             }
 171         }
 172         return false;
 173     }
 174
 175     /**
 176      * Returns true if a value separator occurs at or after index.
 177      */
 178     private boolean haveKeywordAssign() {
 179         // assume it is safe to start from index
 180         for (int i = index; i < id.length; ++i) {
 181             if (id[i] == KEYWORD_ASSIGN) {
 182                 return true;
 183             }
 184         }
 185         return false;
 186     }
 187
 188     /**
 189      * Advance index past language, and accumulate normalized language code in buffer.
 190      * Index must be at 0 when this is called.  Index is left at a terminator or id
 191      * separator.  Returns the start of the language code in the buffer.
 192      */
 193     private int parseLanguage() {
 194         int startLength = buffer.length();
 195
 196         if (haveExperimentalLanguagePrefix()) {
 197             append(AsciiUtil.toLower(id[0]));
 198             append(HYPHEN);
 199             index = 2;
 200         }
 201
 202         char c;
 203         while(!isTerminatorOrIDSeparator(c = next())) {
 204             append(AsciiUtil.toLower(c));
 205         }
 206         --index; // unget
 207
 208         if (buffer.length() - startLength == 3) {
 209             String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
 210             if (lang != null) {
 211                 set(0, lang);
 212             }
 213         }
 214
 215         return 0;
 216     }
 217
 218     /**
 219      * Advance index past language.  Index must be at 0 when this is called.  Index
 220      * is left at a terminator or id separator.
 221      */
 222     private void skipLanguage() {
 223         if (haveExperimentalLanguagePrefix()) {
 224             index = 2;
 225         }
 226         skipUntilTerminatorOrIDSeparator();
 227     }
 228
 229     /**
 230      * Advance index past script, and accumulate normalized script in buffer.
 231      * Index must be immediately after the language.
 232      * If the item at this position is not a script (is not four characters
 233      * long) leave index and buffer unchanged.  Otherwise index is left at
 234      * a terminator or id separator.  Returns the start of the script code
 235      * in the buffer (this may be equal to the buffer length, if there is no
 236      * script).
 237      */
 238     private int parseScript() {
 239         if (!atTerminator()) {
 240             int oldIndex = index; // save original index
 241             ++index;
 242
 243             int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
 244             char c;
 245             boolean firstPass = true;
 246             while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
 247                 if (firstPass) {
 248                     addSeparator();
 249                     append(AsciiUtil.toUpper(c));
 250                     firstPass = false;
 251                 } else {
 252                     append(AsciiUtil.toLower(c));
 253                 }
 254             }
 255             --index; // unget
 256
 257             /* If it's not exactly 4 characters long, then it's not a script. */
 258             if (index - oldIndex != 5) { // +1 to account for separator
 259                 index = oldIndex;
 260                 buffer.delete(oldBlen, buffer.length());
 261             } else {
 262                 oldBlen++; // index past hyphen, for clients who want to extract just the script
 263             }
 264
 265             return oldBlen;
 266         }
 267         return buffer.length();
 268     }
 269
 270     /**
 271      * Advance index past script.
 272      * Index must be immediately after the language and IDSeparator.
 273      * If the item at this position is not a script (is not four characters
 274      * long) leave index.  Otherwise index is left at a terminator or
 275      * id separator.
 276      */
 277     private void skipScript() {
 278         if (!atTerminator()) {
 279             int oldIndex = index;
 280             ++index;
 281
 282             char c;
 283             while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
 284             --index;
 285
 286             if (index - oldIndex != 5) { // +1 to account for separator
 287                 index = oldIndex;
 288             }
 289         }
 290     }
 291
 292     /**
 293      * Advance index past country, and accumulate normalized country in buffer.
 294      * Index must be immediately after the script (if there is one, else language)
 295      * and IDSeparator.  Return the start of the country code in the buffer.
 296      */
 297     private int parseCountry() {
 298         if (!atTerminator()) {
 299             int oldIndex = index;
 300             ++index;
 301
 302             int oldBlen = buffer.length();
 303             char c;
 304             boolean firstPass = true;
 305             while (!isTerminatorOrIDSeparator(c = next())) {
 306                 if (firstPass) { // first, add hyphen
 307                     hadCountry = true; // we have a country, let variant parsing know
 308                     addSeparator();
 309                     ++oldBlen; // increment past hyphen
 310                     firstPass = false;
 311                 }
 312                 append(AsciiUtil.toUpper(c));
 313             }
 314             --index; // unget
 315
 316             int charsAppended = buffer.length() - oldBlen;
 317
 318             if (charsAppended == 0) {
 319                 // Do nothing.
 320             }
 321             else if (charsAppended < 2 || charsAppended > 3) {
 322                 // It's not a country, so return index and blen to
 323                 // their previous values.
 324                 index = oldIndex;
 325                 --oldBlen;
 326                 buffer.delete(oldBlen, buffer.length());
 327                 hadCountry = false;
 328             }
 329             else if (charsAppended == 3) {
 330                 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
 331                 if (region != null) {
 332                     set(oldBlen, region);
 333                 }
 334             }
 335
 336             return oldBlen;
 337         }
 338
 339         return buffer.length();
 340     }
 341
 342     /**
 343      * Advance index past country.
 344      * Index must be immediately after the script (if there is one, else language)
 345      * and IDSeparator.
 346      */
 347     private void skipCountry() {
 348         if (!atTerminator()) {
 349             if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
 350                 ++index;
 351             }
 352             /*
 353              * Save the index point after the separator, since the format
 354              * requires two separators if the country is not present.
 355              */
 356             int oldIndex = index;
 357
 358             skipUntilTerminatorOrIDSeparator();
 359             int charsSkipped = index - oldIndex;
 360             if (charsSkipped < 2 || charsSkipped > 3) {
 361                 index = oldIndex;
 362             }
 363         }
 364     }
 365
 366     /**
 367      * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
 368      * the codepage information from POSIX ids.  Index must be immediately after the country
 369      * or script.  Index is left at the keyword separator or at the end of the text.  Return
 370      * the start of the variant code in the buffer.
 371      *
 372      * In standard form, we can have the following forms:
 373      * ll__VVVV
 374      * ll_CC_VVVV
 375      * ll_Ssss_VVVV
 376      * ll_Ssss_CC_VVVV
 377      *
 378      * This also handles POSIX ids, which can have the following forms (pppp is code page id):
 379      * ll_CC.pppp          --> ll_CC
 380      * ll_CC.pppp@VVVV     --> ll_CC_VVVV
 381      * ll_CC@VVVV          --> ll_CC_VVVV
 382      *
 383      * We identify this use of '@' in POSIX ids by looking for an '=' following
 384      * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
 385      * being part of a POSIX id.
 386      *
 387      * Note:  since it was decided that we want an option to not handle POSIX ids, this
 388      * becomes a bit more complex.
 389      */
 390     private int parseVariant() {
 391         int oldBlen = buffer.length();
 392
 393         boolean start = true;
 394         boolean needSeparator = true;
 395         boolean skipping = false;
 396         char c;
 397         boolean firstPass = true;
 398
 399         while ((c = next()) != DONE) {
 400             if (c == DOT) {
 401                 start = false;
 402                 skipping = true;
 403             } else if (c == KEYWORD_SEPARATOR) {
 404                 if (haveKeywordAssign()) {
 405                     break;
 406                 }
 407                 skipping = false;
 408                 start = false;
 409                 needSeparator = true; // add another underscore if we have more text
 410             } else if (start) {
 411                 start = false;
 412                 if (c != UNDERSCORE && c != HYPHEN) {
 413                     index--;
 414                 }
 415             } else if (!skipping) {
 416                 if (needSeparator) {
 417                     needSeparator = false;
 418                     if (firstPass && !hadCountry) { // no country, we'll need two
 419                         addSeparator();
 420                         ++oldBlen; // for sure
 421                     }
 422                     addSeparator();
 423                     if (firstPass) { // only for the first separator
 424                         ++oldBlen;
 425                         firstPass = false;
 426                     }
 427                 }
 428                 c = AsciiUtil.toUpper(c);
 429                 if (c == HYPHEN || c == COMMA) {
 430                     c = UNDERSCORE;
 431                 }
 432                 append(c);
 433             }
 434         }
 435         --index; // unget
 436
 437         return oldBlen;
 438     }
 439
 440     // no need for skipvariant, to get the keywords we'll just scan directly for
 441     // the keyword separator
 442
 443     /**
 444      * Returns the normalized language id, or the empty string.
 445      */
 446     public String getLanguage() {
 447         reset();
 448         return getString(parseLanguage());
 449     }
 450
 451     /**
 452      * Returns the normalized script id, or the empty string.
 453      */
 454     public String getScript() {
 455         reset();
 456         skipLanguage();
 457         return getString(parseScript());
 458     }
 459
 460     /**
 461      * return the normalized country id, or the empty string.
 462      */
 463     public String getCountry() {
 464         reset();
 465         skipLanguage();
 466         skipScript();
 467         return getString(parseCountry());
 468     }
 469
 470     /**
 471      * Returns the normalized variant id, or the empty string.
 472      */
 473     public String getVariant() {
 474         reset();
 475         skipLanguage();
 476         skipScript();
 477         skipCountry();
 478         return getString(parseVariant());
 479     }
 480
 481     /**
 482      * Returns the language, script, country, and variant as separate strings.
 483      */
 484     public String[] getLanguageScriptCountryVariant() {
 485         reset();
 486         return new String[] {
 487                 getString(parseLanguage()),
 488                 getString(parseScript()),
 489                 getString(parseCountry()),
 490                 getString(parseVariant())
 491         };
 492     }
 493
 494     public void setBaseName(String baseName) {
 495         this.baseName = baseName;
 496     }
 497
 498     public void parseBaseName() {
 499         if (baseName != null) {
 500             set(0, baseName);
 501         } else {
 502             reset();
 503             parseLanguage();
 504             parseScript();
 505             parseCountry();
 506             parseVariant();
 507
 508             // catch unwanted trailing underscore after country if there was no variant
 509             int len = buffer.length();
 510             if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
 511                 buffer.deleteCharAt(len - 1);
 512             }
 513         }
 514     }
 515
 516     /**
 517      * Returns the normalized base form of the locale id.  The base
 518      * form does not include keywords.
 519      */
 520     public String getBaseName() {
 521         if (baseName != null) {
 522             return baseName;
 523         }
 524         parseBaseName();
 525         return getString(0);
 526     }
 527
 528     /**
 529      * Returns the normalized full form of the locale id.  The full
 530      * form includes keywords if they are present.
 531      */
 532     public String getName() {
 533         parseBaseName();
 534         parseKeywords();
 535         return getString(0);
 536     }
 537
 538     // keyword utilities
 539
 540     /**
 541      * If we have keywords, advance index to the start of the keywords and return true,
 542      * otherwise return false.
 543      */
 544     private boolean setToKeywordStart() {
 545         for (int i = index; i < id.length; ++i) {
 546             if (id[i] == KEYWORD_SEPARATOR) {
 547                 if (canonicalize) {
 548                     for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
 549                         if (id[j] == KEYWORD_ASSIGN) {
 550                             index = i;
 551                             return true;
 552                         }
 553                     }
 554                 } else {
 555                     if (++i < id.length) {
 556                         index = i;
 557                         return true;
 558                     }
 559                 }
 560                 break;
 561             }
 562         }
 563         return false;
 564     }
 565
 566     private static boolean isDoneOrKeywordAssign(char c) {
 567         return c == DONE || c == KEYWORD_ASSIGN;
 568     }
 569
 570     private static boolean isDoneOrItemSeparator(char c) {
 571         return c == DONE || c == ITEM_SEPARATOR;
 572     }
 573
 574     private String getKeyword() {
 575         int start = index;
 576         while (!isDoneOrKeywordAssign(next())) {
 577         }
 578         --index;
 579         return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
 580     }
 581
 582     private String getValue() {
 583         int start = index;
 584         while (!isDoneOrItemSeparator(next())) {
 585         }
 586         --index;
 587         return new String(id, start, index-start).trim(); // leave case alone
 588     }
 589
 590     private Comparator<String> getKeyComparator() {
 591         final Comparator<String> comp = new Comparator<String>() {
 592             public int compare(String lhs, String rhs) {
 593                 return lhs.compareTo(rhs);
 594             }
 595         };
 596         return comp;
 597     }
 598
 599     /**
 600      * Returns a map of the keywords and values, or null if there are none.
 601      */
 602     public Map<String, String> getKeywordMap() {
 603         if (keywords == null) {
 604             TreeMap<String, String> m = null;
 605             if (setToKeywordStart()) {
 606                 // trim spaces and convert to lower case, both keywords and values.
 607                 do {
 608                     String key = getKeyword();
 609                     if (key.length() == 0) {
 610                         break;
 611                     }
 612                     char c = next();
 613                     if (c != KEYWORD_ASSIGN) {
 614                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
 615                         if (c == DONE) {
 616                             break;
 617                         } else {
 618                             continue;
 619                         }
 620                     }
 621                     String value = getValue();
 622                     if (value.length() == 0) {
 623                         // throw new IllegalArgumentException("key '" + key + "' missing a value.");
 624                         continue;
 625                     }
 626                     if (m == null) {
 627                         m = new TreeMap<String, String>(getKeyComparator());
 628                     } else if (m.containsKey(key)) {
 629                         // throw new IllegalArgumentException("key '" + key + "' already has a value.");
 630                         continue;
 631                     }
 632                     m.put(key, value);
 633                 } while (next() == ITEM_SEPARATOR);
 634             }
 635             keywords = m != null ? m : Collections.<String, String>emptyMap();
 636         }
 637
 638         return keywords;
 639     }
 640
 641
 642     /**
 643      * Parse the keywords and return start of the string in the buffer.
 644      */
 645     private int parseKeywords() {
 646         int oldBlen = buffer.length();
 647         Map<String, String> m = getKeywordMap();
 648         if (!m.isEmpty()) {
 649             boolean first = true;
 650             for (Map.Entry<String, String> e : m.entrySet()) {
 651                 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
 652                 first = false;
 653                 append(e.getKey());
 654                 append(KEYWORD_ASSIGN);
 655                 append(e.getValue());
 656             }
 657             if (first == false) {
 658                 ++oldBlen;
 659             }
 660         }
 661         return oldBlen;
 662     }
 663
 664     /**
 665      * Returns an iterator over the keywords, or null if we have an empty map.
 666      */
 667     public Iterator<String> getKeywords() {
 668         Map<String, String> m = getKeywordMap();
 669         return m.isEmpty() ? null : m.keySet().iterator();
 670     }
 671
 672     /**
 673      * Returns the value for the named keyword, or null if the keyword is not
 674      * present.
 675      */
 676     public String getKeywordValue(String keywordName) {
 677         Map<String, String> m = getKeywordMap();
 678         return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
 679     }
 680
 681     /**
 682      * Set the keyword value only if it is not already set to something else.
 683      */
 684     public void defaultKeywordValue(String keywordName, String value) {
 685         setKeywordValue(keywordName, value, false);
 686     }
 687
 688     /**
 689      * Set the value for the named keyword, or unset it if value is null.  If
 690      * keywordName itself is null, unset all keywords.  If keywordName is not null,
 691      * value must not be null.
 692      */
 693     public void setKeywordValue(String keywordName, String value) {
 694         setKeywordValue(keywordName, value, true);
 695     }
 696
 697     /**
 698      * Set the value for the named keyword, or unset it if value is null.  If
 699      * keywordName itself is null, unset all keywords.  If keywordName is not null,
 700      * value must not be null.  If reset is true, ignore any previous value for
 701      * the keyword, otherwise do not change the keyword (including removal of
 702      * one or all keywords).
 703      */
 704     private void setKeywordValue(String keywordName, String value, boolean reset) {
 705         if (keywordName == null) {
 706             if (reset) {
 707                 // force new map, ignore value
 708                 keywords = Collections.<String, String>emptyMap();
 709             }
 710         } else {
 711             keywordName = AsciiUtil.toLowerString(keywordName.trim());
 712             if (keywordName.length() == 0) {
 713                 throw new IllegalArgumentException("keyword must not be empty");
 714             }
 715             if (value != null) {
 716                 value = value.trim();
 717                 if (value.length() == 0) {
 718                     throw new IllegalArgumentException("value must not be empty");
 719                 }
 720             }
 721             Map<String, String> m = getKeywordMap();
 722             if (m.isEmpty()) { // it is EMPTY_MAP
 723                 if (value != null) {
 724                     // force new map
 725                     keywords = new TreeMap<String, String>(getKeyComparator());
 726                     keywords.put(keywordName, value.trim());
 727                 }
 728             } else {
 729                 if (reset || !m.containsKey(keywordName)) {
 730                     if (value != null) {
 731                         m.put(keywordName, value);
 732                     } else {
 733                         m.remove(keywordName);
 734                         if (m.isEmpty()) {
 735                             // force new map
 736                             keywords = Collections.<String, String>emptyMap();
 737                         }
 738                     }
 739                 }
 740             }
 741         }
 742     }
 743 }