2 ******************************************************************************
3 * Copyright (C) 2003-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
8 package com.ibm.icu.impl;
10 import java.util.Collections;
11 import java.util.Comparator;
12 import java.util.Iterator;
14 import java.util.TreeMap;
16 import com.ibm.icu.impl.locale.AsciiUtil;
19 * Utility class to parse and normalize locale ids (including POSIX style)
21 public final class LocaleIDParser {
24 * Char array representing the locale ID.
29 * Current position in {@link #id} (while parsing).
34 * Temporary buffer for parsed sections of data.
36 private StringBuilder buffer;
38 // POSIX ids are not handled unless explicitly requested (see the canonicalize flag below).
39 private boolean canonicalize;
40 private boolean hadCountry;
42 // used when canonicalizing
43 Map<String, String> keywords;
49 private static final char KEYWORD_SEPARATOR = '@';
50 private static final char HYPHEN = '-';
51 private static final char KEYWORD_ASSIGN = '=';
52 private static final char COMMA = ',';
53 private static final char ITEM_SEPARATOR = ';';
54 private static final char DOT = '.';
55 private static final char UNDERSCORE = '_';
// Creates a parser for localeID with canonicalization switched off; all work is
// delegated to the two-argument constructor.
57 public LocaleIDParser(String localeID) {
58 this(localeID, false);
61 public LocaleIDParser(String localeID, boolean canonicalize) {
62 id = localeID.toCharArray();
64 buffer = new StringBuilder(id.length + 5);
65 this.canonicalize = canonicalize;
68 private void reset() {
70 buffer = new StringBuilder(id.length + 5);
73 // utilities for working on text in the buffer
76 * Append c to the buffer.
78 private void append(char c) {
82 private void addSeparator() {
87 * Returns the text in the buffer from start to the end of the buffer as a String.
89 private String getString(int start) {
// Single-argument substring: the result always runs through the current end of the buffer.
90 return buffer.substring(start);
94 * Set the length of the buffer to pos, then append the string.
96 private void set(int pos, String s) {
// Truncate: discard everything from pos up to the current end of the buffer.
97 buffer.delete(pos, buffer.length());
// After the truncation pos is the end of the buffer, so this insert behaves as an append.
98 buffer.insert(pos, s);
102 * Append the string to the buffer.
104 private void append(String s) {
108 // utilities for parsing text out of the id
111 * Character to indicate no more text is available in the id.
113 private static final char DONE = '\uffff';
116 * Returns the character at index in the id, and advance index. The returned character
117 * is DONE if index was at the limit of the buffer. The index is advanced regardless
118 * so that decrementing the index will always 'unget' the last character returned.
120 private char next() {
121 if (index == id.length) {
130 * Advance index until the next terminator or id separator, and leave it there.
132 private void skipUntilTerminatorOrIDSeparator() {
133 while (!isTerminatorOrIDSeparator(next()));
138 * Returns true if the character at index in the id is a terminator.
140 private boolean atTerminator() {
// The bounds check is evaluated first, so id[index] is only read when index is in range;
// being at or past the end of id counts as being at a terminator.
141 return index >= id.length || isTerminator(id[index]);
145 * Returns true if the character is a terminator (keyword separator, dot, or DONE).
146 * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
148 private boolean isTerminator(char c) {
149 // always terminate at DOT, even if not handling POSIX. It's an error...
// DONE is the "no more text" sentinel, so it terminates as well, alongside the
// keyword separator ('@').
150 return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
154 * Returns true if the character is a terminator or id separator.
156 private boolean isTerminatorOrIDSeparator(char c) {
// Both underscore and hyphen are accepted as id separators; every other character
// is judged by isTerminator.
157 return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
161 * Returns true if the start of the buffer has an experimental or private language
162 * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
164 private boolean haveExperimentalLanguagePrefix() {
167 if (c == HYPHEN || c == UNDERSCORE) {
169 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
176 * Returns true if a keyword assignment character ('=') occurs at or after index.
178 private boolean haveKeywordAssign() {
179 // assume it is safe to start from index
180 for (int i = index; i < id.length; ++i) {
181 if (id[i] == KEYWORD_ASSIGN) {
189 * Advance index past language, and accumulate normalized language code in buffer.
190 * Index must be at 0 when this is called. Index is left at a terminator or id
191 * separator. Returns the start of the language code in the buffer.
193 private int parseLanguage() {
194 int startLength = buffer.length();
196 if (haveExperimentalLanguagePrefix()) {
197 append(AsciiUtil.toLower(id[0]));
203 while(!isTerminatorOrIDSeparator(c = next())) {
204 append(AsciiUtil.toLower(c));
208 if (buffer.length() - startLength == 3) {
209 String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
219 * Advance index past language. Index must be at 0 when this is called. Index
220 * is left at a terminator or id separator.
222 private void skipLanguage() {
223 if (haveExperimentalLanguagePrefix()) {
226 skipUntilTerminatorOrIDSeparator();
230 * Advance index past script, and accumulate normalized script in buffer.
231 * Index must be immediately after the language.
232 * If the item at this position is not a script (is not four characters
233 * long) leave index and buffer unchanged. Otherwise index is left at
234 * a terminator or id separator. Returns the start of the script code
235 * in the buffer (this may be equal to the buffer length, if there is no
238 private int parseScript() {
239 if (!atTerminator()) {
240 int oldIndex = index; // save original index
243 int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
245 boolean firstPass = true;
246 while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
249 append(AsciiUtil.toUpper(c));
252 append(AsciiUtil.toLower(c));
257 /* If it's not exactly 4 characters long, then it's not a script. */
258 if (index - oldIndex != 5) { // +1 to account for separator
260 buffer.delete(oldBlen, buffer.length());
262 oldBlen++; // index past hyphen, for clients who want to extract just the script
267 return buffer.length();
271 * Advance index past script.
272 * Index must be immediately after the language and IDSeparator.
273 * If the item at this position is not a script (is not four characters
274 * long) leave index. Otherwise index is left at a terminator or
277 private void skipScript() {
278 if (!atTerminator()) {
279 int oldIndex = index;
283 while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
286 if (index - oldIndex != 5) { // +1 to account for separator
293 * Advance index past country, and accumulate normalized country in buffer.
294 * Index must be immediately after the script (if there is one, else language)
295 * and IDSeparator. Return the start of the country code in the buffer.
297 private int parseCountry() {
298 if (!atTerminator()) {
299 int oldIndex = index;
302 int oldBlen = buffer.length();
304 boolean firstPass = true;
305 while (!isTerminatorOrIDSeparator(c = next())) {
306 if (firstPass) { // first, add hyphen
307 hadCountry = true; // we have a country, let variant parsing know
309 ++oldBlen; // increment past hyphen
312 append(AsciiUtil.toUpper(c));
316 int charsAppended = buffer.length() - oldBlen;
318 if (charsAppended == 0) {
321 else if (charsAppended < 2 || charsAppended > 3) {
322 // It's not a country, so return index and blen to
323 // their previous values.
326 buffer.delete(oldBlen, buffer.length());
329 else if (charsAppended == 3) {
330 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
331 if (region != null) {
332 set(oldBlen, region);
339 return buffer.length();
343 * Advance index past country.
344 * Index must be immediately after the script (if there is one, else language)
347 private void skipCountry() {
348 if (!atTerminator()) {
349 if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
353 * Save the index point after the separator, since the format
354 * requires two separators if the country is not present.
356 int oldIndex = index;
358 skipUntilTerminatorOrIDSeparator();
359 int charsSkipped = index - oldIndex;
360 if (charsSkipped < 2 || charsSkipped > 3) {
367 * Advance index past variant, and accumulate normalized variant in buffer. This ignores
368 * the codepage information from POSIX ids. Index must be immediately after the country
369 * or script. Index is left at the keyword separator or at the end of the text. Return
370 * the start of the variant code in the buffer.
372 * In standard form, we can have the following forms:
378 * This also handles POSIX ids, which can have the following forms (pppp is code page id):
379 * ll_CC.pppp --> ll_CC
380 * ll_CC.pppp@VVVV --> ll_CC_VVVV
381 * ll_CC@VVVV --> ll_CC_VVVV
383 * We identify this use of '@' in POSIX ids by looking for an '=' following
384 * the '@'. If there is one, we consider '@' to start a keyword list, instead of
385 * being part of a POSIX id.
387 * Note: since it was decided that we want an option to not handle POSIX ids, this
388 * becomes a bit more complex.
390 private int parseVariant() {
391 int oldBlen = buffer.length();
393 boolean start = true;
394 boolean needSeparator = true;
395 boolean skipping = false;
397 boolean firstPass = true;
399 while ((c = next()) != DONE) {
403 } else if (c == KEYWORD_SEPARATOR) {
404 if (haveKeywordAssign()) {
409 needSeparator = true; // add another underscore if we have more text
412 if (c != UNDERSCORE && c != HYPHEN) {
415 } else if (!skipping) {
417 needSeparator = false;
418 if (firstPass && !hadCountry) { // no country, we'll need two
420 ++oldBlen; // for sure
423 if (firstPass) { // only for the first separator
428 c = AsciiUtil.toUpper(c);
429 if (c == HYPHEN || c == COMMA) {
440 // no need for skipvariant, to get the keywords we'll just scan directly for
441 // the keyword separator
444 * Returns the normalized language id, or the empty string.
446 public String getLanguage() {
448 return getString(parseLanguage());
452 * Returns the normalized script id, or the empty string.
454 public String getScript() {
457 return getString(parseScript());
461 * Returns the normalized country id, or the empty string.
463 public String getCountry() {
467 return getString(parseCountry());
471 * Returns the normalized variant id, or the empty string.
473 public String getVariant() {
478 return getString(parseVariant());
482 * Returns the language, script, country, and variant as separate strings.
484 public String[] getLanguageScriptCountryVariant() {
486 return new String[] {
487 getString(parseLanguage()),
488 getString(parseScript()),
489 getString(parseCountry()),
490 getString(parseVariant())
// Stores the given string in the baseName field as-is; nothing is parsed or validated here.
// NOTE(review): parseBaseName() and getBaseName() both branch on baseName != null --
// confirm what a preset value short-circuits in those methods.
494 public void setBaseName(String baseName) {
495 this.baseName = baseName;
498 public void parseBaseName() {
499 if (baseName != null) {
508 // catch unwanted trailing underscore after country if there was no variant
509 int len = buffer.length();
510 if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
511 buffer.deleteCharAt(len - 1);
517 * Returns the normalized base form of the locale id. The base
518 * form does not include keywords.
520 public String getBaseName() {
521 if (baseName != null) {
529 * Returns the normalized full form of the locale id. The full
530 * form includes keywords if they are present.
532 public String getName() {
541 * If we have keywords, advance index to the start of the keywords and return true,
542 * otherwise return false.
544 private boolean setToKeywordStart() {
545 for (int i = index; i < id.length; ++i) {
546 if (id[i] == KEYWORD_SEPARATOR) {
548 for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
549 if (id[j] == KEYWORD_ASSIGN) {
555 if (++i < id.length) {
// Returns true if c is the end-of-text sentinel (DONE) or the keyword assignment
// character ('='). Used as the stop condition when scanning a keyword name.
566 private static boolean isDoneOrKeywordAssign(char c) {
567 return c == DONE || c == KEYWORD_ASSIGN;
// Returns true if c is the end-of-text sentinel (DONE) or the item separator (';').
// Used as the stop condition when scanning a keyword value.
570 private static boolean isDoneOrItemSeparator(char c) {
571 return c == DONE || c == ITEM_SEPARATOR;
574 private String getKeyword() {
576 while (!isDoneOrKeywordAssign(next())) {
579 return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
582 private String getValue() {
584 while (!isDoneOrItemSeparator(next())) {
587 return new String(id, start, index-start).trim(); // leave case alone
590 private Comparator<String> getKeyComparator() {
591 final Comparator<String> comp = new Comparator<String>() {
592 public int compare(String lhs, String rhs) {
593 return lhs.compareTo(rhs);
600 * Returns a map of the keywords and values, or an empty map if there are none.
602 public Map<String, String> getKeywordMap() {
603 if (keywords == null) {
604 TreeMap<String, String> m = null;
605 if (setToKeywordStart()) {
606 // trim spaces from both keywords and values; only keywords are converted to lower case.
608 String key = getKeyword();
609 if (key.length() == 0) {
613 if (c != KEYWORD_ASSIGN) {
614 // throw new IllegalArgumentException("key '" + key + "' missing a value.");
621 String value = getValue();
622 if (value.length() == 0) {
623 // throw new IllegalArgumentException("key '" + key + "' missing a value.");
627 m = new TreeMap<String, String>(getKeyComparator());
628 } else if (m.containsKey(key)) {
629 // throw new IllegalArgumentException("key '" + key + "' already has a value.");
633 } while (next() == ITEM_SEPARATOR);
635 keywords = m != null ? m : Collections.<String, String>emptyMap();
643 * Parse the keywords and return start of the string in the buffer.
645 private int parseKeywords() {
646 int oldBlen = buffer.length();
647 Map<String, String> m = getKeywordMap();
649 boolean first = true;
650 for (Map.Entry<String, String> e : m.entrySet()) {
651 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
654 append(KEYWORD_ASSIGN);
655 append(e.getValue());
657 if (first == false) {
665 * Returns an iterator over the keywords, or null if we have an empty map.
667 public Iterator<String> getKeywords() {
668 Map<String, String> m = getKeywordMap();
// isEmpty() is called unconditionally, so getKeywordMap() is relied on to return a
// non-null map. An empty map is reported to the caller as null, not as an empty iterator.
669 return m.isEmpty() ? null : m.keySet().iterator();
673 * Returns the value for the named keyword, or null if the keyword is not
676 public String getKeywordValue(String keywordName) {
677 Map<String, String> m = getKeywordMap();
// The lookup key is trimmed and lower-cased before the map is consulted. keywordName is
// only dereferenced when the map is non-empty, so a null name throws only in that case.
678 return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
682 * Set the keyword value only if it is not already set to something else.
684 public void defaultKeywordValue(String keywordName, String value) {
// reset = false selects the non-overwriting mode of the three-argument overload.
685 setKeywordValue(keywordName, value, false);
689 * Set the value for the named keyword, or unset it if value is null. If
690 * keywordName itself is null, unset all keywords. If keywordName is not null,
691 * value must not be empty.
693 public void setKeywordValue(String keywordName, String value) {
// reset = true: the three-argument overload ignores any previous value for the keyword.
694 setKeywordValue(keywordName, value, true);
698 * Set the value for the named keyword, or unset it if value is null. If
699 * keywordName itself is null, unset all keywords. If keywordName is not null,
700 * value must not be empty. If reset is true, ignore any previous value for
701 * the keyword, otherwise do not change the keyword (including removal of
702 * one or all keywords).
704 private void setKeywordValue(String keywordName, String value, boolean reset) {
705 if (keywordName == null) {
707 // force new map, ignore value
708 keywords = Collections.<String, String>emptyMap();
711 keywordName = AsciiUtil.toLowerString(keywordName.trim());
712 if (keywordName.length() == 0) {
713 throw new IllegalArgumentException("keyword must not be empty");
716 value = value.trim();
717 if (value.length() == 0) {
718 throw new IllegalArgumentException("value must not be empty");
721 Map<String, String> m = getKeywordMap();
722 if (m.isEmpty()) { // it is EMPTY_MAP
725 keywords = new TreeMap<String, String>(getKeyComparator());
726 keywords.put(keywordName, value.trim());
729 if (reset || !m.containsKey(keywordName)) {
731 m.put(keywordName, value);
733 m.remove(keywordName);
736 keywords = Collections.<String, String>emptyMap();