2 ******************************************************************************
\r
3 * Copyright (C) 2003-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
10 import java.util.Collections;
\r
11 import java.util.Comparator;
\r
12 import java.util.Iterator;
\r
13 import java.util.Map;
\r
14 import java.util.TreeMap;
\r
16 import com.ibm.icu.impl.locale.AsciiUtil;
\r
19 * Utility class to parse and normalize locale ids (including POSIX style)
\r
21 public final class LocaleIDParser {
\r
24 private char[] buffer;
\r
26 // POSIX ids are not handled unless explicitly requested via this flag.
\r
27 private boolean canonicalize;
\r
28 private boolean hadCountry;
\r
30 // used when canonicalizing
\r
31 Map<String, String> keywords;
\r
35 * Parsing constants.
\r
37 private static final char KEYWORD_SEPARATOR = '@';
\r
38 private static final char HYPHEN = '-';
\r
39 private static final char KEYWORD_ASSIGN = '=';
\r
40 private static final char COMMA = ',';
\r
41 private static final char ITEM_SEPARATOR = ';';
\r
42 private static final char DOT = '.';
\r
43 private static final char UNDERSCORE = '_';
\r
45 public LocaleIDParser(String localeID) {
\r
46 this(localeID, false);
\r
49 public LocaleIDParser(String localeID, boolean canonicalize) {
\r
50 id = localeID.toCharArray();
\r
52 buffer = new char[id.length + 5];
\r
54 this.canonicalize = canonicalize;
\r
57 private void reset() {
\r
61 // utilities for working on text in the buffer
\r
64 * Append c to the buffer.
\r
66 private void append(char c) {
\r
70 catch (IndexOutOfBoundsException e) {
\r
71 if (buffer.length > 512) {
\r
72 // something is seriously wrong, let this go
\r
75 char[] nbuffer = new char[buffer.length * 2];
\r
76 System.arraycopy(buffer, 0, nbuffer, 0, buffer.length);
\r
83 private void addSeparator() {
\r
88 * Returns the text in the buffer from start to blen as a String.
\r
90 private String getString(int start) {
\r
91 if (start == blen) {
\r
94 return new String(buffer, start, blen-start);
\r
98 * Set the length of the buffer to pos, then append the string.
\r
100 private void set(int pos, String s) {
\r
101 this.blen = pos; // no safety
\r
106 * Append the string to the buffer.
\r
108 private void append(String s) {
\r
109 for (int i = 0; i < s.length(); ++i) {
\r
110 append(s.charAt(i));
\r
114 // utilities for parsing text out of the id
\r
117 * Character to indicate no more text is available in the id.
\r
119 private static final char DONE = '\uffff';
\r
122 * Returns the character at index in the id, and advance index. The returned character
\r
123 * is DONE if index was at the limit of the id. The index is advanced regardless
\r
124 * so that decrementing the index will always 'unget' the last character returned.
\r
126 private char next() {
\r
127 if (index == id.length) {
\r
132 return id[index++];
\r
136 * Advance index until the next terminator or id separator, and leave it there.
\r
138 private void skipUntilTerminatorOrIDSeparator() {
\r
139 while (!isTerminatorOrIDSeparator(next())) {
\r
145 * Returns true if the character at index in the id is a terminator.
\r
147 private boolean atTerminator() {
\r
148 return index >= id.length || isTerminator(id[index]);
\r
152 * Returns true if the character is an id separator (underscore or hyphen).
\r
154 /* private boolean isIDSeparator(char c) {
\r
155 return c == UNDERSCORE || c == HYPHEN;
\r
159 * Returns true if the character is a terminator (keyword separator, dot, or DONE).
\r
160 * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
\r
162 private boolean isTerminator(char c) {
\r
163 // always terminate at DOT, even if not handling POSIX. It's an error...
\r
164 return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
\r
168 * Returns true if the character is a terminator or id separator.
\r
170 private boolean isTerminatorOrIDSeparator(char c) {
\r
171 return c == KEYWORD_SEPARATOR || c == UNDERSCORE || c == HYPHEN ||
\r
172 c == DONE || c == DOT;
\r
176 * Returns true if the start of the id has an experimental or private language
\r
177 * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
\r
179 private boolean haveExperimentalLanguagePrefix() {
\r
180 if (id.length > 2) {
\r
182 if (c == HYPHEN || c == UNDERSCORE) {
\r
184 return c == 'x' || c == 'X' || c == 'i' || c == 'I';
\r
191 * Returns true if a keyword assignment character ('=') occurs in the id at or after index.
\r
193 private boolean haveKeywordAssign() {
\r
194 // assume it is safe to start from index
\r
195 for (int i = index; i < id.length; ++i) {
\r
196 if (id[i] == KEYWORD_ASSIGN) {
\r
204 * Advance index past language, and accumulate normalized language code in buffer.
\r
205 * Index must be at 0 when this is called. Index is left at a terminator or id
\r
206 * separator. Returns the start of the language code in the buffer.
\r
208 private int parseLanguage() {
\r
209 if (haveExperimentalLanguagePrefix()) {
\r
210 append(Character.toLowerCase(id[0]));
\r
216 while(!isTerminatorOrIDSeparator(c = next())) {
\r
217 append(Character.toLowerCase(c));
\r
222 String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
\r
223 if (lang != null) {
\r
232 * Advance index past language. Index must be at 0 when this is called. Index
\r
233 * is left at a terminator or id separator.
\r
235 private void skipLanguage() {
\r
236 if (haveExperimentalLanguagePrefix()) {
\r
239 skipUntilTerminatorOrIDSeparator();
\r
243 * Advance index past script, and accumulate normalized script in buffer.
\r
244 * Index must be immediately after the language.
\r
245 * If the item at this position is not a script (is not four characters
\r
246 * long) leave index and buffer unchanged. Otherwise index is left at
\r
247 * a terminator or id separator. Returns the start of the script code
\r
248 * in the buffer (this may be equal to the buffer length, if there is no
\r
251 private int parseScript() {
\r
252 if (!atTerminator()) {
\r
253 int oldIndex = index; // save original index
\r
256 int oldBlen = blen; // get before append hyphen, if we truncate everything is undone
\r
258 while(!isTerminatorOrIDSeparator(c = next())) {
\r
259 if (blen == oldBlen) { // first pass
\r
261 append(Character.toUpperCase(c));
\r
263 append(Character.toLowerCase(c));
\r
268 /* If it's not exactly 4 characters long, then it's not a script. */
\r
269 if (index - oldIndex != 5) { // +1 to account for separator
\r
273 oldBlen++; // index past hyphen, for clients who want to extract just the script
\r
282 * Advance index past script.
\r
283 * Index must be immediately after the language and IDSeparator.
\r
284 * If the item at this position is not a script (is not four characters
\r
285 * long) leave index. Otherwise index is left at a terminator or
\r
288 private void skipScript() {
\r
289 if (!atTerminator()) {
\r
290 int oldIndex = index;
\r
293 skipUntilTerminatorOrIDSeparator();
\r
294 if (index - oldIndex != 5) { // +1 to account for separator
\r
301 * Advance index past country, and accumulate normalized country in buffer.
\r
302 * Index must be immediately after the script (if there is one, else language)
\r
303 * and IDSeparator. Return the start of the country code in the buffer.
\r
305 private int parseCountry() {
\r
306 if (!atTerminator()) {
\r
307 int oldIndex = index;
\r
310 int oldBlen = blen;
\r
312 while (!isTerminatorOrIDSeparator(c = next())) {
\r
313 if (oldBlen == blen) { // first, add hyphen
\r
314 hadCountry = true; // we have a country, let variant parsing know
\r
316 ++oldBlen; // increment past hyphen
\r
318 append(Character.toUpperCase(c));
\r
322 int charsAppended = blen - oldBlen;
\r
324 if (charsAppended == 0) {
\r
327 else if (charsAppended < 2 || charsAppended > 3) {
\r
328 // It's not a country, so return index and blen to
\r
329 // their previous values.
\r
333 hadCountry = false;
\r
335 else if (charsAppended == 3) {
\r
336 String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
\r
337 if (region != null) {
\r
338 set(oldBlen, region);
\r
349 * Advance index past country.
\r
350 * Index must be immediately after the script (if there is one, else language)
\r
353 private void skipCountry() {
\r
354 if (!atTerminator()) {
\r
357 * Save the index point after the separator, since the format
\r
358 * requires two separators if the country is not present.
\r
360 int oldIndex = index;
\r
362 skipUntilTerminatorOrIDSeparator();
\r
363 int charsSkipped = index - oldIndex;
\r
364 if (charsSkipped < 2 || charsSkipped > 3) {
\r
371 * Advance index past variant, and accumulate normalized variant in buffer. This ignores
\r
372 * the codepage information from POSIX ids. Index must be immediately after the country
\r
373 * or script. Index is left at the keyword separator or at the end of the text. Return
\r
374 * the start of the variant code in the buffer.
\r
376 * In standard form, we can have the following forms:
\r
382 * This also handles POSIX ids, which can have the following forms (pppp is code page id):
\r
383 * ll_CC.pppp --> ll_CC
\r
384 * ll_CC.pppp@VVVV --> ll_CC_VVVV
\r
385 * ll_CC@VVVV --> ll_CC_VVVV
\r
387 * We identify this use of '@' in POSIX ids by looking for an '=' following
\r
388 * the '@'. If there is one, we consider '@' to start a keyword list, instead of
\r
389 * being part of a POSIX id.
\r
391 * Note: since it was decided that we want an option to not handle POSIX ids, this
\r
392 * becomes a bit more complex.
\r
394 private int parseVariant() {
\r
395 int oldBlen = blen;
\r
397 boolean start = true;
\r
398 boolean needSeparator = true;
\r
399 boolean skipping = false;
\r
401 while ((c = next()) != DONE) {
\r
405 } else if (c == KEYWORD_SEPARATOR) {
\r
406 if (haveKeywordAssign()) {
\r
411 needSeparator = true; // add another underscore if we have more text
\r
412 } else if (start) {
\r
414 } else if (!skipping) {
\r
415 if (needSeparator) {
\r
416 boolean incOldBlen = blen == oldBlen; // need to skip separators
\r
417 needSeparator = false;
\r
418 if (incOldBlen && !hadCountry) { // no country, we'll need two
\r
420 ++oldBlen; // for sure
\r
423 if (incOldBlen) { // only for the first separator
\r
427 c = Character.toUpperCase(c);
\r
428 if (c == HYPHEN || c == COMMA) {
\r
439 // no need for skipvariant, to get the keywords we'll just scan directly for
\r
440 // the keyword separator
\r
443 * Returns the normalized language id, or the empty string.
\r
445 public String getLanguage() {
\r
447 return getString(parseLanguage());
\r
451 * Returns the normalized script id, or the empty string.
\r
453 public String getScript() {
\r
456 return getString(parseScript());
\r
460 * Returns the normalized country id, or the empty string.
\r
462 public String getCountry() {
\r
466 return getString(parseCountry());
\r
470 * Returns the normalized variant id, or the empty string.
\r
472 public String getVariant() {
\r
477 return getString(parseVariant());
\r
481 * Returns the language, script, country, and variant as separate strings.
\r
483 public String[] getLanguageScriptCountryVariant() {
\r
485 return new String[] {
\r
486 getString(parseLanguage()),
\r
487 getString(parseScript()),
\r
488 getString(parseCountry()),
\r
489 getString(parseVariant())
\r
493 public void setBaseName(String baseName) {
\r
494 this.baseName = baseName;
\r
497 public void parseBaseName() {
\r
498 if (baseName != null) {
\r
507 // catch unwanted trailing underscore after country if there was no variant
\r
508 if (blen > 1 && buffer[blen-1] == UNDERSCORE) {
\r
515 * Returns the normalized base form of the locale id. The base
\r
516 * form does not include keywords.
\r
518 public String getBaseName() {
\r
519 if (baseName != null) {
\r
523 return getString(0);
\r
527 * Returns the normalized full form of the locale id. The full
\r
528 * form includes keywords if they are present.
\r
530 public String getName() {
\r
533 return getString(0);
\r
536 // keyword utilities
\r
539 * If we have keywords, advance index to the start of the keywords and return true,
\r
540 * otherwise return false.
\r
542 private boolean setToKeywordStart() {
\r
543 for (int i = index; i < id.length; ++i) {
\r
544 if (id[i] == KEYWORD_SEPARATOR) {
\r
545 if (canonicalize) {
\r
546 for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
\r
547 if (id[j] == KEYWORD_ASSIGN) {
\r
553 if (++i < id.length) {
\r
564 private static boolean isDoneOrKeywordAssign(char c) {
\r
565 return c == DONE || c == KEYWORD_ASSIGN;
\r
568 private static boolean isDoneOrItemSeparator(char c) {
\r
569 return c == DONE || c == ITEM_SEPARATOR;
\r
572 private String getKeyword() {
\r
574 while (!isDoneOrKeywordAssign(next())) {
\r
577 return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
\r
580 private String getValue() {
\r
582 while (!isDoneOrItemSeparator(next())) {
\r
585 return new String(id, start, index-start).trim(); // leave case alone
\r
588 private Comparator<String> getKeyComparator() {
\r
589 final Comparator<String> comp = new Comparator<String>() {
\r
590 public int compare(String lhs, String rhs) {
\r
591 return lhs.compareTo(rhs);
\r
598 * Returns a map of the keywords and values, or an empty map if there are none.
\r
600 public Map<String, String> getKeywordMap() {
\r
601 if (keywords == null) {
\r
602 TreeMap<String, String> m = null;
\r
603 if (setToKeywordStart()) {
\r
604 // trim spaces from both keywords and values; only keywords are converted to lower case.
\r
606 String key = getKeyword();
\r
607 if (key.length() == 0) {
\r
611 if (c != KEYWORD_ASSIGN) {
\r
612 // throw new IllegalArgumentException("key '" + key + "' missing a value.");
\r
619 String value = getValue();
\r
620 if (value.length() == 0) {
\r
621 // throw new IllegalArgumentException("key '" + key + "' missing a value.");
\r
625 m = new TreeMap<String, String>(getKeyComparator());
\r
626 } else if (m.containsKey(key)) {
\r
627 // throw new IllegalArgumentException("key '" + key + "' already has a value.");
\r
631 } while (next() == ITEM_SEPARATOR);
\r
633 keywords = m != null ? m : Collections.<String, String>emptyMap();
\r
641 * Parse the keywords and return start of the string in the buffer.
\r
643 private int parseKeywords() {
\r
644 int oldBlen = blen;
\r
645 Map<String, String> m = getKeywordMap();
\r
646 if (!m.isEmpty()) {
\r
647 boolean first = true;
\r
648 for (Map.Entry<String, String> e : m.entrySet()) {
\r
649 append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
\r
651 append(e.getKey());
\r
652 append(KEYWORD_ASSIGN);
\r
653 append(e.getValue());
\r
655 if (blen != oldBlen) {
\r
663 * Returns an iterator over the keywords, or null if we have an empty map.
\r
665 public Iterator<String> getKeywords() {
\r
666 Map<String, String> m = getKeywordMap();
\r
667 return m.isEmpty() ? null : m.keySet().iterator();
\r
671 * Returns the value for the named keyword, or null if the keyword is not
\r
674 public String getKeywordValue(String keywordName) {
\r
675 Map<String, String> m = getKeywordMap();
\r
676 return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
\r
680 * Set the keyword value only if it is not already set to something else.
\r
682 public void defaultKeywordValue(String keywordName, String value) {
\r
683 setKeywordValue(keywordName, value, false);
\r
687 * Set the value for the named keyword, or unset it if value is null. If
\r
688 * keywordName itself is null, unset all keywords. If keywordName is not null,
\r
689 * a non-null value must not be empty (after trimming).
\r
691 public void setKeywordValue(String keywordName, String value) {
\r
692 setKeywordValue(keywordName, value, true);
\r
696 * Set the value for the named keyword, or unset it if value is null. If
\r
697 * keywordName itself is null, unset all keywords. If keywordName is not null,
\r
698 * a non-null value must not be empty (after trimming). If reset is true, ignore any previous value for
\r
699 * the keyword, otherwise do not change the keyword (including removal of
\r
700 * one or all keywords).
\r
702 private void setKeywordValue(String keywordName, String value, boolean reset) {
\r
703 if (keywordName == null) {
\r
705 // force new map, ignore value
\r
706 keywords = Collections.<String, String>emptyMap();
\r
709 keywordName = AsciiUtil.toLowerString(keywordName.trim());
\r
710 if (keywordName.length() == 0) {
\r
711 throw new IllegalArgumentException("keyword must not be empty");
\r
713 if (value != null) {
\r
714 value = value.trim();
\r
715 if (value.length() == 0) {
\r
716 throw new IllegalArgumentException("value must not be empty");
\r
719 Map<String, String> m = getKeywordMap();
\r
720 if (m.isEmpty()) { // it is EMPTY_MAP
\r
721 if (value != null) {
\r
723 keywords = new TreeMap<String, String>(getKeyComparator());
\r
724 keywords.put(keywordName, value.trim());
\r
727 if (reset || !m.containsKey(keywordName)) {
\r
728 if (value != null) {
\r
729 m.put(keywordName, value);
\r
731 m.remove(keywordName);
\r
734 keywords = Collections.<String, String>emptyMap();
\r