2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.util.Comparator;
\r
11 import java.util.Locale;
\r
13 import com.ibm.icu.util.ULocale;
\r
16 * <p>Collator performs locale-sensitive string comparison.</p>
\r
18 * <p>Following the <a href="http://www.unicode.org">Unicode
\r
19 * Consortium</a>'s specifications for the
\r
20 * <a href="http://www.unicode.org/unicode/reports/tr10/"> Unicode Collation
\r
21 * Algorithm (UCA)</a>, there are 5 different levels of strength used
\r
25 * <li>PRIMARY strength: Typically, this is used to denote differences between
\r
26 * base characters (for example, "a" &lt; "b").
\r
27 * It is the strongest difference. For example, dictionaries are divided
\r
28 * into different sections by base character.
\r
29 * <li>SECONDARY strength: Accents in the characters are considered secondary
\r
30 * differences (for example, "as" &lt; "às" &lt; "at"). Other
\r
32 * between letters can also be considered secondary differences, depending
\r
33 * on the language. A secondary difference is ignored when there is a
\r
34 * primary difference anywhere in the strings.
\r
35 * <li>TERTIARY strength: Upper and lower case differences in characters are
\r
36 * distinguished at tertiary strength (for example, "ao" &lt; "Ao" &lt;
\r
37 * "aò"). In addition, a variant of a letter differs from the base
\r
38 * form on the tertiary strength (such as "A" and "Ⓐ"). Another
\r
40 * difference between large and small Kana. A tertiary difference is ignored
\r
41 * when there is a primary or secondary difference anywhere in the strings.
\r
42 * <li>QUATERNARY strength: When punctuation is ignored
\r
43 * <a href="http://www.icu-project.org/userguide/Collate_Concepts.html#Ignoring_Punctuation">
\r
44 * (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
\r
45 * strength, an additional strength level can
\r
46 * be used to distinguish words with and without punctuation (for example,
\r
47 * "ab" &lt; "a-b" &lt; "aB").
\r
48 * This difference is ignored when there is a PRIMARY, SECONDARY or TERTIARY
\r
49 * difference. The QUATERNARY strength should only be used if ignoring
\r
50 * punctuation is required.
\r
51 * <li>IDENTICAL strength:
\r
52 * When all other strengths are equal, the IDENTICAL strength is used as a
\r
53 * tiebreaker. The Unicode code point values of the NFD form of each string
\r
54 * are compared, just in case there is no difference.
\r
55 * For example, Hebrew cantillation marks are only distinguished at this
\r
56 * strength. This strength should be used sparingly, as only code point
\r
57 * value differences between two strings are an extremely rare occurrence.
\r
58 * Using this strength substantially decreases the performance for both
\r
59 * comparison and collation key generation APIs. This strength also
\r
60 * increases the size of the collation key.
\r
63 * Unlike the JDK, ICU4J's Collator deals only with 2 decomposition modes,
\r
64 * the canonical decomposition mode and one that does not use any decomposition.
\r
65 * The compatibility decomposition mode, java.text.Collator.FULL_DECOMPOSITION
\r
66 * is not supported here. If the canonical
\r
67 * decomposition mode is set, the Collator handles un-normalized text properly,
\r
68 * producing the same results as if the text were normalized in NFD. If
\r
69 * canonical decomposition is turned off, it is the user's responsibility to
\r
70 * ensure that all text is already in the appropriate form before performing
\r
71 * a comparison or before getting a CollationKey.</p>
\r
73 * <p>For more information about the collation service see the
\r
74 * <a href="http://www.icu-project.org/userguide/Collate_Intro.html">users
\r
77 * <p>Examples of use
\r
79 * // Get the Collator for US English and set its strength to PRIMARY
\r
80 * Collator usCollator = Collator.getInstance(Locale.US);
\r
81 * usCollator.setStrength(Collator.PRIMARY);
\r
82 * if (usCollator.compare("abc", "ABC") == 0) {
\r
83 * System.out.println("Strings are equivalent");
\r
86 * The following example shows how to compare two strings using the
\r
87 * Collator for the default locale.
\r
89 * // Compare two strings in the default locale
\r
90 * Collator myCollator = Collator.getInstance();
\r
91 * myCollator.setDecomposition(NO_DECOMPOSITION);
\r
92 * if (myCollator.compare("à\u0325", "a\u0325̀") != 0) {
\r
93 * System.out.println("à\u0325 is not equals to a\u0325̀ without decomposition");
\r
94 * myCollator.setDecomposition(CANONICAL_DECOMPOSITION);
\r
95 * if (myCollator.compare("à\u0325", "a\u0325̀") != 0) {
\r
96 * System.out.println("Error: à\u0325 should be equals to a\u0325̀ with decomposition");
\r
99 * System.out.println("à\u0325 is equals to a\u0325̀ with decomposition");
\r
103 * System.out.println("Error: à\u0325 should be not equals to a\u0325̀ without decomposition");
\r
107 * @see CollationKey
\r
108 * @author Syn Wee Quek
\r
111 public class Collator implements Comparator, Cloneable
\r
116 private final java.text.Collator collator;
\r
private Collator(java.text.Collator delegate) {
    // Keep the JDK collator that the instance methods of this wrapper forward to.
    collator = delegate;
}
\r
126 * Create a collator with a null delegate.
\r
127 * For use by possible subclassers. This is present since
\r
128 * the original Collator is abstract, and so, in theory
\r
129 * subclassable. All member APIs must be overridden.
\r
protected Collator() {
    // No JDK delegate is installed on this path; a subclass built this way
    // has to override every member API itself.
    collator = null;
}
\r
135 // public data members ---------------------------------------------------
\r
138 * Strongest collator strength value. Typically used to denote differences
\r
139 * between base characters. See class documentation for more explanation.
\r
140 * @see #setStrength
\r
141 * @see #getStrength
\r
144 public final static int PRIMARY = java.text.Collator.PRIMARY;
\r
147 * Second level collator strength value.
\r
148 * Accents in the characters are considered secondary differences.
\r
149 * Other differences between letters can also be considered secondary
\r
150 * differences, depending on the language.
\r
151 * See class documentation for more explanation.
\r
152 * @see #setStrength
\r
153 * @see #getStrength
\r
156 public final static int SECONDARY = java.text.Collator.SECONDARY;
\r
159 * Third level collator strength value.
\r
160 * Upper and lower case differences in characters are distinguished at this
\r
161 * strength level. In addition, a variant of a letter differs from the base
\r
162 * form on the tertiary level.
\r
163 * See class documentation for more explanation.
\r
164 * @see #setStrength
\r
165 * @see #getStrength
\r
168 public final static int TERTIARY = java.text.Collator.TERTIARY;
\r
171 * Fourth level collator strength value.
\r
172 * When punctuation is ignored
\r
173 * <a href="http://www.icu-project.org/userguide/Collate_Concepts.html#Ignoring_Punctuation">
\r
174 * (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY
\r
175 * strength, an additional strength level can
\r
176 * be used to distinguish words with and without punctuation.
\r
177 * See class documentation for more explanation.
\r
178 * @see #setStrength
\r
179 * @see #getStrength
\r
182 public final static int QUATERNARY = java.text.Collator.IDENTICAL;
\r
186 * Smallest Collator strength value. When all other strengths are equal,
\r
187 * the IDENTICAL strength is used as a tiebreaker. The Unicode code point
\r
188 * values of the NFD form of each string are compared, just in case there
\r
189 * is no difference.
\r
190 * See class documentation for more explanation.
\r
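193 * Note this value is different from JDK's. NOTE(review): it is defined as java.text.Collator.FULL_DECOMPOSITION, which has the same numeric value as java.text.Collator.TERTIARY in the JDK -- confirm that setStrength(IDENTICAL) does what callers expect.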
\r
197 public final static int IDENTICAL = java.text.Collator.FULL_DECOMPOSITION;
\r
200 * This is for backwards compatibility with Java APIs only. It
\r
201 * should not be used, IDENTICAL should be used instead. ICU's
\r
202 * collation does not support Java's FULL_DECOMPOSITION mode.
\r
204 * @deprecated Backwards compatibility with Java only.
\r
206 public final static int FULL_DECOMPOSITION = java.text.Collator.FULL_DECOMPOSITION;
\r
209 * <p>Decomposition mode value. With NO_DECOMPOSITION set, Strings
\r
210 * will not be decomposed for collation. This is the default
\r
211 * decomposition setting unless otherwise specified by the locale
\r
212 * used to create the Collator.</p>
\r
214 * <p><strong>Note</strong> this value is different from the JDK's.</p>
\r
215 * @see #CANONICAL_DECOMPOSITION
\r
216 * @see #getDecomposition
\r
217 * @see #setDecomposition
\r
220 public final static int NO_DECOMPOSITION = java.text.Collator.NO_DECOMPOSITION;
\r
223 * <p>Decomposition mode value. With CANONICAL_DECOMPOSITION set,
\r
224 * characters that are canonical variants according to the Unicode standard
\r
225 * will be decomposed for collation.</p>
\r
227 * <p>CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
\r
228 * described in <a href="http://www.unicode.org/unicode/reports/tr15/">
\r
229 * Unicode Technical Report #15</a>.
\r
231 * @see #NO_DECOMPOSITION
\r
232 * @see #getDecomposition
\r
233 * @see #setDecomposition
\r
236 public final static int CANONICAL_DECOMPOSITION = java.text.Collator.CANONICAL_DECOMPOSITION;
\r
238 // public methods --------------------------------------------------------
\r
240 // public setters --------------------------------------------------------
\r
243 * <p>Sets this Collator's strength property. The strength property
\r
244 * determines the minimum level of difference considered significant
\r
245 * during comparison.</p>
\r
247 * <p>The default strength for the Collator is TERTIARY, unless specified
\r
248 * otherwise by the locale used to create the Collator.</p>
\r
250 * <p>See the Collator class description for an example of use.</p>
\r
251 * @param newStrength the new strength value.
\r
252 * @see #getStrength
\r
258 * @exception IllegalArgumentException if the new strength value is not one
\r
259 * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
\r
262 public void setStrength(int newStrength) {
\r
263 collator.setStrength(newStrength);
\r
267 * <p>Set the decomposition mode of this Collator. Setting this
\r
268 * decomposition property with CANONICAL_DECOMPOSITION allows the
\r
269 * Collator to handle un-normalized text properly, producing the
\r
270 * same results as if the text were normalized. If
\r
271 * NO_DECOMPOSITION is set, it is the user's responsibility to
\r
272 * ensure that all text is already in the appropriate form before
\r
273 * a comparison or before getting a CollationKey. Adjusting
\r
274 * decomposition mode allows the user to select between faster and
\r
275 * more complete collation behavior.</p>
\r
277 * <p>Since a great many of the world's languages do not require
\r
278 * text normalization, most locales set NO_DECOMPOSITION as the
\r
279 * default decomposition mode.</p>
\r
281 * <p>The default decomposition mode for the Collator is
\r
282 * NO_DECOMPOSITION, unless specified otherwise by the locale used
\r
283 * to create the Collator.</p>
\r
285 * <p>See getDecomposition for a description of decomposition
\r
288 * @param decomposition the new decomposition mode
\r
289 * @see #getDecomposition
\r
290 * @see #NO_DECOMPOSITION
\r
291 * @see #CANONICAL_DECOMPOSITION
\r
292 * @exception IllegalArgumentException If the given value is not a valid
\r
293 * decomposition mode.
\r
296 public void setDecomposition(int decomposition) {
\r
297 collator.setDecomposition(decomposition);
\r
300 // public getters --------------------------------------------------------
\r
303 * Gets the Collator for the current default locale.
\r
304 * The default locale is determined by java.util.Locale.getDefault().
\r
305 * @return the Collator for the default locale (for example, en_US) if it
\r
306 * is created successfully. Otherwise if there is no Collator
\r
307 * associated with the current locale, the default UCA collator
\r
308 * will be returned.
\r
309 * @see java.util.Locale#getDefault()
\r
310 * @see #getInstance(Locale)
\r
313 public static final Collator getInstance() {
\r
314 return new Collator(java.text.Collator.getInstance());
\r
318 * Gets the Collator for the desired locale.
\r
319 * @param locale the desired locale.
\r
320 * @return Collator for the desired locale if it is created successfully.
\r
321 * Otherwise if there is no Collator
\r
322 * associated with the current locale, a default UCA collator will
\r
324 * @see java.util.Locale
\r
325 * @see java.util.ResourceBundle
\r
326 * @see #getInstance(Locale)
\r
327 * @see #getInstance()
\r
328 * @stable ICU 3.4.3
\r
330 public static final Collator getInstance(ULocale locale) {
\r
331 return getInstance(locale.toLocale());
\r
335 * Gets the Collator for the desired locale.
\r
336 * @param locale the desired locale.
\r
337 * @return Collator for the desired locale if it is created successfully.
\r
338 * Otherwise if there is no Collator
\r
339 * associated with the current locale, a default UCA collator will
\r
341 * @see java.util.Locale
\r
342 * @see java.util.ResourceBundle
\r
343 * @see #getInstance(ULocale)
\r
344 * @see #getInstance()
\r
347 public static final Collator getInstance(Locale locale) {
\r
348 return new Collator(java.text.Collator.getInstance(locale));
\r
352 * Get the set of locales, as Locale objects, for which collators
\r
353 * are installed. Note that Locale objects do not support RFC 3066.
\r
354 * @return the list of locales in which collators are installed.
\r
355 * This list includes any that have been registered, in addition to
\r
356 * those that are installed with ICU4J.
\r
359 public static Locale[] getAvailableLocales() {
\r
360 return java.text.Collator.getAvailableLocales();
\r
364 * Get the set of locales, as ULocale objects, for which collators
\r
365 * are installed. ULocale objects support RFC 3066.
\r
366 * @return the list of locales in which collators are installed.
\r
367 * This list includes any that have been registered, in addition to
\r
368 * those that are installed with ICU4J.
\r
369 * @stable ICU 3.4.3
\r
371 public static final ULocale[] getAvailableULocales() {
\r
372 Locale[] locales = java.text.Collator.getAvailableLocales();
\r
373 ULocale[] ulocales = new ULocale[locales.length];
\r
374 for (int i = 0; i < locales.length; ++i) {
\r
375 ulocales[i] = ULocale.forLocale(locales[i]);
\r
381 * Return an array of all possible keywords that are relevant to
\r
382 * collation. At this point, the only recognized keyword for this
\r
383 * service is "collation".
\r
384 * @return an array of valid collation keywords.
\r
385 * @see #getKeywordValues
\r
388 public static final String[] getKeywords() {
\r
389 return new String[0];
\r
393 * Given a keyword, return an array of all values for
\r
394 * that keyword that are currently in use.
\r
395 * @param keyword one of the keywords returned by getKeywords.
\r
396 * @see #getKeywords
\r
399 public static final String[] getKeywordValues(String keyword) {
\r
400 return new String[0];
\r
404 * <p>Returns this Collator's strength property. The strength property
\r
405 * determines the minimum level of difference considered significant.
\r
408 * See the Collator class description for more details.
\r
410 * @return this Collator's current strength property.
\r
411 * @see #setStrength
\r
419 public int getStrength() {
\r
420 return collator.getStrength();
\r
425 * Get the decomposition mode of this Collator. Decomposition mode
\r
426 * determines how Unicode composed characters are handled.
\r
429 * See the Collator class description for more details.
\r
431 * @return the decomposition mode
\r
432 * @see #setDecomposition
\r
433 * @see #NO_DECOMPOSITION
\r
434 * @see #CANONICAL_DECOMPOSITION
\r
437 public int getDecomposition() {
\r
438 return collator.getDecomposition();
\r
443 * Compares the source text String to the target text String according to
\r
444 * this Collator's rules, strength and decomposition mode.
\r
445 * Returns an integer less than,
\r
446 * equal to or greater than zero depending on whether the source String is
\r
447 * less than, equal to or greater than the target String. See the Collator
\r
448 * class description for an example of use.
\r
450 * @param source the source String.
\r
451 * @param target the target String.
\r
452 * @return Returns an integer value. Value is less than zero if source is
\r
453 * less than target, value is zero if source and target are equal,
\r
454 * value is greater than zero if source is greater than target.
\r
455 * @see CollationKey
\r
456 * @see #getCollationKey
\r
457 * @exception NullPointerException thrown if either argument is null.
\r
458 * @exception ClassCastException thrown (by the java.text.Collator delegate) if either source or target is
\r
459 * not of the class String.
\r
462 public int compare(Object source, Object target) {
\r
463 return collator.compare(source, target);
\r
466 // public other methods -------------------------------------------------
\r
469 * Convenience method for comparing the equality of two text Strings using
\r
470 * this Collator's rules, strength and decomposition mode.
\r
471 * @param source the source string to be compared.
\r
472 * @param target the target string to be compared.
\r
473 * @return true if the strings are equal according to the collation
\r
474 * rules, otherwise false.
\r
476 * @exception NullPointerException thrown if either argument is null.
\r
479 public boolean equals(String source, String target) {
\r
480 return (compare(source, target) == 0);
\r
485 * Compares the source text String to the target text String according to
\r
486 * this Collator's rules, strength and decomposition mode.
\r
487 * Returns an integer less than,
\r
488 * equal to or greater than zero depending on whether the source String is
\r
489 * less than, equal to or greater than the target String. See the Collator
\r
490 * class description for an example of use.
\r
492 * @param source the source String.
\r
493 * @param target the target String.
\r
494 * @return Returns an integer value. Value is less than zero if source is
\r
495 * less than target, value is zero if source and target are equal,
\r
496 * value is greater than zero if source is greater than target.
\r
497 * @see CollationKey
\r
498 * @see #getCollationKey
\r
499 * @exception NullPointerException thrown if either argument is null.
\r
502 public int compare(String source, String target) {
\r
503 return collator.compare(source, target);
\r
508 * Transforms the String into a CollationKey suitable for efficient
\r
509 * repeated comparison. The resulting key depends on the collator's
\r
510 * rules, strength and decomposition mode.
\r
512 * <p>See the CollationKey class documentation for more information.</p>
\r
513 * @param source the string to be transformed into a CollationKey.
\r
514 * @return the CollationKey for the given String based on this Collator's
\r
515 * collation rules. If the source String is null, a null
\r
516 * CollationKey is returned.
\r
517 * @see CollationKey
\r
518 * @see #compare(String, String)
\r
521 public CollationKey getCollationKey(String source) {
\r
522 return new CollationKey(collator.getCollationKey(source));
\r
526 * Return a string suitable for debugging.
\r
527 * @return a string suitable for debugging
\r
528 * @stable ICU 3.4.3
\r
530 public String toString() {
\r
531 return collator.toString();
\r
535 * Clone the collator.
\r
536 * @return a clone of this collator.
\r
539 public Object clone() throws CloneNotSupportedException {
\r
540 return new Collator((java.text.Collator)collator.clone());
\r
544 * Return true if rhs is a Collator and compares the same as this.
\r
545 * @return true if rhs equals this
\r
546 * @stable ICU 3.4.3
\r
548 public boolean equals(Object rhs) {
\r
550 return collator.equals(((Collator)rhs).collator);
\r
552 catch (Exception e) {
\r
558 * Return a hashCode.
\r
559 * @return a hashCode
\r
560 * @stable ICU 3.4.3
\r
562 public int hashCode() {
\r
563 return collator.hashCode();
\r