jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/eclipse/plugins/com.ibm.icu.base/src/com/ibm/icu/text/Collator.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 1996-2008, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 \r
   8 package com.ibm.icu.text;\r
   9 \r
  10 import java.util.Comparator;\r
  11 import java.util.Locale;\r
  12 \r
  13 import com.ibm.icu.util.ULocale;\r
  14 \r
  15 /**\r
  16 * <p>Collator performs locale-sensitive string comparison.</p>\r
  17 *\r
  18 * <p>Following the <a href=http://www.unicode.org>Unicode\r
  19 * Consortium</a>'s specifications for the\r
  20 * <a href="http://www.unicode.org/unicode/reports/tr10/"> Unicode Collation\r
  21 * Algorithm (UCA)</a>, there are 5 different levels of strength used\r
  22 * in comparisons:\r
  23 *\r
  24 * <ul>\r
  25 * <li>PRIMARY strength: Typically, this is used to denote differences between\r
  26 *     base characters (for example, "a" &lt; "b").\r
  27 *     It is the strongest difference. For example, dictionaries are divided\r
  28 *     into different sections by base character.\r
  29 * <li>SECONDARY strength: Accents in the characters are considered secondary\r
  30 *     differences (for example, "as" &lt; "&agrave;s" &lt; "at"). Other\r
  31 *     differences\r
  32 *     between letters can also be considered secondary differences, depending\r
  33 *     on the language. A secondary difference is ignored when there is a\r
  34 *     primary difference anywhere in the strings.\r
  35 * <li>TERTIARY strength: Upper and lower case differences in characters are\r
  36 *     distinguished at tertiary strength (for example, "ao" &lt; "Ao" &lt;\r
  37 *     "a&ograve;"). In addition, a variant of a letter differs from the base\r
  38 *     form on the tertiary strength (such as "A" and "&#9398;"). Another\r
  39 *     example is the\r
  40 *     difference between large and small Kana. A tertiary difference is ignored\r
  41 *     when there is a primary or secondary difference anywhere in the strings.\r
  42 * <li>QUATERNARY strength: When punctuation is ignored\r
  43 *     <a href="http://www.icu-project.org/userguide/Collate_Concepts.html#Ignoring_Punctuation">\r
  44 *     (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY\r
  45 *     strength, an additional strength level can\r
  46 *     be used to distinguish words with and without punctuation (for example,\r
  47 *     "ab" &lt; "a-b" &lt; "aB").\r
  48 *     This difference is ignored when there is a PRIMARY, SECONDARY or TERTIARY\r
  49 *     difference. The QUATERNARY strength should only be used if ignoring\r
  50 *     punctuation is required.\r
  51 * <li>IDENTICAL strength:\r
  52 *     When all other strengths are equal, the IDENTICAL strength is used as a\r
  53 *     tiebreaker. The Unicode code point values of the NFD form of each string\r
  54 *     are compared, just in case there is no difference.\r
  55 *     For example, Hebrew cantellation marks are only distinguished at this\r
  56 *     strength. This strength should be used sparingly, as only code point\r
  57 *     value differences between two strings is an extremely rare occurrence.\r
  58 *     Using this strength substantially decreases the performance for both\r
  59 *     comparison and collation key generation APIs. This strength also\r
  60 *     increases the size of the collation key.\r
  61 * </ul>\r
  62 *\r
  63 * Unlike the JDK, ICU4J's Collator deals only with 2 decomposition modes,\r
  64 * the canonical decomposition mode and one that does not use any decomposition.\r
  65 * The compatibility decomposition mode, java.text.Collator.FULL_DECOMPOSITION\r
  66 * is not supported here. If the canonical\r
  67 * decomposition mode is set, the Collator handles un-normalized text properly,\r
  68 * producing the same results as if the text were normalized in NFD. If\r
  69 * canonical decomposition is turned off, it is the user's responsibility to\r
  70 * ensure that all text is already in the appropriate form before performing\r
  71 * a comparison or before getting a CollationKey.</p>\r
  72 *\r
  73 * <p>For more information about the collation service see the\r
  74 * <a href="http://www.icu-project.org/userguide/Collate_Intro.html">users\r
  75 * guide</a>.</p>\r
  76 *\r
  77 * <p>Examples of use\r
  78 * <pre>\r
  79 * // Get the Collator for US English and set its strength to PRIMARY\r
  80 * Collator usCollator = Collator.getInstance(Locale.US);\r
  81 * usCollator.setStrength(Collator.PRIMARY);\r
  82 * if (usCollator.compare("abc", "ABC") == 0) {\r
  83 *     System.out.println("Strings are equivalent");\r
  84 * }\r
  85 *\r
  86 * The following example shows how to compare two strings using the\r
  87 * Collator for the default locale.\r
  88 *\r
  89 * // Compare two strings in the default locale\r
  90 * Collator myCollator = Collator.getInstance();\r
  91 * myCollator.setDecomposition(NO_DECOMPOSITION);\r
  92 * if (myCollator.compare("&agrave;&#92;u0325", "a&#92;u0325&#768;") != 0) {\r
  93 *     System.out.println("&agrave;&#92;u0325 is not equals to a&#92;u0325&#768; without decomposition");\r
  94 *     myCollator.setDecomposition(CANONICAL_DECOMPOSITION);\r
  95 *     if (myCollator.compare("&agrave;&#92;u0325", "a&#92;u0325&#768;") != 0) {\r
  96 *         System.out.println("Error: &agrave;&#92;u0325 should be equals to a&#92;u0325&#768; with decomposition");\r
  97 *     }\r
  98 *     else {\r
  99 *         System.out.println("&agrave;&#92;u0325 is equals to a&#92;u0325&#768; with decomposition");\r
 100 *     }\r
 101 * }\r
 102 * else {\r
 103 *     System.out.println("Error: &agrave;&#92;u0325 should be not equals to a&#92;u0325&#768; without decomposition");\r
 104 * }\r
 105 * </pre>\r
 106 * </p>\r
 107 * @see CollationKey\r
 108 * @author Syn Wee Quek\r
 109 * @stable ICU 2.8\r
 110 */\r
 111 public class Collator implements Comparator, Cloneable\r
 112 {\r
 113     /**\r
 114      * @internal\r
 115      */\r
 116     private final java.text.Collator collator;\r
 117 \r
 118     /**\r
 119      * @internal\r
 120      */\r
 121     private Collator(java.text.Collator delegate) {\r
 122         this.collator = delegate;\r
 123     }\r
 124 \r
 125     /**\r
 126      * Create a collator with a null delegate.\r
 127      * For use by possible subclassers.  This is present since\r
 128      * the original Collator is abstract, and so, in theory\r
 129      * subclassable.  All member APIs must be overridden.\r
 130      */\r
 131     protected Collator() {\r
 132         this.collator = null;\r
 133     }\r
 134 \r
 135     // public data members ---------------------------------------------------\r
 136 \r
 137     /**\r
 138      * Strongest collator strength value. Typically used to denote differences\r
 139      * between base characters. See class documentation for more explanation.\r
 140      * @see #setStrength\r
 141      * @see #getStrength\r
 142      * @stable ICU 2.8\r
 143      */\r
 144     public final static int PRIMARY = java.text.Collator.PRIMARY;\r
 145 \r
 146     /**\r
 147      * Second level collator strength value.\r
 148      * Accents in the characters are considered secondary differences.\r
 149      * Other differences between letters can also be considered secondary\r
 150      * differences, depending on the language.\r
 151      * See class documentation for more explanation.\r
 152      * @see #setStrength\r
 153      * @see #getStrength\r
 154      * @stable ICU 2.8\r
 155      */\r
 156     public final static int SECONDARY = java.text.Collator.SECONDARY;\r
 157 \r
 158     /**\r
 159      * Third level collator strength value.\r
 160      * Upper and lower case differences in characters are distinguished at this\r
 161      * strength level. In addition, a variant of a letter differs from the base\r
 162      * form on the tertiary level.\r
 163      * See class documentation for more explanation.\r
 164      * @see #setStrength\r
 165      * @see #getStrength\r
 166      * @stable ICU 2.8\r
 167      */\r
 168     public final static int TERTIARY = java.text.Collator.TERTIARY;\r
 169 \r
 170     /**\r
 171      * Fourth level collator strength value.\r
 172      * When punctuation is ignored\r
 173      * <a href="http://www.icu-project.org/userguide/Collate_Concepts.html#Ignoring_Punctuation">\r
 174      * (see Ignoring Punctuations in the user guide)</a> at PRIMARY to TERTIARY\r
 175      * strength, an additional strength level can\r
 176      * be used to distinguish words with and without punctuation.\r
 177      * See class documentation for more explanation.\r
 178      * @see #setStrength\r
 179      * @see #getStrength\r
 180      * @stable ICU 2.8\r
 181      */\r
 182     public final static int QUATERNARY = java.text.Collator.IDENTICAL;\r
 183 \r
 184     /**\r
 185      * <p>\r
 186      * Smallest Collator strength value. When all other strengths are equal,\r
 187      * the IDENTICAL strength is used as a tiebreaker. The Unicode code point\r
 188      * values of the NFD form of each string are compared, just in case there\r
 189      * is no difference.\r
 190      * See class documentation for more explanation.\r
 191      * </p>\r
 192      * <p>\r
 193      * Note this value is different from JDK's\r
 194      * </p>\r
 195      * @stable ICU 2.8\r
 196      */\r
 197     public final static int IDENTICAL = java.text.Collator.FULL_DECOMPOSITION;\r
 198 \r
 199     /**\r
 200      * This is for backwards compatibility with Java APIs only.  It\r
 201      * should not be used, IDENTICAL should be used instead.  ICU's\r
 202      * collation does not support Java's FULL_DECOMPOSITION mode.\r
 203      * @stable ICU 3.4\r
 204      * @deprecated Backwards compatibility with Java only.\r
 205      */\r
 206     public final static int FULL_DECOMPOSITION = java.text.Collator.FULL_DECOMPOSITION;\r
 207 \r
 208     /**\r
 209      * <p>Decomposition mode value. With NO_DECOMPOSITION set, Strings\r
 210      * will not be decomposed for collation. This is the default\r
 211      * decomposition setting unless otherwise specified by the locale\r
 212      * used to create the Collator.</p>\r
 213      *\r
 214      * <p><strong>Note</strong> this value is different from the JDK's.</p>\r
 215      * @see #CANONICAL_DECOMPOSITION\r
 216      * @see #getDecomposition\r
 217      * @see #setDecomposition\r
 218      * @stable ICU 2.8\r
 219      */\r
 220     public final static int NO_DECOMPOSITION = java.text.Collator.NO_DECOMPOSITION;\r
 221 \r
 222     /**\r
 223      * <p>Decomposition mode value. With CANONICAL_DECOMPOSITION set,\r
 224      * characters that are canonical variants according to the Unicode standard\r
 225      * will be decomposed for collation.</p>\r
 226      *\r
 227      * <p>CANONICAL_DECOMPOSITION corresponds to Normalization Form D as\r
 228      * described in <a href="http://www.unicode.org/unicode/reports/tr15/">\r
 229      * Unicode Technical Report #15</a>.\r
 230      * </p>\r
 231      * @see #NO_DECOMPOSITION\r
 232      * @see #getDecomposition\r
 233      * @see #setDecomposition\r
 234      * @stable ICU 2.8\r
 235      */\r
 236     public final static int CANONICAL_DECOMPOSITION = java.text.Collator.CANONICAL_DECOMPOSITION;\r
 237 \r
 238     // public methods --------------------------------------------------------\r
 239 \r
 240     // public setters --------------------------------------------------------\r
 241 \r
 242     /**\r
 243      * <p>Sets this Collator's strength property. The strength property\r
 244      * determines the minimum level of difference considered significant\r
 245      * during comparison.</p>\r
 246      *\r
 247      * <p>The default strength for the Collator is TERTIARY, unless specified\r
 248      * otherwise by the locale used to create the Collator.</p>\r
 249      *\r
 250      * <p>See the Collator class description for an example of use.</p>\r
 251      * @param newStrength the new strength value.\r
 252      * @see #getStrength\r
 253      * @see #PRIMARY\r
 254      * @see #SECONDARY\r
 255      * @see #TERTIARY\r
 256      * @see #QUATERNARY\r
 257      * @see #IDENTICAL\r
 258      * @exception IllegalArgumentException if the new strength value is not one\r
 259      *                of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.\r
 260      * @stable ICU 2.8\r
 261      */\r
 262     public void setStrength(int newStrength) {\r
 263         collator.setStrength(newStrength);\r
 264     }\r
 265 \r
 266     /**\r
 267      * <p>Set the decomposition mode of this Collator.  Setting this\r
 268      * decomposition property with CANONICAL_DECOMPOSITION allows the\r
 269      * Collator to handle un-normalized text properly, producing the\r
 270      * same results as if the text were normalized. If\r
 271      * NO_DECOMPOSITION is set, it is the user's responsibility to\r
 272      * insure that all text is already in the appropriate form before\r
 273      * a comparison or before getting a CollationKey. Adjusting\r
 274      * decomposition mode allows the user to select between faster and\r
 275      * more complete collation behavior.</p>\r
 276      *\r
 277      * <p>Since a great many of the world's languages do not require\r
 278      * text normalization, most locales set NO_DECOMPOSITION as the\r
 279      * default decomposition mode.</p>\r
 280      *\r
 281      * The default decompositon mode for the Collator is\r
 282      * NO_DECOMPOSITON, unless specified otherwise by the locale used\r
 283      * to create the Collator.</p>\r
 284      *\r
 285      * <p>See getDecomposition for a description of decomposition\r
 286      * mode.</p>\r
 287      *\r
 288      * @param decomposition the new decomposition mode\r
 289      * @see #getDecomposition\r
 290      * @see #NO_DECOMPOSITION\r
 291      * @see #CANONICAL_DECOMPOSITION\r
 292      * @exception IllegalArgumentException If the given value is not a valid\r
 293      *            decomposition mode.\r
 294      * @stable ICU 2.8\r
 295      */\r
 296     public void setDecomposition(int decomposition) {\r
 297         collator.setDecomposition(decomposition);\r
 298     }\r
 299 \r
 300     // public getters --------------------------------------------------------\r
 301 \r
 302     /**\r
 303      * Gets the Collator for the current default locale.\r
 304      * The default locale is determined by java.util.Locale.getDefault().\r
 305      * @return the Collator for the default locale (for example, en_US) if it\r
 306      *         is created successfully. Otherwise if there is no Collator\r
 307      *         associated with the current locale, the default UCA collator\r
 308      *         will be returned.\r
 309      * @see java.util.Locale#getDefault()\r
 310      * @see #getInstance(Locale)\r
 311      * @stable ICU 2.8\r
 312      */\r
 313     public static final Collator getInstance() {\r
 314         return new Collator(java.text.Collator.getInstance());\r
 315     }\r
 316 \r
 317      /**\r
 318      * Gets the Collator for the desired locale.\r
 319      * @param locale the desired locale.\r
 320      * @return Collator for the desired locale if it is created successfully.\r
 321      *         Otherwise if there is no Collator\r
 322      *         associated with the current locale, a default UCA collator will\r
 323      *         be returned.\r
 324      * @see java.util.Locale\r
 325      * @see java.util.ResourceBundle\r
 326      * @see #getInstance(Locale)\r
 327      * @see #getInstance()\r
 328      * @stable ICU 3.4.3\r
 329      */\r
 330     public static final Collator getInstance(ULocale locale) {\r
 331         return getInstance(locale.toLocale());\r
 332     }\r
 333 \r
 334     /**\r
 335      * Gets the Collator for the desired locale.\r
 336      * @param locale the desired locale.\r
 337      * @return Collator for the desired locale if it is created successfully.\r
 338      *         Otherwise if there is no Collator\r
 339      *         associated with the current locale, a default UCA collator will\r
 340      *         be returned.\r
 341      * @see java.util.Locale\r
 342      * @see java.util.ResourceBundle\r
 343      * @see #getInstance(ULocale)\r
 344      * @see #getInstance()\r
 345      * @stable ICU 2.8\r
 346      */\r
 347     public static final Collator getInstance(Locale locale) {\r
 348         return new Collator(java.text.Collator.getInstance(locale));\r
 349     }\r
 350 \r
 351      /**\r
 352      * Get the set of locales, as Locale objects, for which collators\r
 353      * are installed.  Note that Locale objects do not support RFC 3066.\r
 354      * @return the list of locales in which collators are installed.\r
 355      * This list includes any that have been registered, in addition to\r
 356      * those that are installed with ICU4J.\r
 357      * @stable ICU 2.4\r
 358      */\r
 359     public static Locale[] getAvailableLocales() {\r
 360         return java.text.Collator.getAvailableLocales();\r
 361     }\r
 362 \r
 363     /**\r
 364      * Get the set of locales, as ULocale objects, for which collators\r
 365      * are installed.  ULocale objects support RFC 3066.\r
 366      * @return the list of locales in which collators are installed.\r
 367      * This list includes any that have been registered, in addition to\r
 368      * those that are installed with ICU4J.\r
 369      * @stable ICU 3.4.3\r
 370      */\r
 371     public static final ULocale[] getAvailableULocales() {\r
 372         Locale[] locales = java.text.Collator.getAvailableLocales();\r
 373         ULocale[] ulocales = new ULocale[locales.length];\r
 374         for (int i = 0; i < locales.length; ++i) {\r
 375             ulocales[i] = ULocale.forLocale(locales[i]);\r
 376         }\r
 377         return ulocales;\r
 378     }\r
 379  \r
 380     /**\r
 381      * Return an array of all possible keywords that are relevant to\r
 382      * collation. At this point, the only recognized keyword for this\r
 383      * service is "collation".\r
 384      * @return an array of valid collation keywords.\r
 385      * @see #getKeywordValues\r
 386      * @stable ICU 3.0\r
 387      */\r
 388     public static final String[] getKeywords() {\r
 389         return new String[0];\r
 390     }\r
 391     \r
 392     /**\r
 393      * Given a keyword, return an array of all values for\r
 394      * that keyword that are currently in use.\r
 395      * @param keyword one of the keywords returned by getKeywords.\r
 396      * @see #getKeywords\r
 397      * @stable ICU 3.0\r
 398      */\r
 399     public static final String[] getKeywordValues(String keyword) {\r
 400         return new String[0];\r
 401     }\r
 402 \r
 403     /**\r
 404      * <p>Returns this Collator's strength property. The strength property\r
 405      * determines the minimum level of difference considered significant.\r
 406      * </p>\r
 407      * <p>\r
 408      * See the Collator class description for more details.\r
 409      * </p>\r
 410      * @return this Collator's current strength property.\r
 411      * @see #setStrength\r
 412      * @see #PRIMARY\r
 413      * @see #SECONDARY\r
 414      * @see #TERTIARY\r
 415      * @see #QUATERNARY\r
 416      * @see #IDENTICAL\r
 417      * @stable ICU 2.8\r
 418      */\r
 419     public int getStrength() {\r
 420         return collator.getStrength();\r
 421     }\r
 422 \r
 423     /**\r
 424      * <p>\r
 425      * Get the decomposition mode of this Collator. Decomposition mode\r
 426      * determines how Unicode composed characters are handled.\r
 427      * </p>\r
 428      * <p>\r
 429      * See the Collator class description for more details.\r
 430      * </p>\r
 431      * @return the decomposition mode\r
 432      * @see #setDecomposition\r
 433      * @see #NO_DECOMPOSITION\r
 434      * @see #CANONICAL_DECOMPOSITION\r
 435      * @stable ICU 2.8\r
 436      */\r
 437     public int getDecomposition() {\r
 438         return collator.getDecomposition();\r
 439     }\r
 440 \r
 441     /**\r
 442      * <p>\r
 443      * Compares the source text String to the target text String according to\r
 444      * this Collator's rules, strength and decomposition mode.\r
 445      * Returns an integer less than,\r
 446      * equal to or greater than zero depending on whether the source String is\r
 447      * less than, equal to or greater than the target String. See the Collator\r
 448      * class description for an example of use.\r
 449      * </p>\r
 450      * @param source the source String.\r
 451      * @param target the target String.\r
 452      * @return Returns an integer value. Value is less than zero if source is\r
 453      *         less than target, value is zero if source and target are equal,\r
 454      *         value is greater than zero if source is greater than target.\r
 455      * @see CollationKey\r
 456      * @see #getCollationKey\r
 457      * @exception NullPointerException thrown if either arguments is null.\r
 458      *            IllegalArgumentException thrown if either source or target is\r
 459      *            not of the class String.\r
 460      * @stable ICU 2.8\r
 461      */\r
 462     public int compare(Object source, Object target) {\r
 463         return collator.compare(source, target);\r
 464     }\r
 465 \r
 466     // public other methods -------------------------------------------------\r
 467 \r
 468     /**\r
 469      * Convenience method for comparing the equality of two text Strings using\r
 470      * this Collator's rules, strength and decomposition mode.\r
 471      * @param source the source string to be compared.\r
 472      * @param target the target string to be compared.\r
 473      * @return true if the strings are equal according to the collation\r
 474      *         rules, otherwise false.\r
 475      * @see #compare\r
 476      * @exception NullPointerException thrown if either arguments is null.\r
 477      * @stable ICU 2.8\r
 478      */\r
 479     public boolean equals(String source, String target) {\r
 480         return (compare(source, target) == 0);\r
 481     }\r
 482 \r
 483     /**\r
 484      * <p>\r
 485      * Compares the source text String to the target text String according to\r
 486      * this Collator's rules, strength and decomposition mode.\r
 487      * Returns an integer less than,\r
 488      * equal to or greater than zero depending on whether the source String is\r
 489      * less than, equal to or greater than the target String. See the Collator\r
 490      * class description for an example of use.\r
 491      * </p>\r
 492      * @param source the source String.\r
 493      * @param target the target String.\r
 494      * @return Returns an integer value. Value is less than zero if source is\r
 495      *         less than target, value is zero if source and target are equal,\r
 496      *         value is greater than zero if source is greater than target.\r
 497      * @see CollationKey\r
 498      * @see #getCollationKey\r
 499      * @exception NullPointerException thrown if either arguments is null.\r
 500      * @stable ICU 2.8\r
 501      */\r
 502     public int compare(String source, String target) {\r
 503         return collator.compare(source, target);\r
 504     }\r
 505 \r
 506     /**\r
 507      * <p>\r
 508      * Transforms the String into a CollationKey suitable for efficient\r
 509      * repeated comparison.  The resulting key depends on the collator's\r
 510      * rules, strength and decomposition mode.\r
 511      * </p>\r
 512      * <p>See the CollationKey class documentation for more information.</p>\r
 513      * @param source the string to be transformed into a CollationKey.\r
 514      * @return the CollationKey for the given String based on this Collator's\r
 515      *         collation rules. If the source String is null, a null\r
 516      *         CollationKey is returned.\r
 517      * @see CollationKey\r
 518      * @see #compare(String, String)\r
 519      * @stable ICU 2.8\r
 520      */\r
 521     public CollationKey getCollationKey(String source) {\r
 522         return new CollationKey(collator.getCollationKey(source));\r
 523     }\r
 524     \r
 525     /**\r
 526      * Return a string suitable for debugging.\r
 527      * @return a string suitable for debugging\r
 528      * @stable ICU 3.4.3\r
 529      */\r
 530     public String toString() {\r
 531         return collator.toString();\r
 532     }\r
 533 \r
 534    /**\r
 535      * Clone the collator.\r
 536      * @return a clone of this collator.\r
 537      * @stable ICU 2.6\r
 538      */\r
 539     public Object clone() throws CloneNotSupportedException {\r
 540         return new Collator((java.text.Collator)collator.clone());\r
 541     }\r
 542 \r
 543     /**\r
 544      * Return true if rhs is a Collator and compares the same as this.\r
 545      * @return true if rhs equals this\r
 546      * @stable ICU 3.4.3\r
 547      */\r
 548     public boolean equals(Object rhs) {\r
 549         try {\r
 550             return collator.equals(((Collator)rhs).collator);\r
 551         }\r
 552         catch (Exception e) {\r
 553             return false;\r
 554         }\r
 555     }\r
 556 \r
 557     /**\r
 558      * Return a hashCode.\r
 559      * @return a hashCode\r
 560      * @stable ICU 3.4.3\r
 561      */\r
 562     public int hashCode() {\r
 563         return collator.hashCode();\r
 564     }\r
 565 }\r