2 *******************************************************************************
\r
3 * Copyright (C) 2000-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
8 import java.io.IOException;
\r
9 import java.nio.CharBuffer;
\r
10 import java.text.CharacterIterator;
\r
12 import com.ibm.icu.impl.Norm2AllModes;
\r
13 import com.ibm.icu.impl.Normalizer2Impl;
\r
14 import com.ibm.icu.impl.UCaseProps;
\r
15 import com.ibm.icu.lang.UCharacter;
\r
18 * Unicode Normalization
\r
20 * <h2>Unicode normalization API</h2>
\r
22 * <code>normalize</code> transforms Unicode text into an equivalent composed or
\r
23 * decomposed form, allowing for easier sorting and searching of text.
\r
24 * <code>normalize</code> supports the standard normalization forms described in
\r
25 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
\r
26 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
\r
28 * Characters with accents or other adornments can be encoded in
\r
29 * several different ways in Unicode. For example, take the character A-acute.
\r
30 * In Unicode, this can be encoded as a single character (the
\r
34 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
\r
37 * or as two separate characters (the "decomposed" form):
\r
40 * 0041 LATIN CAPITAL LETTER A
\r
41 * 0301 COMBINING ACUTE ACCENT
\r
44 * To a user of your program, however, both of these sequences should be
\r
45 * treated as the same "user-level" character "A with acute accent". When you
\r
46 * are searching or comparing text, you must ensure that these two sequences are
\r
47 * treated equivalently. In addition, you must handle characters with more than
\r
48 * one accent. Sometimes the order of a character's combining accents is
\r
49 * significant, while in other cases accent sequences in different orders are
\r
50 * really equivalent.
\r
52 * Similarly, the string "ffi" can be encoded as three separate letters:
\r
55 * 0066 LATIN SMALL LETTER F
\r
56 * 0066 LATIN SMALL LETTER F
\r
57 * 0069 LATIN SMALL LETTER I
\r
60 * or as the single character
\r
63 * FB03 LATIN SMALL LIGATURE FFI
\r
66 * The ffi ligature is not a distinct semantic character, and strictly speaking
\r
67 * it shouldn't be in Unicode at all, but it was included for compatibility
\r
68 * with existing character sets that already provided it. The Unicode standard
\r
69 * identifies such characters by giving them "compatibility" decompositions
\r
70 * into the corresponding semantic characters. When sorting and searching, you
\r
71 * will often want to use these mappings.
\r
73 * <code>normalize</code> helps solve these problems by transforming text into
\r
74 * the canonical composed and decomposed forms as shown in the first example
\r
75 * above. In addition, you can have it perform compatibility decompositions so
\r
76 * that you can treat compatibility characters the same as their equivalents.
\r
77 * Finally, <code>normalize</code> rearranges accents into the proper canonical
\r
78 * order, so that you do not have to worry about accent rearrangement on your
\r
81 * Form FCD, "Fast C or D", is also designed for collation.
\r
82 * It allows one to work on strings that are not necessarily normalized
\r
83 * with an algorithm (like in collation) that works under "canonical closure",
\r
84 * i.e., it treats precomposed characters and their decomposed equivalents the
\r
87 * It is not a normalization form because it does not provide for uniqueness of
\r
88 * representation. Multiple strings may be canonically equivalent (their NFDs
\r
89 * are identical) and may all conform to FCD without being identical themselves.
\r
91 * The form is defined such that the "raw decomposition", the recursive
\r
92 * canonical decomposition of each character, results in a string that is
\r
93 * canonically ordered. This means that precomposed characters are allowed for
\r
94 * as long as their decompositions do not need canonical reordering.
\r
96 * Its advantage for a process like collation is that all NFD and most NFC texts
\r
97 * - and many unnormalized texts - already conform to FCD and do not need to be
\r
98 * normalized (NFD) for such a process. The FCD quick check will return YES for
\r
99 * most strings in practice.
\r
101 * normalize(FCD) may be implemented with NFD.
\r
103 * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
\r
104 * http://www.unicode.org/notes/tn5/#FCD
\r
106 * ICU collation performs either NFD or FCD normalization automatically if
\r
107 * normalization is turned on for the collator object. Beyond collation and
\r
108 * string search, normalized strings may be useful for string equivalence
\r
109 * comparisons, transliteration/transcription, unique representations, etc.
\r
111 * The W3C generally recommends exchanging texts in NFC.
\r
112 * Note also that most legacy character encodings use only precomposed forms and
\r
113 * often do not encode any combining marks by themselves. For conversion to such
\r
114 * character encodings the Unicode text needs to be normalized to NFC.
\r
115 * For more usage examples, see the Unicode Standard Annex.
\r
117 * Note: The Normalizer class also provides API for iterative normalization.
\r
118 * While the setIndex() and getIndex() refer to indices in the
\r
119 * underlying Unicode input text, the next() and previous() methods
\r
120 * iterate through characters in the normalized output.
\r
121 * This means that there is not necessarily a one-to-one correspondence
\r
122 * between characters returned by next() and previous() and the indices
\r
123 * passed to and returned from setIndex() and getIndex().
\r
124 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
\r
public final class Normalizer implements Cloneable {
    // The input text and our position in it
    private UCharacterIterator text;
    // Delegate that performs the actual normalization for the selected Mode/options.
    private Normalizer2 norm2;
    // Normalization options (e.g. UNICODE_3_2) as passed to the constructor.
    private int options;

    // The normalization buffer is the result of normalization
    // of the source in [currentIndex..nextIndex[ .
    private int currentIndex;
    private int nextIndex;

    // A buffer for holding intermediate results
    private StringBuilder buffer;
    // Current iteration position within 'buffer'.
    private int bufferPos;
\r
144 // Helper classes to defer loading of normalization data.
\r
145 private static final class ModeImpl {
\r
146 private ModeImpl(Normalizer2 n2) {
\r
149 private final Normalizer2 normalizer2;
\r
151 private static final class NFDModeImpl {
\r
152 private static final ModeImpl INSTANCE =
\r
153 new ModeImpl(Norm2AllModes.getNFCInstance().decomp);
\r
155 private static final class NFKDModeImpl {
\r
156 private static final ModeImpl INSTANCE =
\r
157 new ModeImpl(Norm2AllModes.getNFKCInstance().decomp);
\r
159 private static final class NFCModeImpl {
\r
160 private static final ModeImpl INSTANCE =
\r
161 new ModeImpl(Norm2AllModes.getNFCInstance().comp);
\r
163 private static final class NFKCModeImpl {
\r
164 private static final ModeImpl INSTANCE =
\r
165 new ModeImpl(Norm2AllModes.getNFKCInstance().comp);
\r
167 private static final class FCDModeImpl {
\r
168 private static final ModeImpl INSTANCE =
\r
169 new ModeImpl(Norm2AllModes.getFCDNormalizer2());
\r
172 private static final class Unicode32 {
\r
173 private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
\r
175 private static final class NFD32ModeImpl {
\r
176 private static final ModeImpl INSTANCE =
\r
177 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().decomp,
\r
178 Unicode32.INSTANCE));
\r
180 private static final class NFKD32ModeImpl {
\r
181 private static final ModeImpl INSTANCE =
\r
182 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().decomp,
\r
183 Unicode32.INSTANCE));
\r
185 private static final class NFC32ModeImpl {
\r
186 private static final ModeImpl INSTANCE =
\r
187 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().comp,
\r
188 Unicode32.INSTANCE));
\r
190 private static final class NFKC32ModeImpl {
\r
191 private static final ModeImpl INSTANCE =
\r
192 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().comp,
\r
193 Unicode32.INSTANCE));
\r
195 private static final class FCD32ModeImpl {
\r
196 private static final ModeImpl INSTANCE =
\r
197 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
\r
198 Unicode32.INSTANCE));
\r
    /**
     * Options bit set value to select Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * At most one Unicode version can be selected at a time.
     */
    public static final int UNICODE_3_2=0x20;

    /**
     * Constant indicating that the end of the iteration has been reached.
     * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
     */
    public static final int DONE = UCharacterIterator.DONE;
\r
217 * Constants for normalization modes.
\r
219 * The Mode class is not intended for public subclassing.
\r
220 * Only the Mode constants provided by the Normalizer class should be used,
\r
221 * and any fields or methods should not be called or overridden by users.
\r
224 public static abstract class Mode {
\r
227 * @deprecated This API is ICU internal only.
\r
229 protected abstract Normalizer2 getNormalizer2(int options);
\r
232 private static final class NONEMode extends Mode {
\r
233 protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
\r
235 private static final class NFDMode extends Mode {
\r
236 protected Normalizer2 getNormalizer2(int options) {
\r
237 return (options&UNICODE_3_2) != 0 ?
\r
238 NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
\r
241 private static final class NFKDMode extends Mode {
\r
242 protected Normalizer2 getNormalizer2(int options) {
\r
243 return (options&UNICODE_3_2) != 0 ?
\r
244 NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
\r
247 private static final class NFCMode extends Mode {
\r
248 protected Normalizer2 getNormalizer2(int options) {
\r
249 return (options&UNICODE_3_2) != 0 ?
\r
250 NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
\r
253 private static final class NFKCMode extends Mode {
\r
254 protected Normalizer2 getNormalizer2(int options) {
\r
255 return (options&UNICODE_3_2) != 0 ?
\r
256 NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
\r
259 private static final class FCDMode extends Mode {
\r
260 protected Normalizer2 getNormalizer2(int options) {
\r
261 return (options&UNICODE_3_2) != 0 ?
\r
262 FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
\r
    /**
     * No decomposition/composition.
     */
    public static final Mode NONE = new NONEMode();

    /**
     * Canonical decomposition.
     */
    public static final Mode NFD = new NFDMode();

    /**
     * Compatibility decomposition.
     */
    public static final Mode NFKD = new NFKDMode();

    /**
     * Canonical decomposition followed by canonical composition.
     */
    public static final Mode NFC = new NFCMode();

    /**
     * Default normalization.
     */
    public static final Mode DEFAULT = NFC;

    /**
     * Compatibility decomposition followed by canonical composition.
     */
    public static final Mode NFKC =new NFKCMode();

    /**
     * "Fast C or D" form.
     */
    public static final Mode FCD = new FCDMode();
\r
    /**
     * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
     * and the static {@link #normalize normalize} method. This value tells
     * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
     * from the underlying String or CharacterIterator. If you have code which
     * requires raw text at some times and normalized text at others, you can
     * use <tt>NO_OP</tt> for the cases where you want raw text, rather
     * than having a separate code path that bypasses <tt>Normalizer</tt>
     * altogether.
     * @deprecated ICU 2.8. Use Normalizer.NONE
     */
    public static final Mode NO_OP = NONE;

    /**
     * Canonical decomposition followed by canonical composition. Used with the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be
     * performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>C</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFC
     */
    public static final Mode COMPOSE = NFC;

    /**
     * Compatibility decomposition followed by canonical composition.
     * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be
     * performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>KC</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFKC
     */
    public static final Mode COMPOSE_COMPAT = NFKC;

    /**
     * Canonical decomposition. This value is passed to the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>D</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFD
     */
    public static final Mode DECOMP = NFD;

    /**
     * Compatibility decomposition. This value is passed to the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>KD</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFKD
     */
    public static final Mode DECOMP_COMPAT = NFKD;
\r
    /**
     * Option to disable Hangul/Jamo composition and decomposition.
     * This option applies to Korean text,
     * which can be represented either in the Jamo alphabet or in Hangul
     * characters, which are really just two or three Jamo combined
     * into one visual glyph. Since Jamo takes up more storage space than
     * Hangul, applications that process only Hangul text may wish to turn
     * this option on when decomposing text.
     * <p>
     * The Unicode standard treats Hangul to Jamo conversion as a
     * canonical decomposition, so this option must be turned <b>off</b> if you
     * wish to transform strings into one of the standard
     * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
     * Unicode Normalization Forms</a>.
     * @deprecated ICU 2.8. This option is no longer supported.
     */
    public static final int IGNORE_HANGUL = 0x0001;
\r
417 * Result values for quickCheck().
\r
418 * For details see Unicode Technical Report 15.
\r
421 public static final class QuickCheckResult{
\r
422 //private int resultValue;
\r
423 private QuickCheckResult(int value) {
\r
424 //resultValue=value;
\r
428 * Indicates that string is not in the normalized format
\r
431 public static final QuickCheckResult NO = new QuickCheckResult(0);
\r
434 * Indicates that string is in the normalized format
\r
437 public static final QuickCheckResult YES = new QuickCheckResult(1);
\r
440 * Indicates it cannot be determined if string is in the normalized
\r
441 * format without further thorough checks.
\r
444 public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
\r
    /**
     * Option bit for compare:
     * Case sensitively compare the strings
     */
    public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT;

    /**
     * Option bit for compare:
     * Both input strings are assumed to fulfill FCD conditions.
     */
    public static final int INPUT_IS_FCD = 0x20000;

    /**
     * Option bit for compare:
     * Perform case-insensitive comparison.
     */
    public static final int COMPARE_IGNORE_CASE = 0x10000;

    /**
     * Option bit for compare:
     * Compare strings in code point order instead of code unit order.
     */
    public static final int COMPARE_CODE_POINT_ORDER = 0x8000;

    /**
     * Option value for case folding: exclude the mappings for dotted I
     * and dotless i marked with 'I' in CaseFolding.txt.
     */
    public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;

    /**
     * Lowest-order bit number of compare() options bits corresponding to
     * normalization options bits.
     * <p>
     * The options parameter for compare() uses most bits for
     * itself and for various comparison and folding flags.
     * The most significant bits, however, are shifted down and passed on
     * to the normalization implementation.
     * (That is, from compare(..., options, ...),
     * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
     * internal normalization functions.)
     */
    public static final int COMPARE_NORM_OPTIONS_SHIFT = 20;
\r
498 //-------------------------------------------------------------------------
\r
499 // Iterator constructors
\r
500 //-------------------------------------------------------------------------
\r
503 * Creates a new <tt>Normalizer</tt> object for iterating over the
\r
504 * normalized form of a given string.
\r
506 * The <tt>options</tt> parameter specifies which optional
\r
507 * <tt>Normalizer</tt> features are to be enabled for this object.
\r
509 * @param str The string to be normalized. The normalization
\r
510 * will start at the beginning of the string.
\r
512 * @param mode The normalization mode.
\r
514 * @param opt Any optional features to be enabled.
\r
515 * Currently the only available option is {@link #UNICODE_3_2}.
\r
516 * If you want the default behavior corresponding to one of the
\r
517 * standard Unicode Normalization Forms, use 0 for this argument.
\r
520 public Normalizer(String str, Mode mode, int opt) {
\r
521 this.text = UCharacterIterator.getInstance(str);
\r
524 norm2 = mode.getNormalizer2(opt);
\r
525 buffer = new StringBuilder();
\r
529 * Creates a new <tt>Normalizer</tt> object for iterating over the
\r
530 * normalized form of the given text.
\r
532 * @param iter The input text to be normalized. The normalization
\r
533 * will start at the beginning of the string.
\r
535 * @param mode The normalization mode.
\r
537 * @param opt Any optional features to be enabled.
\r
538 * Currently the only available option is {@link #UNICODE_3_2}.
\r
539 * If you want the default behavior corresponding to one of the
\r
540 * standard Unicode Normalization Forms, use 0 for this argument.
\r
543 public Normalizer(CharacterIterator iter, Mode mode, int opt) {
\r
544 this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
\r
546 this.options = opt;
\r
547 norm2 = mode.getNormalizer2(opt);
\r
548 buffer = new StringBuilder();
\r
552 * Creates a new <tt>Normalizer</tt> object for iterating over the
\r
553 * normalized form of the given text.
\r
555 * @param iter The input text to be normalized. The normalization
\r
556 * will start at the beginning of the string.
\r
558 * @param mode The normalization mode.
\r
559 * @param options The normalization options, ORed together (0 for no options).
\r
562 public Normalizer(UCharacterIterator iter, Mode mode, int options) {
\r
564 this.text = (UCharacterIterator)iter.clone();
\r
566 this.options = options;
\r
567 norm2 = mode.getNormalizer2(options);
\r
568 buffer = new StringBuilder();
\r
569 } catch (CloneNotSupportedException e) {
\r
570 throw new IllegalStateException(e.toString());
\r
575 * Clones this <tt>Normalizer</tt> object. All properties of this
\r
576 * object are duplicated in the new object, including the cloning of any
\r
577 * {@link CharacterIterator} that was passed in to the constructor
\r
578 * or to {@link #setText(CharacterIterator) setText}.
\r
579 * However, the text storage underlying
\r
580 * the <tt>CharacterIterator</tt> is not duplicated unless the
\r
581 * iterator's <tt>clone</tt> method does so.
\r
584 public Object clone() {
\r
586 Normalizer copy = (Normalizer) super.clone();
\r
587 copy.text = (UCharacterIterator) text.clone();
\r
589 copy.options = options;
\r
590 copy.norm2 = norm2;
\r
591 copy.buffer = new StringBuilder(buffer);
\r
592 copy.bufferPos = bufferPos;
\r
593 copy.currentIndex = currentIndex;
\r
594 copy.nextIndex = nextIndex;
\r
597 catch (CloneNotSupportedException e) {
\r
598 throw new IllegalStateException(e);
\r
602 //--------------------------------------------------------------------------
\r
603 // Static Utility methods
\r
604 //--------------------------------------------------------------------------
\r
606 private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
\r
607 return (compat ? NFKC : NFC).getNormalizer2(options);
\r
609 private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
\r
610 return (compat ? NFKD : NFD).getNormalizer2(options);
\r
614 * Compose a string.
\r
615 * The string will be composed to according to the specified mode.
\r
616 * @param str The string to compose.
\r
617 * @param compat If true the string will be composed according to
\r
618 * NFKC rules and if false will be composed according to
\r
620 * @return String The composed string
\r
623 public static String compose(String str, boolean compat) {
\r
624 return compose(str,compat,0);
\r
628 * Compose a string.
\r
629 * The string will be composed to according to the specified mode.
\r
630 * @param str The string to compose.
\r
631 * @param compat If true the string will be composed according to
\r
632 * NFKC rules and if false will be composed according to
\r
634 * @param options The only recognized option is UNICODE_3_2
\r
635 * @return String The composed string
\r
638 public static String compose(String str, boolean compat, int options) {
\r
639 return getComposeNormalizer2(compat, options).normalize(str);
\r
643 * Compose a string.
\r
644 * The string will be composed to according to the specified mode.
\r
645 * @param source The char array to compose.
\r
646 * @param target A char buffer to receive the normalized text.
\r
647 * @param compat If true the char array will be composed according to
\r
648 * NFKC rules and if false will be composed according to
\r
650 * @param options The normalization options, ORed together (0 for no options).
\r
651 * @return int The total buffer size needed;if greater than length of
\r
652 * result, the output was truncated.
\r
653 * @exception IndexOutOfBoundsException if target.length is less than the
\r
657 public static int compose(char[] source,char[] target, boolean compat, int options) {
\r
658 return compose(source, 0, source.length, target, 0, target.length, compat, options);
\r
662 * Compose a string.
\r
663 * The string will be composed to according to the specified mode.
\r
664 * @param src The char array to compose.
\r
665 * @param srcStart Start index of the source
\r
666 * @param srcLimit Limit index of the source
\r
667 * @param dest The char buffer to fill in
\r
668 * @param destStart Start index of the destination buffer
\r
669 * @param destLimit End index of the destination buffer
\r
670 * @param compat If true the char array will be composed according to
\r
671 * NFKC rules and if false will be composed according to
\r
673 * @param options The normalization options, ORed together (0 for no options).
\r
674 * @return int The total buffer size needed;if greater than length of
\r
675 * result, the output was truncated.
\r
676 * @exception IndexOutOfBoundsException if target.length is less than the
\r
680 public static int compose(char[] src,int srcStart, int srcLimit,
\r
681 char[] dest,int destStart, int destLimit,
\r
682 boolean compat, int options) {
\r
683 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
\r
684 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
\r
685 getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
\r
686 return app.length();
\r
690 * Decompose a string.
\r
691 * The string will be decomposed to according to the specified mode.
\r
692 * @param str The string to decompose.
\r
693 * @param compat If true the string will be decomposed according to NFKD
\r
694 * rules and if false will be decomposed according to NFD
\r
696 * @return String The decomposed string
\r
699 public static String decompose(String str, boolean compat) {
\r
700 return decompose(str,compat,0);
\r
704 * Decompose a string.
\r
705 * The string will be decomposed to according to the specified mode.
\r
706 * @param str The string to decompose.
\r
707 * @param compat If true the string will be decomposed according to NFKD
\r
708 * rules and if false will be decomposed according to NFD
\r
710 * @param options The normalization options, ORed together (0 for no options).
\r
711 * @return String The decomposed string
\r
714 public static String decompose(String str, boolean compat, int options) {
\r
715 return getDecomposeNormalizer2(compat, options).normalize(str);
\r
719 * Decompose a string.
\r
720 * The string will be decomposed to according to the specified mode.
\r
721 * @param source The char array to decompose.
\r
722 * @param target A char buffer to receive the normalized text.
\r
723 * @param compat If true the char array will be decomposed according to NFKD
\r
724 * rules and if false will be decomposed according to
\r
726 * @return int The total buffer size needed;if greater than length of
\r
727 * result,the output was truncated.
\r
728 * @param options The normalization options, ORed together (0 for no options).
\r
729 * @exception IndexOutOfBoundsException if the target capacity is less than
\r
730 * the required length
\r
733 public static int decompose(char[] source,char[] target, boolean compat, int options) {
\r
734 return decompose(source, 0, source.length, target, 0, target.length, compat, options);
\r
738 * Decompose a string.
\r
739 * The string will be decomposed to according to the specified mode.
\r
740 * @param src The char array to compose.
\r
741 * @param srcStart Start index of the source
\r
742 * @param srcLimit Limit index of the source
\r
743 * @param dest The char buffer to fill in
\r
744 * @param destStart Start index of the destination buffer
\r
745 * @param destLimit End index of the destination buffer
\r
746 * @param compat If true the char array will be decomposed according to NFKD
\r
747 * rules and if false will be decomposed according to
\r
749 * @param options The normalization options, ORed together (0 for no options).
\r
750 * @return int The total buffer size needed;if greater than length of
\r
751 * result,the output was truncated.
\r
752 * @exception IndexOutOfBoundsException if the target capacity is less than
\r
753 * the required length
\r
756 public static int decompose(char[] src,int srcStart, int srcLimit,
\r
757 char[] dest,int destStart, int destLimit,
\r
758 boolean compat, int options) {
\r
759 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
\r
760 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
\r
761 getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
\r
762 return app.length();
\r
766 * Normalizes a <tt>String</tt> using the given normalization operation.
\r
768 * The <tt>options</tt> parameter specifies which optional
\r
769 * <tt>Normalizer</tt> features are to be enabled for this operation.
\r
770 * Currently the only available option is {@link #UNICODE_3_2}.
\r
771 * If you want the default behavior corresponding to one of the standard
\r
772 * Unicode Normalization Forms, use 0 for this argument.
\r
774 * @param str the input string to be normalized.
\r
775 * @param mode the normalization mode
\r
776 * @param options the optional features to be enabled.
\r
777 * @return String the normalized string
\r
780 public static String normalize(String str, Mode mode, int options) {
\r
781 return mode.getNormalizer2(options).normalize(str);
\r
785 * Normalize a string.
\r
786 * The string will be normalized according to the specified normalization
\r
787 * mode and options.
\r
788 * @param src The string to normalize.
\r
789 * @param mode The normalization mode; one of Normalizer.NONE,
\r
790 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
\r
791 * Normalizer.NFKD, Normalizer.DEFAULT
\r
792 * @return the normalized string
\r
796 public static String normalize(String src,Mode mode) {
\r
797 return normalize(src, mode, 0);
\r
800 * Normalize a string.
\r
801 * The string will be normalized according to the specified normalization
\r
802 * mode and options.
\r
803 * @param source The char array to normalize.
\r
804 * @param target A char buffer to receive the normalized text.
\r
805 * @param mode The normalization mode; one of Normalizer.NONE,
\r
806 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
\r
807 * Normalizer.NFKD, Normalizer.DEFAULT
\r
808 * @param options The normalization options, ORed together (0 for no options).
\r
809 * @return int The total buffer size needed;if greater than length of
\r
810 * result, the output was truncated.
\r
811 * @exception IndexOutOfBoundsException if the target capacity is less
\r
812 * than the required length
\r
815 public static int normalize(char[] source,char[] target, Mode mode, int options) {
\r
816 return normalize(source,0,source.length,target,0,target.length,mode, options);
\r
820 * Normalize a string.
\r
821 * The string will be normalized according to the specified normalization
\r
822 * mode and options.
\r
823 * @param src The char array to compose.
\r
824 * @param srcStart Start index of the source
\r
825 * @param srcLimit Limit index of the source
\r
826 * @param dest The char buffer to fill in
\r
827 * @param destStart Start index of the destination buffer
\r
828 * @param destLimit End index of the destination buffer
\r
829 * @param mode The normalization mode; one of Normalizer.NONE,
\r
830 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
\r
831 * Normalizer.NFKD, Normalizer.DEFAULT
\r
832 * @param options The normalization options, ORed together (0 for no options).
\r
833 * @return int The total buffer size needed;if greater than length of
\r
834 * result, the output was truncated.
\r
835 * @exception IndexOutOfBoundsException if the target capacity is
\r
836 * less than the required length
\r
839 public static int normalize(char[] src,int srcStart, int srcLimit,
\r
840 char[] dest,int destStart, int destLimit,
\r
841 Mode mode, int options) {
\r
842 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
\r
843 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
\r
844 mode.getNormalizer2(options).normalize(srcBuffer, app);
\r
845 return app.length();
\r
849 * Normalize a codepoint according to the given mode
\r
850 * @param char32 The input string to be normalized.
\r
851 * @param mode The normalization mode
\r
852 * @param options Options for use with exclusion set and tailored Normalization
\r
853 * The only option that is currently recognized is UNICODE_3_2
\r
854 * @return String The normalized string
\r
856 * @see #UNICODE_3_2
\r
858 public static String normalize(int char32, Mode mode, int options) {
\r
859 if(mode == NFD && options == 0) {
\r
860 String decomposition =
\r
861 Norm2AllModes.getNFCInstance().impl.getDecomposition(char32);
\r
862 if(decomposition == null) {
\r
863 decomposition = UTF16.valueOf(char32);
\r
865 return decomposition;
\r
867 return normalize(UTF16.valueOf(char32), mode, options);
\r
871 * Convenience method to normalize a codepoint according to the given mode
\r
872 * @param char32 The input string to be normalized.
\r
873 * @param mode The normalization mode
\r
874 * @return String The normalized string
\r
877 public static String normalize(int char32, Mode mode) {
\r
878 return normalize(char32, mode, 0);
\r
882 * Convenience method.
\r
884 * @param source string for determining if it is in a normalized format
\r
885 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
886 * Normalizer.NFKC,Normalizer.NFKD)
\r
887 * @return Return code to specify if the text is normalized or not
\r
888 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
\r
891 public static QuickCheckResult quickCheck(String source, Mode mode) {
\r
892 return quickCheck(source, mode, 0);
\r
896 * Performing quick check on a string, to quickly determine if the string is
\r
897 * in a particular normalization format.
\r
898 * Three types of result can be returned Normalizer.YES, Normalizer.NO or
\r
899 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
\r
900 * string is in the desired normalized format, Normalizer.NO determines that
\r
901 * argument string is not in the desired normalized format. A
\r
902 * Normalizer.MAYBE result indicates that a more thorough check is required,
\r
903 * the user may have to put the string in its normalized form and compare
\r
906 * @param source string for determining if it is in a normalized format
\r
907 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
908 * Normalizer.NFKC,Normalizer.NFKD)
\r
909 * @param options Options for use with exclusion set and tailored Normalization
\r
910 * The only option that is currently recognized is UNICODE_3_2
\r
911 * @return Return code to specify if the text is normalized or not
\r
912 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
\r
915 public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
\r
916 return mode.getNormalizer2(options).quickCheck(source);
\r
920 * Convenience method.
\r
922 * @param source Array of characters for determining if it is in a
\r
923 * normalized format
\r
924 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
925 * Normalizer.NFKC,Normalizer.NFKD)
\r
926 * @param options Options for use with exclusion set and tailored Normalization
\r
927 * The only option that is currently recognized is UNICODE_3_2
\r
928 * @return Return code to specify if the text is normalized or not
\r
929 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
\r
932 public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
\r
933 return quickCheck(source, 0, source.length, mode, options);
\r
937 * Performing quick check on a string, to quickly determine if the string is
\r
938 * in a particular normalization format.
\r
939 * Three types of result can be returned Normalizer.YES, Normalizer.NO or
\r
940 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
\r
941 * string is in the desired normalized format, Normalizer.NO determines that
\r
942 * argument string is not in the desired normalized format. A
\r
943 * Normalizer.MAYBE result indicates that a more thorough check is required,
\r
944 * the user may have to put the string in its normalized form and compare
\r
947 * @param source string for determining if it is in a normalized format
\r
948 * @param start the start index of the source
\r
949 * @param limit the limit index of the source it is equal to the length
\r
950 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
951 * Normalizer.NFKC,Normalizer.NFKD)
\r
952 * @param options Options for use with exclusion set and tailored Normalization
\r
953 * The only option that is currently recognized is UNICODE_3_2
\r
954 * @return Return code to specify if the text is normalized or not
\r
955 * (Normalizer.YES, Normalizer.NO or
\r
956 * Normalizer.MAYBE)
\r
960 public static QuickCheckResult quickCheck(char[] source,int start,
\r
961 int limit, Mode mode,int options) {
\r
962 CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
\r
963 return mode.getNormalizer2(options).quickCheck(srcBuffer);
\r
967 * Test if a string is in a given normalization form.
\r
968 * This is semantically equivalent to source.equals(normalize(source, mode)).
\r
970 * Unlike quickCheck(), this function returns a definitive result,
\r
972 * For NFD, NFKD, and FCD, both functions work exactly the same.
\r
973 * For NFC and NFKC where quickCheck may return "maybe", this function will
\r
974 * perform further tests to arrive at a true/false result.
\r
975 * @param src The input array of characters to be checked to see if
\r
977 * @param start The strart index in the source
\r
978 * @param limit The limit index in the source
\r
979 * @param mode the normalization mode
\r
980 * @param options Options for use with exclusion set and tailored Normalization
\r
981 * The only option that is currently recognized is UNICODE_3_2
\r
982 * @return Boolean value indicating whether the source string is in the
\r
983 * "mode" normalization form
\r
986 public static boolean isNormalized(char[] src,int start,
\r
987 int limit, Mode mode,
\r
989 CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
\r
990 return mode.getNormalizer2(options).isNormalized(srcBuffer);
\r
994 * Test if a string is in a given normalization form.
\r
995 * This is semantically equivalent to source.equals(normalize(source, mode)).
\r
997 * Unlike quickCheck(), this function returns a definitive result,
\r
999 * For NFD, NFKD, and FCD, both functions work exactly the same.
\r
1000 * For NFC and NFKC where quickCheck may return "maybe", this function will
\r
1001 * perform further tests to arrive at a true/false result.
\r
1002 * @param str the input string to be checked to see if it is
\r
1004 * @param mode the normalization mode
\r
1005 * @param options Options for use with exclusion set and tailored Normalization
\r
1006 * The only option that is currently recognized is UNICODE_3_2
\r
1007 * @see #isNormalized
\r
1010 public static boolean isNormalized(String str, Mode mode, int options) {
\r
1011 return mode.getNormalizer2(options).isNormalized(str);
\r
1015 * Convenience Method
\r
1016 * @param char32 the input code point to be checked to see if it is
\r
1018 * @param mode the normalization mode
\r
1019 * @param options Options for use with exclusion set and tailored Normalization
\r
1020 * The only option that is currently recognized is UNICODE_3_2
\r
1022 * @see #isNormalized
\r
1025 public static boolean isNormalized(int char32, Mode mode,int options) {
\r
1026 return isNormalized(UTF16.valueOf(char32), mode, options);
\r
1030 * Compare two strings for canonical equivalence.
\r
1031 * Further options include case-insensitive comparison and
\r
1032 * code point order (as opposed to code unit order).
\r
1034 * Canonical equivalence between two strings is defined as their normalized
\r
1035 * forms (NFD or NFC) being identical.
\r
1036 * This function compares strings incrementally instead of normalizing
\r
1037 * (and optionally case-folding) both strings entirely,
\r
1038 * improving performance significantly.
\r
1040 * Bulk normalization is only necessary if the strings do not fulfill the
\r
1041 * FCD conditions. Only in this case, and only if the strings are relatively
\r
1042 * long, is memory allocated temporarily.
\r
1043 * For FCD strings and short non-FCD strings there is no memory allocation.
\r
1045 * Semantically, this is equivalent to
\r
1046 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
\r
1047 * where code point order and foldCase are all optional.
\r
1049 * @param s1 First source character array.
\r
1050 * @param s1Start start index of source
\r
1051 * @param s1Limit limit of the source
\r
1053 * @param s2 Second source character array.
\r
1054 * @param s2Start start index of the source
\r
1055 * @param s2Limit limit of the source
\r
1057 * @param options A bit set of options:
\r
1058 * - FOLD_CASE_DEFAULT or 0 is used for default options:
\r
1059 * Case-sensitive comparison in code unit order, and the input strings
\r
1060 * are quick-checked for FCD.
\r
1063 * Set if the caller knows that both s1 and s2 fulfill the FCD
\r
1064 * conditions.If not set, the function will quickCheck for FCD
\r
1065 * and normalize if necessary.
\r
1067 * - COMPARE_CODE_POINT_ORDER
\r
1068 * Set to choose code point order instead of code unit order
\r
1070 * - COMPARE_IGNORE_CASE
\r
1071 * Set to compare strings case-insensitively using case folding,
\r
1072 * instead of case-sensitively.
\r
1073 * If set, then the following case folding options are used.
\r
1076 * @return <0 or 0 or >0 as usual for string comparisons
\r
1082 public static int compare(char[] s1, int s1Start, int s1Limit,
\r
1083 char[] s2, int s2Start, int s2Limit,
\r
1085 if( s1==null || s1Start<0 || s1Limit<0 ||
\r
1086 s2==null || s2Start<0 || s2Limit<0 ||
\r
1087 s1Limit<s1Start || s2Limit<s2Start
\r
1089 throw new IllegalArgumentException();
\r
1091 return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
\r
1092 CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
\r
1097 * Compare two strings for canonical equivalence.
\r
1098 * Further options include case-insensitive comparison and
\r
1099 * code point order (as opposed to code unit order).
\r
1101 * Canonical equivalence between two strings is defined as their normalized
\r
1102 * forms (NFD or NFC) being identical.
\r
1103 * This function compares strings incrementally instead of normalizing
\r
1104 * (and optionally case-folding) both strings entirely,
\r
1105 * improving performance significantly.
\r
1107 * Bulk normalization is only necessary if the strings do not fulfill the
\r
1108 * FCD conditions. Only in this case, and only if the strings are relatively
\r
1109 * long, is memory allocated temporarily.
\r
1110 * For FCD strings and short non-FCD strings there is no memory allocation.
\r
1112 * Semantically, this is equivalent to
\r
1113 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
\r
1114 * where code point order and foldCase are all optional.
\r
1116 * @param s1 First source string.
\r
1117 * @param s2 Second source string.
\r
1119 * @param options A bit set of options:
\r
1120 * - FOLD_CASE_DEFAULT or 0 is used for default options:
\r
1121 * Case-sensitive comparison in code unit order, and the input strings
\r
1122 * are quick-checked for FCD.
\r
1125 * Set if the caller knows that both s1 and s2 fulfill the FCD
\r
1126 * conditions. If not set, the function will quickCheck for FCD
\r
1127 * and normalize if necessary.
\r
1129 * - COMPARE_CODE_POINT_ORDER
\r
1130 * Set to choose code point order instead of code unit order
\r
1132 * - COMPARE_IGNORE_CASE
\r
1133 * Set to compare strings case-insensitively using case folding,
\r
1134 * instead of case-sensitively.
\r
1135 * If set, then the following case folding options are used.
\r
1137 * @return <0 or 0 or >0 as usual for string comparisons
\r
1143 public static int compare(String s1, String s2, int options) {
\r
1144 return internalCompare(s1, s2, options);
\r
1148 * Compare two strings for canonical equivalence.
\r
1149 * Further options include case-insensitive comparison and
\r
1150 * code point order (as opposed to code unit order).
\r
1151 * Convenience method.
\r
1153 * @param s1 First source string.
\r
1154 * @param s2 Second source string.
\r
1156 * @param options A bit set of options:
\r
1157 * - FOLD_CASE_DEFAULT or 0 is used for default options:
\r
1158 * Case-sensitive comparison in code unit order, and the input strings
\r
1159 * are quick-checked for FCD.
\r
1162 * Set if the caller knows that both s1 and s2 fulfill the FCD
\r
1163 * conditions. If not set, the function will quickCheck for FCD
\r
1164 * and normalize if necessary.
\r
1166 * - COMPARE_CODE_POINT_ORDER
\r
1167 * Set to choose code point order instead of code unit order
\r
1169 * - COMPARE_IGNORE_CASE
\r
1170 * Set to compare strings case-insensitively using case folding,
\r
1171 * instead of case-sensitively.
\r
1172 * If set, then the following case folding options are used.
\r
1174 * @return <0 or 0 or >0 as usual for string comparisons
\r
1180 public static int compare(char[] s1, char[] s2, int options) {
\r
1181 return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
\r
1185 * Convenience method that can have faster implementation
\r
1186 * by not allocating buffers.
\r
1187 * @param char32a the first code point to be checked against the
\r
1188 * @param char32b the second code point
\r
1189 * @param options A bit set of options
\r
1192 public static int compare(int char32a, int char32b, int options) {
\r
1193 return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
\r
1197 * Convenience method that can have faster implementation
\r
1198 * by not allocating buffers.
\r
1199 * @param char32a the first code point to be checked against
\r
1200 * @param str2 the second string
\r
1201 * @param options A bit set of options
\r
1204 public static int compare(int char32a, String str2, int options) {
\r
1205 return internalCompare(UTF16.valueOf(char32a), str2, options);
\r
1208 /* Concatenation of normalized strings --------------------------------- */
\r
1210 * Concatenate normalized strings, making sure that the result is normalized
\r
1213 * If both the left and the right strings are in
\r
1214 * the normalization form according to "mode",
\r
1215 * then the result will be
\r
1218 * dest=normalize(left+right, mode)
\r
1221 * With the input strings already being normalized,
\r
1222 * this function will use next() and previous()
\r
1223 * to find the adjacent end pieces of the input strings.
\r
1224 * Only the concatenation of these end pieces will be normalized and
\r
1225 * then concatenated with the remaining parts of the input strings.
\r
1227 * It is allowed to have dest==left to avoid copying the entire left string.
\r
1229 * @param left Left source array, may be same as dest.
\r
1230 * @param leftStart start in the left array.
\r
1231 * @param leftLimit limit in the left array (==length)
\r
1232 * @param right Right source array.
\r
1233 * @param rightStart start in the right array.
\r
1234 * @param rightLimit limit in the right array (==length)
\r
1235 * @param dest The output buffer; can be null if destStart==destLimit==0
\r
1236 * for pure preflighting.
\r
1237 * @param destStart start in the destination array
\r
1238 * @param destLimit limit in the destination array (==length)
\r
1239 * @param mode The normalization mode.
\r
1240 * @param options The normalization options, ORed together (0 for no options).
\r
1241 * @return Length of output (number of chars) when successful or
\r
1242 * IndexOutOfBoundsException
\r
1243 * @exception IndexOutOfBoundsException whose message has the string
\r
1244 * representation of destination capacity required.
\r
1248 * @exception IndexOutOfBoundsException if target capacity is less than the
\r
1252 public static int concatenate(char[] left, int leftStart, int leftLimit,
\r
1253 char[] right, int rightStart, int rightLimit,
\r
1254 char[] dest, int destStart, int destLimit,
\r
1255 Normalizer.Mode mode, int options) {
\r
1256 if(dest == null) {
\r
1257 throw new IllegalArgumentException();
\r
1260 /* check for overlapping right and destination */
\r
1261 if (right == dest && rightStart < destLimit && destStart < rightLimit) {
\r
1262 throw new IllegalArgumentException("overlapping right and dst ranges");
\r
1265 /* allow left==dest */
\r
1266 StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
\r
1267 destBuilder.append(left, leftStart, leftLimit-leftStart);
\r
1268 CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
\r
1269 mode.getNormalizer2(options).append(destBuilder, rightBuffer);
\r
1270 int destLength=destBuilder.length();
\r
1271 if(destLength<=(destLimit-destStart)) {
\r
1272 destBuilder.getChars(0, destLength, dest, destStart);
\r
1273 return destLength;
\r
1275 throw new IndexOutOfBoundsException(Integer.toString(destLength));
\r
1280 * Concatenate normalized strings, making sure that the result is normalized
\r
1283 * If both the left and the right strings are in
\r
1284 * the normalization form according to "mode",
\r
1285 * then the result will be
\r
1288 * dest=normalize(left+right, mode)
\r
1291 * For details see concatenate
\r
1293 * @param left Left source string.
\r
1294 * @param right Right source string.
\r
1295 * @param mode The normalization mode.
\r
1296 * @param options The normalization options, ORed together (0 for no options).
\r
1299 * @see #concatenate
\r
1303 * @see #concatenate
\r
1306 public static String concatenate(char[] left, char[] right,Mode mode, int options) {
\r
1307 StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
\r
1308 return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
\r
1312 * Concatenate normalized strings, making sure that the result is normalized
\r
1315 * If both the left and the right strings are in
\r
1316 * the normalization form according to "mode",
\r
1317 * then the result will be
\r
1320 * dest=normalize(left+right, mode)
\r
1323 * With the input strings already being normalized,
\r
1324 * this function will use next() and previous()
\r
1325 * to find the adjacent end pieces of the input strings.
\r
1326 * Only the concatenation of these end pieces will be normalized and
\r
1327 * then concatenated with the remaining parts of the input strings.
\r
1329 * @param left Left source string.
\r
1330 * @param right Right source string.
\r
1331 * @param mode The normalization mode.
\r
1332 * @param options The normalization options, ORed together (0 for no options).
\r
1335 * @see #concatenate
\r
1339 * @see #concatenate
\r
1342 public static String concatenate(String left, String right, Mode mode, int options) {
\r
1343 StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
\r
1344 return mode.getNormalizer2(options).append(dest, right).toString();
\r
1348 * Gets the FC_NFKC closure value.
\r
1349 * @param c The code point whose closure value is to be retrieved
\r
1350 * @param dest The char array to receive the closure value
\r
1351 * @return the length of the closure value; 0 if there is none
\r
1354 public static int getFC_NFKC_Closure(int c,char[] dest) {
\r
1355 String closure=getFC_NFKC_Closure(c);
\r
1356 int length=closure.length();
\r
1357 if(length!=0 && dest!=null && length<=dest.length) {
\r
1358 closure.getChars(0, length, dest, 0);
\r
1363 * Gets the FC_NFKC closure value.
\r
1364 * @param c The code point whose closure value is to be retrieved
\r
1365 * @return String representation of the closure value; "" if there is none
\r
1368 public static String getFC_NFKC_Closure(int c) {
\r
1369 // Compute the FC_NFKC_Closure on the fly:
\r
1370 // We have the API for complete coverage of Unicode properties, although
\r
1371 // this value by itself is not useful via API.
\r
1372 // (What could be useful is a custom normalization table that combines
\r
1373 // case folding and NFKC.)
\r
1374 // For the derivation, see Unicode's DerivedNormalizationProps.txt.
\r
1375 Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
\r
1378 csp=UCaseProps.getSingleton();
\r
1379 } catch(IOException e) {
\r
1380 throw new RuntimeException(e);
\r
1382 // first: b = NFKC(Fold(a))
\r
1383 StringBuffer folded=new StringBuffer();
\r
1384 int folded1Length=csp.toFullFolding(c, folded, 0);
\r
1385 if(folded1Length<0) {
\r
1386 Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
\r
1387 if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
\r
1388 return ""; // c does not change at all under CaseFolding+NFKC
\r
1390 folded.appendCodePoint(c);
\r
1392 if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
\r
1393 folded.appendCodePoint(folded1Length);
\r
1396 String kc1=nfkc.normalize(folded);
\r
1397 // second: c = NFKC(Fold(b))
\r
1398 String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
\r
1399 // if (c != b) add the mapping from a to c
\r
1400 if(kc1.equals(kc2)) {
\r
1407 //-------------------------------------------------------------------------
\r
1409 //-------------------------------------------------------------------------
\r
1412 * Return the current character in the normalized text.
\r
1413 * @return The codepoint as an int
\r
1416 public int current() {
\r
1417 if(bufferPos<buffer.length() || nextNormalize()) {
\r
1418 return buffer.codePointAt(bufferPos);
\r
1425 * Return the next character in the normalized text and advance
\r
1426 * the iteration position by one. If the end
\r
1427 * of the text has already been reached, {@link #DONE} is returned.
\r
1428 * @return The codepoint as an int
\r
1431 public int next() {
\r
1432 if(bufferPos<buffer.length() || nextNormalize()) {
\r
1433 int c=buffer.codePointAt(bufferPos);
\r
1434 bufferPos+=Character.charCount(c);
\r
1443 * Return the previous character in the normalized text and decrement
\r
1444 * the iteration position by one. If the beginning
\r
1445 * of the text has already been reached, {@link #DONE} is returned.
\r
1446 * @return The codepoint as an int
\r
1449 public int previous() {
\r
1450 if(bufferPos>0 || previousNormalize()) {
\r
1451 int c=buffer.codePointBefore(bufferPos);
\r
1452 bufferPos-=Character.charCount(c);
\r
1460 * Reset the index to the beginning of the text.
\r
1461 * This is equivalent to setIndexOnly(startIndex)).
\r
1464 public void reset() {
\r
1465 text.setToStart();
\r
1466 currentIndex=nextIndex=0;
\r
1471 * Set the iteration position in the input text that is being normalized,
\r
1472 * without any immediate normalization.
\r
1473 * After setIndexOnly(), getIndex() will return the same index that is
\r
1476 * @param index the desired index in the input text.
\r
1479 public void setIndexOnly(int index) {
\r
1480 text.setIndex(index); // validates index
\r
1481 currentIndex=nextIndex=index;
\r
1486 * Set the iteration position in the input text that is being normalized
\r
1487 * and return the first normalized character at that position.
\r
1489 * <b>Note:</b> This method sets the position in the <em>input</em> text,
\r
1490 * while {@link #next} and {@link #previous} iterate through characters
\r
1491 * in the normalized <em>output</em>. This means that there is not
\r
1492 * necessarily a one-to-one correspondence between characters returned
\r
1493 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
\r
1494 * returned from <tt>setIndex</tt> and {@link #getIndex}.
\r
1496 * @param index the desired index in the input text.
\r
1498 * @return the first normalized character that is the result of iterating
\r
1499 * forward starting at the given index.
\r
1501 * @throws IllegalArgumentException if the given index is less than
\r
1502 * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
\r
1503 * @deprecated ICU 3.2
\r
1504 * @obsolete ICU 3.2
\r
1507 public int setIndex(int index) {
\r
1508 setIndexOnly(index);
\r
1513 * Retrieve the index of the start of the input text. This is the begin
\r
1514 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
\r
1515 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
\r
1516 * @deprecated ICU 2.2. Use startIndex() instead.
\r
1517 * @return The codepoint as an int
\r
1518 * @see #startIndex
\r
1520 public int getBeginIndex() {
\r
1525 * Retrieve the index of the end of the input text. This is the end index
\r
1526 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
\r
1527 * over which this <tt>Normalizer</tt> is iterating
\r
1528 * @deprecated ICU 2.2. Use endIndex() instead.
\r
1529 * @return The codepoint as an int
\r
1532 public int getEndIndex() {
\r
1533 return endIndex();
\r
1536 * Return the first character in the normalized text. This resets
\r
1537 * the <tt>Normalizer's</tt> position to the beginning of the text.
\r
1538 * @return The codepoint as an int
\r
1541 public int first() {
\r
1547 * Return the last character in the normalized text. This resets
\r
1548 * the <tt>Normalizer's</tt> position to be just before the
\r
1549 * the input text corresponding to that normalized character.
\r
1550 * @return The codepoint as an int
\r
1553 public int last() {
\r
1554 text.setToLimit();
\r
1555 currentIndex=nextIndex=text.getIndex();
\r
1557 return previous();
\r
1561 * Retrieve the current iteration position in the input text that is
\r
1562 * being normalized. This method is useful in applications such as
\r
1563 * searching, where you need to be able to determine the position in
\r
1564 * the input text that corresponds to a given normalized output character.
\r
1566 * <b>Note:</b> This method sets the position in the <em>input</em>, while
\r
1567 * {@link #next} and {@link #previous} iterate through characters in the
\r
1568 * <em>output</em>. This means that there is not necessarily a one-to-one
\r
1569 * correspondence between characters returned by <tt>next</tt> and
\r
1570 * <tt>previous</tt> and the indices passed to and returned from
\r
1571 * <tt>setIndex</tt> and {@link #getIndex}.
\r
1572 * @return The current iteration position
\r
1575 public int getIndex() {
\r
1576 if(bufferPos<buffer.length()) {
\r
1577 return currentIndex;
\r
1584 * Retrieve the index of the start of the input text. This is the begin
\r
1585 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
\r
1586 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
\r
1587 * @return The current iteration position
\r
1590 public int startIndex() {
\r
1595 * Retrieve the index of the end of the input text. This is the end index
\r
1596 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
\r
1597 * over which this <tt>Normalizer</tt> is iterating
\r
1598 * @return The current iteration position
\r
1601 public int endIndex() {
\r
1602 return text.getLength();
\r
1605 //-------------------------------------------------------------------------
\r
1606 // Iterator attributes
\r
1607 //-------------------------------------------------------------------------
\r
1609 * Set the normalization mode for this object.
\r
1611 * <b>Note:</b>If the normalization mode is changed while iterating
\r
1612 * over a string, calls to {@link #next} and {@link #previous} may
\r
1613 * return previously buffers characters in the old normalization mode
\r
1614 * until the iteration is able to re-sync at the next base character.
\r
1615 * It is safest to call {@link #setText setText()}, {@link #first},
\r
1616 * {@link #last}, etc. after calling <tt>setMode</tt>.
\r
1618 * @param newMode the new mode for this <tt>Normalizer</tt>.
\r
1619 * The supported modes are:
\r
1621 * <li>{@link #NFC} - Unicode canonical decompositiion
\r
1622 * followed by canonical composition.
\r
1623 * <li>{@link #NFKC} - Unicode compatibility decompositiion
\r
1624 * follwed by canonical composition.
\r
1625 * <li>{@link #NFD} - Unicode canonical decomposition
\r
1626 * <li>{@link #NFKD} - Unicode compatibility decomposition.
\r
1627 * <li>{@link #NONE} - Do nothing but return characters
\r
1628 * from the underlying input text.
\r
1634 public void setMode(Mode newMode) {
\r
1636 norm2 = mode.getNormalizer2(options);
\r
1639 * Return the basic operation performed by this <tt>Normalizer</tt>
\r
1644 public Mode getMode() {
\r
1648 * Set options that affect this <tt>Normalizer</tt>'s operation.
\r
1649 * Options do not change the basic composition or decomposition operation
\r
1650 * that is being performed , but they control whether
\r
1651 * certain optional portions of the operation are done.
\r
1652 * Currently the only available option is:
\r
1655 * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
\r
1658 * @param option the option whose value is to be set.
\r
1659 * @param value the new setting for the option. Use <tt>true</tt> to
\r
1660 * turn the option on and <tt>false</tt> to turn it off.
\r
1665 public void setOption(int option,boolean value) {
\r
1667 options |= option;
\r
1669 options &= (~option);
\r
1671 norm2 = mode.getNormalizer2(options);
\r
1675 * Determine whether an option is turned on or off.
\r
1680 public int getOption(int option) {
\r
1681 if((options & option)!=0) {
\r
1689 * Gets the underlying text storage
\r
1690 * @param fillIn the char buffer to fill the UTF-16 units.
\r
1691 * The length of the buffer should be equal to the length of the
\r
1692 * underlying text storage
\r
1693 * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
\r
1697 public int getText(char[] fillIn) {
\r
1698 return text.getText(fillIn);
\r
1702 * Gets the length of underlying text storage
\r
1703 * @return the length
\r
1706 public int getLength() {
\r
1707 return text.getLength();
\r
1711 * Returns the text under iteration as a string
\r
1712 * @return a copy of the text under iteration.
\r
1715 public String getText() {
\r
1716 return text.getText();
\r
1720 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1721 * The iteration position is set to the beginning of the input text.
\r
1722 * @param newText The new string to be normalized.
\r
1725 public void setText(StringBuffer newText) {
\r
1726 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1727 if (newIter == null) {
\r
1728 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1735 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1736 * The iteration position is set to the beginning of the input text.
\r
1737 * @param newText The new string to be normalized.
\r
1740 public void setText(char[] newText) {
\r
1741 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1742 if (newIter == null) {
\r
1743 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1750 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1751 * The iteration position is set to the beginning of the input text.
\r
1752 * @param newText The new string to be normalized.
\r
1755 public void setText(String newText) {
\r
1756 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1757 if (newIter == null) {
\r
1758 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1765 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1766 * The iteration position is set to the beginning of the input text.
\r
1767 * @param newText The new string to be normalized.
\r
1770 public void setText(CharacterIterator newText) {
\r
1771 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1772 if (newIter == null) {
\r
1773 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1780 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1781 * The iteration position is set to the beginning of the string.
\r
1782 * @param newText The new string to be normalized.
\r
1785 public void setText(UCharacterIterator newText) {
\r
1787 UCharacterIterator newIter = (UCharacterIterator)newText.clone();
\r
1788 if (newIter == null) {
\r
1789 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1793 }catch(CloneNotSupportedException e) {
\r
1794 throw new IllegalStateException("Could not clone the UCharacterIterator");
\r
1798 private void clearBuffer() {
\r
1799 buffer.setLength(0);
\r
1803 private boolean nextNormalize() {
\r
1805 currentIndex=nextIndex;
\r
1806 text.setIndex(nextIndex);
\r
1807 // Skip at least one character so we make progress.
\r
1808 int c=text.nextCodePoint();
\r
1812 StringBuilder segment=new StringBuilder().appendCodePoint(c);
\r
1813 while((c=text.nextCodePoint())>=0) {
\r
1814 if(norm2.hasBoundaryBefore(c)) {
\r
1815 text.moveCodePointIndex(-1);
\r
1818 segment.appendCodePoint(c);
\r
1820 nextIndex=text.getIndex();
\r
1821 norm2.normalize(segment, buffer);
\r
1822 return buffer.length()!=0;
\r
1825 private boolean previousNormalize() {
\r
1827 nextIndex=currentIndex;
\r
1828 text.setIndex(currentIndex);
\r
1829 StringBuilder segment=new StringBuilder();
\r
1831 while((c=text.previousCodePoint())>=0) {
\r
1833 segment.insert(0, (char)c);
\r
1835 segment.insert(0, Character.toChars(c));
\r
1837 if(norm2.hasBoundaryBefore(c)) {
\r
1841 currentIndex=text.getIndex();
\r
1842 norm2.normalize(segment, buffer);
\r
1843 bufferPos=buffer.length();
\r
1844 return buffer.length()!=0;
\r
1847 /* compare canonically equivalent ------------------------------------------- */
\r
1849 // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
\r
1850 private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
\r
1851 int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
\r
1852 options|= COMPARE_EQUIV;
\r
1855 * UAX #21 Case Mappings, as fixed for Unicode version 4
\r
1856 * (see Jitterbug 2021), defines a canonical caseless match as
\r
1858 * A string X is a canonical caseless match
\r
1859 * for a string Y if and only if
\r
1860 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
\r
1862 * For better performance, we check for FCD (or let the caller tell us that
\r
1863 * both strings are in FCD) for the inner normalization.
\r
1864 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
\r
1865 * case-folding preserves the FCD-ness of a string.
\r
1866 * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
\r
1867 * when there is a difference.
\r
1869 * Exception: When using the Turkic case-folding option, we do perform
\r
1870 * full NFD first. This is because in the Turkic case precomposed characters
\r
1871 * with 0049 capital I or 0069 small i fold differently whether they
\r
1872 * are first decomposed or not, so an FCD check - a check only for
\r
1873 * canonical order - is not sufficient.
\r
1875 if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
\r
1877 if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
\r
1878 n2=NFD.getNormalizer2(normOptions);
\r
1880 n2=FCD.getNormalizer2(normOptions);
\r
1883 // check if s1 and/or s2 fulfill the FCD conditions
\r
1884 int spanQCYes1=n2.spanQuickCheckYes(s1);
\r
1885 int spanQCYes2=n2.spanQuickCheckYes(s2);
\r
1888 * ICU 2.4 had a further optimization:
\r
1889 * If both strings were not in FCD, then they were both NFD'ed,
\r
1890 * and the COMPARE_EQUIV option was turned off.
\r
1891 * It is not entirely clear that this is valid with the current
\r
1892 * definition of the canonical caseless match.
\r
1893 * Therefore, ICU 2.6 removes that optimization.
\r
1896 if(spanQCYes1<s1.length()) {
\r
1897 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
\r
1898 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
\r
1900 if(spanQCYes2<s2.length()) {
\r
1901 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
\r
1902 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
\r
1906 return cmpEquivFold(s1, s2, options);
\r
/*
 * Compare two strings for canonical equivalence.
 * Further options include case-insensitive comparison and
 * code point order (as opposed to code unit order).
 *
 * In this function, canonical equivalence is optional as well.
 * If canonical equivalence is tested, then both strings must fulfill
 * the FCD check.
 *
 * Semantically, this is equivalent to
 *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
 * where code point order, NFD and foldCase are all optional.
 *
 * String comparisons almost always yield results before processing both strings
 * completely.
 * They are generally more efficient working incrementally instead of
 * performing the sub-processing (strlen, normalization, case-folding)
 * on the entire strings first.
 *
 * It is also unnecessary to not normalize identical characters.
 *
 * This function works in principle as follows:
 *
 * loop {
 *   get one code unit c1 from s1 (-1 if end of source)
 *   get one code unit c2 from s2 (-1 if end of source)
 *
 *   if(either string finished) {
 *     return result;
 *   }
 *   if(c1==c2) {
 *     continue;
 *   }
 *
 *   // c1!=c2
 *   try to decompose/case-fold c1/c2, and continue if one does;
 *
 *   // still c1!=c2 and neither decomposes/case-folds, return result
 *   return c1-c2;
 * }
 *
 * When a character decomposes, then the pointer for that source changes to
 * the decomposition, pushing the previous pointer onto a stack.
 * When the end of the decomposition is reached, then the code unit reader
 * pops the previous source from the stack.
 * (Same for case-folding.)
 *
 * This is complicated further by operating on variable-width UTF-16.
 * The top part of the loop works on code units, while lookups for decomposition
 * and case-folding need code points.
 * Code points are assembled after the equality/end-of-source part.
 * The source pointer is only advanced beyond all code units when the code point
 * actually decomposes/case-folds.
 *
 * If we were on a trail surrogate unit when assembling a code point,
 * and the code point decomposes/case-folds, then the decomposition/folding
 * result must be compared with the part of the other string that corresponds to
 * this string's lead surrogate.
 * Since we only assemble a code point when hitting a trail unit when the
 * preceding lead units were identical, we back up the other string by one unit
 * in such a case.
 *
 * The optional code point order comparison at the end works with
 * the same fix-up as the other code point order comparison functions.
 * See ustring.c and the comment near the end of this function.
 *
 * Assumption: A decomposition or case-folding result string never contains
 * a single surrogate. This is a safe assumption in the Unicode Standard.
 * Therefore, we do not need to check for surrogate pairs across
 * decomposition/case-folding boundaries.
 *
 * Further assumptions (see verifications tstnorm.cpp):
 * The API function checks for FCD first, while the core function
 * first case-folds and then decomposes. This requires that case-folding does not
 * un-FCD any strings.
 *
 * The API function may also NFD the input and turn off decomposition.
 * This requires that case-folding does not un-NFD strings either.
 *
 * TODO If any of the above two assumptions is violated,
 * then this entire code must be re-thought.
 * If this happens, then a simple solution is to case-fold both strings up front
 * and to turn off UNORM_INPUT_IS_FCD.
 * We already do this when not both strings are in FCD because makeFCD
 * would be a partial NFD before the case folding, which does not work.
 * Note that all of this is only a problem when case-folding _and_
 * canonical equivalence come together.
 * (Comments in unorm_compare() are more up to date than this TODO.)
 */
1999 /* stack element for previous-level source/decomposition pointers */
\r
2000 private static final class CmpEquivLevel {
\r
2004 private static final CmpEquivLevel[] createCmpEquivLevelStack() {
\r
2005 return new CmpEquivLevel[] {
\r
2006 new CmpEquivLevel(), new CmpEquivLevel()
\r
2011 * Internal option for unorm_cmpEquivFold() for decomposing.
\r
2012 * If not set, just do strcasecmp().
\r
2014 private static final int COMPARE_EQUIV=0x80000;
\r
2016 /* internal function; package visibility for use by UTF16.StringComparator */
\r
2017 /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
\r
2018 Normalizer2Impl nfcImpl;
\r
2021 /* current-level start/limit - s1/s2 as current */
\r
2022 int s1, s2, limit1, limit2;
\r
2024 /* decomposition and case folding variables */
\r
2027 /* stacks of previous-level start/current/limit */
\r
2028 CmpEquivLevel[] stack1=null, stack2=null;
\r
2030 /* buffers for algorithmic decompositions */
\r
2031 String decomp1, decomp2;
\r
2033 /* case folding buffers, only use current-level start/limit */
\r
2034 StringBuffer fold1, fold2;
\r
2036 /* track which is the current level per string */
\r
2037 int level1, level2;
\r
2039 /* current code units, and code points for lookups */
\r
2040 int c1, c2, cp1, cp2;
\r
2042 /* no argument error checking because this itself is not an API */
\r
2045 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
\r
2046 * otherwise this function must behave exactly as uprv_strCompare()
\r
2047 * not checking for that here makes testing this function easier
\r
2050 /* normalization/properties data loaded? */
\r
2051 if((options&COMPARE_EQUIV)!=0) {
\r
2052 nfcImpl=Norm2AllModes.getNFCInstance().impl;
\r
2056 if((options&COMPARE_IGNORE_CASE)!=0) {
\r
2058 csp=UCaseProps.getSingleton();
\r
2059 } catch(IOException e) {
\r
2060 throw new RuntimeException(e);
\r
2062 fold1=new StringBuffer();
\r
2063 fold2=new StringBuffer();
\r
2071 limit1=cs1.length();
\r
2073 limit2=cs2.length();
\r
2078 /* comparison loop */
\r
2081 * here a code unit value of -1 means "get another code unit"
\r
2082 * below it will mean "this source is finished"
\r
2086 /* get next code unit from string 1, post-increment */
\r
2094 c1=cs1.charAt(s1++);
\r
2098 /* reached end of level buffer, pop one level */
\r
2101 cs1=stack1[level1].cs;
\r
2102 } while(cs1==null);
\r
2103 s1=stack1[level1].s;
\r
2104 limit1=cs1.length();
\r
2109 /* get next code unit from string 2, post-increment */
\r
2117 c2=cs2.charAt(s2++);
\r
2121 /* reached end of level buffer, pop one level */
\r
2124 cs2=stack2[level2].cs;
\r
2125 } while(cs2==null);
\r
2126 s2=stack2[level2].s;
\r
2127 limit2=cs2.length();
\r
2132 * compare c1 and c2
\r
2133 * either variable c1, c2 is -1 only if the corresponding string is finished
\r
2137 return 0; /* c1==c2==-1 indicating end of strings */
\r
2139 c1=c2=-1; /* make us fetch new code units */
\r
2142 return -1; /* string 1 ends before string 2 */
\r
2144 return 1; /* string 2 ends before string 1 */
\r
2146 /* c1!=c2 && c1>=0 && c2>=0 */
\r
2148 /* get complete code points for c1, c2 for lookups if either is a surrogate */
\r
2150 if(UTF16.isSurrogate((char)c1)) {
\r
2153 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
\r
2154 if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
\r
2155 /* advance ++s1; only below if cp1 decomposes/case-folds */
\r
2156 cp1=Character.toCodePoint((char)c1, c);
\r
2158 } else /* isTrail(c1) */ {
\r
2159 if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
\r
2160 cp1=Character.toCodePoint(c, (char)c1);
\r
2166 if(UTF16.isSurrogate((char)c2)) {
\r
2169 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
\r
2170 if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
\r
2171 /* advance ++s2; only below if cp2 decomposes/case-folds */
\r
2172 cp2=Character.toCodePoint((char)c2, c);
\r
2174 } else /* isTrail(c2) */ {
\r
2175 if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
\r
2176 cp2=Character.toCodePoint(c, (char)c2);
\r
2182 * go down one level for each string
\r
2183 * continue with the main loop as soon as there is a real change
\r
2186 if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
\r
2187 (length=csp.toFullFolding(cp1, fold1, options))>=0
\r
2189 /* cp1 case-folds to the code point "length" or to p[length] */
\r
2190 if(UTF16.isSurrogate((char)c1)) {
\r
2191 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
\r
2192 /* advance beyond source surrogate pair if it case-folds */
\r
2194 } else /* isTrail(c1) */ {
\r
2196 * we got a supplementary code point when hitting its trail surrogate,
\r
2197 * therefore the lead surrogate must have been the same as in the other string;
\r
2198 * compare this decomposition with the lead surrogate in the other string
\r
2199 * remember that this simulates bulk text replacement:
\r
2200 * the decomposition would replace the entire code point
\r
2203 c2=cs2.charAt(s2-1);
\r
2207 /* push current level pointers */
\r
2208 if(stack1==null) {
\r
2209 stack1=createCmpEquivLevelStack();
\r
2215 /* copy the folding result to fold1[] */
\r
2216 /* Java: the buffer was probably not empty, remove the old contents */
\r
2217 if(length<=UCaseProps.MAX_STRING_LENGTH) {
\r
2218 fold1.delete(0, fold1.length()-length);
\r
2220 fold1.setLength(0);
\r
2221 fold1.appendCodePoint(length);
\r
2224 /* set next level pointers to case folding */
\r
2227 limit1=fold1.length();
\r
2229 /* get ready to read from decomposition, continue with loop */
\r
2234 if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
\r
2235 (length=csp.toFullFolding(cp2, fold2, options))>=0
\r
2237 /* cp2 case-folds to the code point "length" or to p[length] */
\r
2238 if(UTF16.isSurrogate((char)c2)) {
\r
2239 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
\r
2240 /* advance beyond source surrogate pair if it case-folds */
\r
2242 } else /* isTrail(c2) */ {
\r
2244 * we got a supplementary code point when hitting its trail surrogate,
\r
2245 * therefore the lead surrogate must have been the same as in the other string;
\r
2246 * compare this decomposition with the lead surrogate in the other string
\r
2247 * remember that this simulates bulk text replacement:
\r
2248 * the decomposition would replace the entire code point
\r
2251 c1=cs1.charAt(s1-1);
\r
2255 /* push current level pointers */
\r
2256 if(stack2==null) {
\r
2257 stack2=createCmpEquivLevelStack();
\r
2263 /* copy the folding result to fold2[] */
\r
2264 /* Java: the buffer was probably not empty, remove the old contents */
\r
2265 if(length<=UCaseProps.MAX_STRING_LENGTH) {
\r
2266 fold2.delete(0, fold2.length()-length);
\r
2268 fold2.setLength(0);
\r
2269 fold2.appendCodePoint(length);
\r
2272 /* set next level pointers to case folding */
\r
2275 limit2=fold2.length();
\r
2277 /* get ready to read from decomposition, continue with loop */
\r
2282 if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
\r
2283 (decomp1=nfcImpl.getDecomposition(cp1))!=null
\r
2285 /* cp1 decomposes into p[length] */
\r
2286 if(UTF16.isSurrogate((char)c1)) {
\r
2287 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
\r
2288 /* advance beyond source surrogate pair if it decomposes */
\r
2290 } else /* isTrail(c1) */ {
\r
2292 * we got a supplementary code point when hitting its trail surrogate,
\r
2293 * therefore the lead surrogate must have been the same as in the other string;
\r
2294 * compare this decomposition with the lead surrogate in the other string
\r
2295 * remember that this simulates bulk text replacement:
\r
2296 * the decomposition would replace the entire code point
\r
2299 c2=cs2.charAt(s2-1);
\r
2303 /* push current level pointers */
\r
2304 if(stack1==null) {
\r
2305 stack1=createCmpEquivLevelStack();
\r
2307 stack1[level1].cs=cs1;
\r
2308 stack1[level1].s=s1;
\r
2311 /* set empty intermediate level if skipped */
\r
2313 stack1[level1++].cs=null;
\r
2316 /* set next level pointers to decomposition */
\r
2319 limit1=decomp1.length();
\r
2321 /* get ready to read from decomposition, continue with loop */
\r
2326 if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
\r
2327 (decomp2=nfcImpl.getDecomposition(cp2))!=null
\r
2329 /* cp2 decomposes into p[length] */
\r
2330 if(UTF16.isSurrogate((char)c2)) {
\r
2331 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
\r
2332 /* advance beyond source surrogate pair if it decomposes */
\r
2334 } else /* isTrail(c2) */ {
\r
2336 * we got a supplementary code point when hitting its trail surrogate,
\r
2337 * therefore the lead surrogate must have been the same as in the other string;
\r
2338 * compare this decomposition with the lead surrogate in the other string
\r
2339 * remember that this simulates bulk text replacement:
\r
2340 * the decomposition would replace the entire code point
\r
2343 c1=cs1.charAt(s1-1);
\r
2347 /* push current level pointers */
\r
2348 if(stack2==null) {
\r
2349 stack2=createCmpEquivLevelStack();
\r
2351 stack2[level2].cs=cs2;
\r
2352 stack2[level2].s=s2;
\r
2355 /* set empty intermediate level if skipped */
\r
2357 stack2[level2++].cs=null;
\r
2360 /* set next level pointers to decomposition */
\r
2363 limit2=decomp2.length();
\r
2365 /* get ready to read from decomposition, continue with loop */
\r
2371 * no decomposition/case folding, max level for both sides:
\r
2372 * return difference result
\r
2374 * code point order comparison must not just return cp1-cp2
\r
2375 * because when single surrogates are present then the surrogate pairs
\r
2376 * that formed cp1 and cp2 may be from different string indexes
\r
2378 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
\r
2379 * c1=d800 cp1=10001 c2=dc00 cp2=10000
\r
2380 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
\r
2382 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
\r
2383 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
\r
2384 * so we have slightly different pointer/start/limit comparisons here
\r
2387 if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
\r
2388 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
\r
2390 (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
\r
2391 (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
\r
2393 /* part of a surrogate pair, leave >=d800 */
\r
2395 /* BMP code point - may be surrogate code point - make <d800 */
\r
2400 (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
\r
2401 (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
\r
2403 /* part of a surrogate pair, leave >=d800 */
\r
2405 /* BMP code point - may be surrogate code point - make <d800 */
\r
2415 * An Appendable that writes into a char array with a capacity that may be
\r
2416 * less than array.length.
\r
2417 * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
\r
2419 * An overflow is only reported at the end, for the old Normalizer API functions that write
\r
2422 private static final class CharsAppendable implements Appendable {
\r
2423 public CharsAppendable(char[] dest, int destStart, int destLimit) {
\r
2425 start=offset=destStart;
\r
2428 public int length() {
\r
2429 int len=offset-start;
\r
2430 if(offset<=limit) {
\r
2433 throw new IndexOutOfBoundsException(Integer.toString(len));
\r
2436 public Appendable append(char c) {
\r
2437 if(offset<limit) {
\r
2443 public Appendable append(CharSequence s) {
\r
2444 return append(s, 0, s.length());
\r
2446 public Appendable append(CharSequence s, int sStart, int sLimit) {
\r
2447 int len=sLimit-sStart;
\r
2448 if(len<=(limit-offset)) {
\r
2449 while(sStart<sLimit) { // TODO: Is there a better way to copy the characters?
\r
2450 chars[offset++]=s.charAt(sStart++);
\r
2458 private final char[] chars;
\r
2459 private final int start, limit;
\r
2460 private int offset;
\r