jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/Normalizer.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 2000-2012, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8 import java.nio.CharBuffer;
   9 import java.text.CharacterIterator;
  10
  11 import com.ibm.icu.impl.Norm2AllModes;
  12 import com.ibm.icu.impl.Normalizer2Impl;
  13 import com.ibm.icu.impl.UCaseProps;
  14 import com.ibm.icu.lang.UCharacter;
  15
  16 /**
  17  * Unicode Normalization
  18  *
  19  * <h2>Unicode normalization API</h2>
  20  *
  21  * <code>normalize</code> transforms Unicode text into an equivalent composed or
  22  * decomposed form, allowing for easier sorting and searching of text.
  23  * <code>normalize</code> supports the standard normalization forms described in
  24  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
  25  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
  26  *
  27  * Characters with accents or other adornments can be encoded in
  28  * several different ways in Unicode.  For example, take the character A-acute.
  29  * In Unicode, this can be encoded as a single character (the
  30  * "composed" form):
  31  *
  32  * <pre>
  33  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
  34  * </pre>
  35  *
  36  * or as two separate characters (the "decomposed" form):
  37  *
  38  * <pre>
  39  *      0041    LATIN CAPITAL LETTER A
  40  *      0301    COMBINING ACUTE ACCENT
  41  * </pre>
  42  *
  43  * To a user of your program, however, both of these sequences should be
  44  * treated as the same "user-level" character "A with acute accent".  When you
  45  * are searching or comparing text, you must ensure that these two sequences are
  46  * treated equivalently.  In addition, you must handle characters with more than
  47  * one accent.  Sometimes the order of a character's combining accents is
  48  * significant, while in other cases accent sequences in different orders are
  49  * really equivalent.
  50  *
  51  * Similarly, the string "ffi" can be encoded as three separate letters:
  52  *
  53  * <pre>
  54  *      0066    LATIN SMALL LETTER F
  55  *      0066    LATIN SMALL LETTER F
  56  *      0069    LATIN SMALL LETTER I
  57  * </pre>
  58  *
  59  * or as the single character
  60  *
  61  * <pre>
  62  *      FB03    LATIN SMALL LIGATURE FFI
  63  * </pre>
  64  *
  65  * The ffi ligature is not a distinct semantic character, and strictly speaking
  66  * it shouldn't be in Unicode at all, but it was included for compatibility
  67  * with existing character sets that already provided it.  The Unicode standard
  68  * identifies such characters by giving them "compatibility" decompositions
  69  * into the corresponding semantic characters.  When sorting and searching, you
  70  * will often want to use these mappings.
  71  *
  72  * <code>normalize</code> helps solve these problems by transforming text into
  73  * the canonical composed and decomposed forms as shown in the first example
  74  * above. In addition, you can have it perform compatibility decompositions so
  75  * that you can treat compatibility characters the same as their equivalents.
  76  * Finally, <code>normalize</code> rearranges accents into the proper canonical
  77  * order, so that you do not have to worry about accent rearrangement on your
  78  * own.
  79  *
  80  * Form FCD, "Fast C or D", is also designed for collation.
  81  * It allows working on strings that are not necessarily normalized
  82  * with an algorithm (like in collation) that works under "canonical closure",
  83  * i.e., it treats precomposed characters and their decomposed equivalents the
  84  * same.
  85  *
  86  * It is not a normalization form because it does not provide for uniqueness of
  87  * representation. Multiple strings may be canonically equivalent (their NFDs
  88  * are identical) and may all conform to FCD without being identical themselves.
  89  *
  90  * The form is defined such that the "raw decomposition", the recursive
  91  * canonical decomposition of each character, results in a string that is
  92  * canonically ordered. This means that precomposed characters are allowed for
  93  * as long as their decompositions do not need canonical reordering.
  94  *
  95  * Its advantage for a process like collation is that all NFD and most NFC texts
  96  * - and many unnormalized texts - already conform to FCD and do not need to be
  97  * normalized (NFD) for such a process. The FCD quick check will return YES for
  98  * most strings in practice.
  99  *
 100  * normalize(FCD) may be implemented with NFD.
 101  *
 102  * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
 103  * http://www.unicode.org/notes/tn5/#FCD
 104  *
 105  * ICU collation performs either NFD or FCD normalization automatically if
 106  * normalization is turned on for the collator object. Beyond collation and
 107  * string search, normalized strings may be useful for string equivalence
 108  * comparisons, transliteration/transcription, unique representations, etc.
 109  *
 110  * The W3C generally recommends exchanging text in NFC.
 111  * Note also that most legacy character encodings use only precomposed forms and
 112  * often do not encode any combining marks by themselves. For conversion to such
 113  * character encodings the Unicode text needs to be normalized to NFC.
 114  * For more usage examples, see the Unicode Standard Annex.
 115  *
 116  * Note: The Normalizer class also provides API for iterative normalization.
 117  * While the setIndex() and getIndex() refer to indices in the
 118  * underlying Unicode input text, the next() and previous() methods
 119  * iterate through characters in the normalized output.
 120  * This means that there is not necessarily a one-to-one correspondence
 121  * between characters returned by next() and previous() and the indices
 122  * passed to and returned from setIndex() and getIndex().
 123  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
 124  *
 125  * @stable ICU 2.8
 126  */
 127 public final class Normalizer implements Cloneable {
 128     // The input text and our position in it
 129     private UCharacterIterator  text;
 130     private Normalizer2         norm2;
 131     private Mode                mode;
 132     private int                 options;
 133
 134     // The normalization buffer is the result of normalization
 135     // of the source in [currentIndex..nextIndex[ .
 136     private int                 currentIndex;
 137     private int                 nextIndex;
 138
 139     // A buffer for holding intermediate results
 140     private StringBuilder       buffer;
 141     private int                 bufferPos;
 142
 143     // Helper classes to defer loading of normalization data.
 144     private static final class ModeImpl {
 145         private ModeImpl(Normalizer2 n2) {
 146             normalizer2 = n2;
 147         }
 148         private final Normalizer2 normalizer2;
 149     }
 150     private static final class NFDModeImpl {
 151         private static final ModeImpl INSTANCE =
 152             new ModeImpl(Norm2AllModes.getNFCInstance().decomp);
 153     }
 154     private static final class NFKDModeImpl {
 155         private static final ModeImpl INSTANCE =
 156             new ModeImpl(Norm2AllModes.getNFKCInstance().decomp);
 157     }
 158     private static final class NFCModeImpl {
 159         private static final ModeImpl INSTANCE =
 160             new ModeImpl(Norm2AllModes.getNFCInstance().comp);
 161     }
 162     private static final class NFKCModeImpl {
 163         private static final ModeImpl INSTANCE =
 164             new ModeImpl(Norm2AllModes.getNFKCInstance().comp);
 165     }
 166     private static final class FCDModeImpl {
 167         private static final ModeImpl INSTANCE =
 168             new ModeImpl(Norm2AllModes.getFCDNormalizer2());
 169     }
 170
 171     private static final class Unicode32 {
 172         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
 173     }
 174     private static final class NFD32ModeImpl {
 175         private static final ModeImpl INSTANCE =
 176             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().decomp,
 177                                                  Unicode32.INSTANCE));
 178     }
 179     private static final class NFKD32ModeImpl {
 180         private static final ModeImpl INSTANCE =
 181             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().decomp,
 182                                                  Unicode32.INSTANCE));
 183     }
 184     private static final class NFC32ModeImpl {
 185         private static final ModeImpl INSTANCE =
 186             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().comp,
 187                                                  Unicode32.INSTANCE));
 188     }
 189     private static final class NFKC32ModeImpl {
 190         private static final ModeImpl INSTANCE =
 191             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().comp,
 192                                                  Unicode32.INSTANCE));
 193     }
 194     private static final class FCD32ModeImpl {
 195         private static final ModeImpl INSTANCE =
 196             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
 197                                                  Unicode32.INSTANCE));
 198     }
 199
 200     /**
 201      * Options bit set value to select Unicode 3.2 normalization
 202      * (except NormalizationCorrections).
 203      * At most one Unicode version can be selected at a time.
 204      * @stable ICU 2.6
 205      */
 206     public static final int UNICODE_3_2=0x20;
 207
 208     /**
 209      * Constant indicating that the end of the iteration has been reached.
 210      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
 211      * @stable ICU 2.8
 212      */
 213     public static final int DONE = UCharacterIterator.DONE;
 214
 215     /**
 216      * Constants for normalization modes.
 217      * <p>
 218      * The Mode class is not intended for public subclassing.
 219      * Only the Mode constants provided by the Normalizer class should be used,
 220      * and any fields or methods should not be called or overridden by users.
 221      * @stable ICU 2.8
 222      */
 223     public static abstract class Mode {
 224         /**
 225          * @internal
 226          * @deprecated This API is ICU internal only.
 227          */
 228         protected abstract Normalizer2 getNormalizer2(int options);
 229     }
 230
 231     private static final class NONEMode extends Mode {
 232         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
 233     }
 234     private static final class NFDMode extends Mode {
 235         protected Normalizer2 getNormalizer2(int options) {
 236             return (options&UNICODE_3_2) != 0 ?
 237                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
 238         }
 239     }
 240     private static final class NFKDMode extends Mode {
 241         protected Normalizer2 getNormalizer2(int options) {
 242             return (options&UNICODE_3_2) != 0 ?
 243                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
 244         }
 245     }
 246     private static final class NFCMode extends Mode {
 247         protected Normalizer2 getNormalizer2(int options) {
 248             return (options&UNICODE_3_2) != 0 ?
 249                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
 250         }
 251     }
 252     private static final class NFKCMode extends Mode {
 253         protected Normalizer2 getNormalizer2(int options) {
 254             return (options&UNICODE_3_2) != 0 ?
 255                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
 256         }
 257     }
 258     private static final class FCDMode extends Mode {
 259         protected Normalizer2 getNormalizer2(int options) {
 260             return (options&UNICODE_3_2) != 0 ?
 261                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
 262         }
 263     }
 264
 265     /**
 266      * No decomposition/composition.
 267      * @stable ICU 2.8
 268      */
 269     public static final Mode NONE = new NONEMode();
 270
 271     /**
 272      * Canonical decomposition.
 273      * @stable ICU 2.8
 274      */
 275     public static final Mode NFD = new NFDMode();
 276
 277     /**
 278      * Compatibility decomposition.
 279      * @stable ICU 2.8
 280      */
 281     public static final Mode NFKD = new NFKDMode();
 282
 283     /**
 284      * Canonical decomposition followed by canonical composition.
 285      * @stable ICU 2.8
 286      */
 287     public static final Mode NFC = new NFCMode();
 288
 289     /**
 290      * Default normalization.
 291      * @stable ICU 2.8
 292      */
 293     public static final Mode DEFAULT = NFC;
 294
 295     /**
 296      * Compatibility decomposition followed by canonical composition.
 297      * @stable ICU 2.8
 298      */
 299     public static final Mode NFKC =new NFKCMode();
 300
 301     /**
 302      * "Fast C or D" form.
 303      * @stable ICU 2.8
 304      */
 305     public static final Mode FCD = new FCDMode();
 306
 307     /**
 308      * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
 309      * and the static {@link #normalize normalize} method.  This value tells
 310      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
 311      * from the underlying String or CharacterIterator.  If you have code which
 312      * requires raw text at some times and normalized text at others, you can
 313      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
 314      * than having a separate code path that bypasses <tt>Normalizer</tt>
 315      * altogether.
 316      * <p>
 317      * @see #setMode
 318      * @deprecated ICU 2.8. Use Nomalizer.NONE
 319      * @see #NONE
 320      */
 321     public static final Mode NO_OP = NONE;
 322
 323     /**
 324      * Canonical decomposition followed by canonical composition.  Used with the
 325      * {@link com.ibm.icu.text.Normalizer constructors} and the static
 326      * {@link #normalize normalize} method to determine the operation to be
 327      * performed.
 328      * <p>
 329      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
 330      * off, this operation produces output that is in
 331      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
 332      * Form</a>
 333      * <b>C</b>.
 334      * <p>
 335      * @see #setMode
 336      * @deprecated ICU 2.8. Use Normalier.NFC
 337      * @see #NFC
 338      */
 339     public static final Mode COMPOSE = NFC;
 340
 341     /**
 342      * Compatibility decomposition followed by canonical composition.
 343      * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
 344      * {@link #normalize normalize} method to determine the operation to be
 345      * performed.
 346      * <p>
 347      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
 348      * off, this operation produces output that is in
 349      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
 350      * Form</a>
 351      * <b>KC</b>.
 352      * <p>
 353      * @see #setMode
 354      * @deprecated ICU 2.8. Use Normalizer.NFKC
 355      * @see #NFKC
 356      */
 357     public static final Mode COMPOSE_COMPAT = NFKC;
 358
 359     /**
 360      * Canonical decomposition.  This value is passed to the
 361      * {@link com.ibm.icu.text.Normalizer constructors} and the static
 362      * {@link #normalize normalize}
 363      * method to determine the operation to be performed.
 364      * <p>
 365      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
 366      * off, this operation produces output that is in
 367      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
 368      * Form</a>
 369      * <b>D</b>.
 370      * <p>
 371      * @see #setMode
 372      * @deprecated ICU 2.8. Use Normalizer.NFD
 373      * @see #NFD
 374      */
 375     public static final Mode DECOMP = NFD;
 376
 377     /**
 378      * Compatibility decomposition.  This value is passed to the
 379      * {@link com.ibm.icu.text.Normalizer constructors} and the static
 380      * {@link #normalize normalize}
 381      * method to determine the operation to be performed.
 382      * <p>
 383      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
 384      * off, this operation produces output that is in
 385      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
 386      * Form</a>
 387      * <b>KD</b>.
 388      * <p>
 389      * @see #setMode
 390      * @deprecated ICU 2.8. Use Normalizer.NFKD
 391      * @see #NFKD
 392      */
 393     public static final Mode DECOMP_COMPAT = NFKD;
 394
 395     /**
 396      * Option to disable Hangul/Jamo composition and decomposition.
 397      * This option applies to Korean text,
 398      * which can be represented either in the Jamo alphabet or in Hangul
 399      * characters, which are really just two or three Jamo combined
 400      * into one visual glyph.  Since Jamo takes up more storage space than
 401      * Hangul, applications that process only Hangul text may wish to turn
 402      * this option on when decomposing text.
 403      * <p>
 404      * The Unicode standard treates Hangul to Jamo conversion as a
 405      * canonical decomposition, so this option must be turned <b>off</b> if you
 406      * wish to transform strings into one of the standard
 407      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
 408      * Unicode Normalization Forms</a>.
 409      * <p>
 410      * @see #setOption
 411      * @deprecated ICU 2.8. This option is no longer supported.
 412      */
 413     public static final int IGNORE_HANGUL = 0x0001;
 414
 415     /**
 416      * Result values for quickCheck().
 417      * For details see Unicode Technical Report 15.
 418      * @stable ICU 2.8
 419      */
 420     public static final class QuickCheckResult{
 421         //private int resultValue;
 422         private QuickCheckResult(int value) {
 423             //resultValue=value;
 424         }
 425     }
 426     /**
 427      * Indicates that string is not in the normalized format
 428      * @stable ICU 2.8
 429      */
 430     public static final QuickCheckResult NO = new QuickCheckResult(0);
 431
 432     /**
 433      * Indicates that string is in the normalized format
 434      * @stable ICU 2.8
 435      */
 436     public static final QuickCheckResult YES = new QuickCheckResult(1);
 437
 438     /**
 439      * Indicates it cannot be determined if string is in the normalized
 440      * format without further thorough checks.
 441      * @stable ICU 2.8
 442      */
 443     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
 444
 445     /**
 446      * Option bit for compare:
 447      * Case sensitively compare the strings
 448      * @stable ICU 2.8
 449      */
 450     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
 451
 452     /**
 453      * Option bit for compare:
 454      * Both input strings are assumed to fulfill FCD conditions.
 455      * @stable ICU 2.8
 456      */
 457     public static final int INPUT_IS_FCD    =      0x20000;
 458
 459     /**
 460      * Option bit for compare:
 461      * Perform case-insensitive comparison.
 462      * @stable ICU 2.8
 463      */
 464     public static final int COMPARE_IGNORE_CASE  =     0x10000;
 465
 466     /**
 467      * Option bit for compare:
 468      * Compare strings in code point order instead of code unit order.
 469      * @stable ICU 2.8
 470      */
 471     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
 472
 473     /**
 474      * Option value for case folding:
 475      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
 476      * and dotless i appropriately for Turkic languages (tr, az).
 477      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
 478      * @stable ICU 2.8
 479      */
 480     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
 481
 482     /**
 483      * Lowest-order bit number of compare() options bits corresponding to
 484      * normalization options bits.
 485      *
 486      * The options parameter for compare() uses most bits for
 487      * itself and for various comparison and folding flags.
 488      * The most significant bits, however, are shifted down and passed on
 489      * to the normalization implementation.
 490      * (That is, from compare(..., options, ...),
 491      * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
 492      * internal normalization functions.)
 493      *
 494      * @see #compare
 495      * @stable ICU 2.6
 496      */
 497     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
 498
 499     //-------------------------------------------------------------------------
 500     // Iterator constructors
 501     //-------------------------------------------------------------------------
 502
 503     /**
 504      * Creates a new <tt>Normalizer</tt> object for iterating over the
 505      * normalized form of a given string.
 506      * <p>
 507      * The <tt>options</tt> parameter specifies which optional
 508      * <tt>Normalizer</tt> features are to be enabled for this object.
 509      * <p>
 510      * @param str  The string to be normalized.  The normalization
 511      *              will start at the beginning of the string.
 512      *
 513      * @param mode The normalization mode.
 514      *
 515      * @param opt Any optional features to be enabled.
 516      *            Currently the only available option is {@link #UNICODE_3_2}.
 517      *            If you want the default behavior corresponding to one of the
 518      *            standard Unicode Normalization Forms, use 0 for this argument.
 519      * @stable ICU 2.6
 520      */
 521     public Normalizer(String str, Mode mode, int opt) {
 522         this.text = UCharacterIterator.getInstance(str);
 523         this.mode = mode;
 524         this.options=opt;
 525         norm2 = mode.getNormalizer2(opt);
 526         buffer = new StringBuilder();
 527     }
 528
 529     /**
 530      * Creates a new <tt>Normalizer</tt> object for iterating over the
 531      * normalized form of the given text.
 532      * <p>
 533      * @param iter  The input text to be normalized.  The normalization
 534      *              will start at the beginning of the string.
 535      *
 536      * @param mode  The normalization mode.
 537      *
 538      * @param opt Any optional features to be enabled.
 539      *            Currently the only available option is {@link #UNICODE_3_2}.
 540      *            If you want the default behavior corresponding to one of the
 541      *            standard Unicode Normalization Forms, use 0 for this argument.
 542      * @stable ICU 2.6
 543      */
 544     public Normalizer(CharacterIterator iter, Mode mode, int opt) {
 545         this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
 546         this.mode = mode;
 547         this.options = opt;
 548         norm2 = mode.getNormalizer2(opt);
 549         buffer = new StringBuilder();
 550     }
 551
 552     /**
 553      * Creates a new <tt>Normalizer</tt> object for iterating over the
 554      * normalized form of the given text.
 555      * <p>
 556      * @param iter  The input text to be normalized.  The normalization
 557      *              will start at the beginning of the string.
 558      *
 559      * @param mode  The normalization mode.
 560      * @param options The normalization options, ORed together (0 for no options).
 561      * @stable ICU 2.6
 562      */
 563     public Normalizer(UCharacterIterator iter, Mode mode, int options) {
 564         try {
 565             this.text     = (UCharacterIterator)iter.clone();
 566             this.mode     = mode;
 567             this.options  = options;
 568             norm2 = mode.getNormalizer2(options);
 569             buffer = new StringBuilder();
 570         } catch (CloneNotSupportedException e) {
 571             throw new IllegalStateException(e.toString());
 572         }
 573     }
 574
 575     /**
 576      * Clones this <tt>Normalizer</tt> object.  All properties of this
 577      * object are duplicated in the new object, including the cloning of any
 578      * {@link CharacterIterator} that was passed in to the constructor
 579      * or to {@link #setText(CharacterIterator) setText}.
 580      * However, the text storage underlying
 581      * the <tt>CharacterIterator</tt> is not duplicated unless the
 582      * iterator's <tt>clone</tt> method does so.
 583      * @stable ICU 2.8
 584      */
 585     public Object clone() {
 586         try {
 587             Normalizer copy = (Normalizer) super.clone();
 588             copy.text = (UCharacterIterator) text.clone();
 589             copy.mode = mode;
 590             copy.options = options;
 591             copy.norm2 = norm2;
 592             copy.buffer = new StringBuilder(buffer);
 593             copy.bufferPos = bufferPos;
 594             copy.currentIndex = currentIndex;
 595             copy.nextIndex = nextIndex;
 596             return copy;
 597         }
 598         catch (CloneNotSupportedException e) {
 599             throw new IllegalStateException(e);
 600         }
 601     }
 602
 603     //--------------------------------------------------------------------------
 604     // Static Utility methods
 605     //--------------------------------------------------------------------------
 606
 607     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
 608         return (compat ? NFKC : NFC).getNormalizer2(options);
 609     }
 610     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
 611         return (compat ? NFKD : NFD).getNormalizer2(options);
 612     }
 613
 614     /**
 615      * Compose a string.
 616      * The string will be composed to according to the specified mode.
 617      * @param str        The string to compose.
 618      * @param compat     If true the string will be composed according to
 619      *                    NFKC rules and if false will be composed according to
 620      *                    NFC rules.
 621      * @return String    The composed string
 622      * @stable ICU 2.8
 623      */
 624     public static String compose(String str, boolean compat) {
 625         return compose(str,compat,0);
 626     }
 627
 628     /**
 629      * Compose a string.
 630      * The string will be composed to according to the specified mode.
 631      * @param str        The string to compose.
 632      * @param compat     If true the string will be composed according to
 633      *                    NFKC rules and if false will be composed according to
 634      *                    NFC rules.
 635      * @param options    The only recognized option is UNICODE_3_2
 636      * @return String    The composed string
 637      * @stable ICU 2.6
 638      */
 639     public static String compose(String str, boolean compat, int options) {
 640         return getComposeNormalizer2(compat, options).normalize(str);
 641     }
 642
 643     /**
 644      * Compose a string.
 645      * The string will be composed to according to the specified mode.
 646      * @param source The char array to compose.
 647      * @param target A char buffer to receive the normalized text.
 648      * @param compat If true the char array will be composed according to
 649      *                NFKC rules and if false will be composed according to
 650      *                NFC rules.
 651      * @param options The normalization options, ORed together (0 for no options).
 652      * @return int   The total buffer size needed;if greater than length of
 653      *                result, the output was truncated.
 654      * @exception IndexOutOfBoundsException if target.length is less than the
 655      *             required length
 656      * @stable ICU 2.6
 657      */
 658     public static int compose(char[] source,char[] target, boolean compat, int options) {
 659         return compose(source, 0, source.length, target, 0, target.length, compat, options);
 660     }
 661
 662     /**
 663      * Compose a string.
 664      * The string will be composed to according to the specified mode.
 665      * @param src       The char array to compose.
 666      * @param srcStart  Start index of the source
 667      * @param srcLimit  Limit index of the source
 668      * @param dest      The char buffer to fill in
 669      * @param destStart Start index of the destination buffer
 670      * @param destLimit End index of the destination buffer
 671      * @param compat If true the char array will be composed according to
 672      *                NFKC rules and if false will be composed according to
 673      *                NFC rules.
 674      * @param options The normalization options, ORed together (0 for no options).
 675      * @return int   The total buffer size needed;if greater than length of
 676      *                result, the output was truncated.
 677      * @exception IndexOutOfBoundsException if target.length is less than the
 678      *             required length
 679      * @stable ICU 2.6
 680      */
 681     public static int compose(char[] src,int srcStart, int srcLimit,
 682                               char[] dest,int destStart, int destLimit,
 683                               boolean compat, int options) {
 684         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
 685         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
 686         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
 687         return app.length();
 688     }
 689
 690     /**
 691      * Decompose a string.
 692      * The string will be decomposed to according to the specified mode.
 693      * @param str       The string to decompose.
 694      * @param compat    If true the string will be decomposed according to NFKD
 695      *                   rules and if false will be decomposed according to NFD
 696      *                   rules.
 697      * @return String   The decomposed string
 698      * @stable ICU 2.8
 699      */
 700     public static String decompose(String str, boolean compat) {
 701         return decompose(str,compat,0);
 702     }
 703
 704     /**
 705      * Decompose a string.
 706      * The string will be decomposed to according to the specified mode.
 707      * @param str     The string to decompose.
 708      * @param compat  If true the string will be decomposed according to NFKD
 709      *                 rules and if false will be decomposed according to NFD
 710      *                 rules.
 711      * @param options The normalization options, ORed together (0 for no options).
 712      * @return String The decomposed string
 713      * @stable ICU 2.6
 714      */
 715     public static String decompose(String str, boolean compat, int options) {
 716         return getDecomposeNormalizer2(compat, options).normalize(str);
 717     }
 718
 719     /**
 720      * Decompose a string.
 721      * The string will be decomposed according to the specified mode.
 722      * @param source The char array to decompose.
 723      * @param target A char buffer to receive the normalized text.
 724      * @param compat If true the char array will be decomposed according to NFKD
 725      *                rules and if false will be decomposed according to
 726      *                NFD rules.
 727      * @return int   The total buffer size needed;if greater than length of
 728      *                result,the output was truncated.
 729      * @param options The normalization options, ORed together (0 for no options).
 730      * @exception IndexOutOfBoundsException if the target capacity is less than
 731      *             the required length
 732      * @stable ICU 2.6
 733      */
 734     public static int decompose(char[] source,char[] target, boolean compat, int options) {
 735         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
 736     }
 737
 738     /**
 739      * Decompose a string.
 740      * The string will be decomposed according to the specified mode.
 741      * @param src       The char array to decompose.
 742      * @param srcStart  Start index of the source
 743      * @param srcLimit  Limit index of the source
 744      * @param dest      The char buffer to fill in
 745      * @param destStart Start index of the destination buffer
 746      * @param destLimit End index of the destination buffer
 747      * @param compat If true the char array will be decomposed according to NFKD
 748      *                rules and if false will be decomposed according to
 749      *                NFD rules.
 750      * @param options The normalization options, ORed together (0 for no options).
 751      * @return int   The total buffer size needed;if greater than length of
 752      *                result,the output was truncated.
 753      * @exception IndexOutOfBoundsException if the target capacity is less than
 754      *             the required length
 755      * @stable ICU 2.6
 756      */
 757     public static int decompose(char[] src,int srcStart, int srcLimit,
 758                                 char[] dest,int destStart, int destLimit,
 759                                 boolean compat, int options) {
 760         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
 761         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
 762         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
 763         return app.length();
 764     }
 765
 766     /**
 767      * Normalizes a <tt>String</tt> using the given normalization operation.
 768      * <p>
 769      * The <tt>options</tt> parameter specifies which optional
 770      * <tt>Normalizer</tt> features are to be enabled for this operation.
 771      * Currently the only available option is {@link #UNICODE_3_2}.
 772      * If you want the default behavior corresponding to one of the standard
 773      * Unicode Normalization Forms, use 0 for this argument.
 774      * <p>
 775      * @param str       the input string to be normalized.
 776      * @param mode      the normalization mode
 777      * @param options   the optional features to be enabled.
 778      * @return String   the normalized string
 779      * @stable ICU 2.6
 780      */
 781     public static String normalize(String str, Mode mode, int options) {
 782         return mode.getNormalizer2(options).normalize(str);
 783     }
 784
 785     /**
 786      * Normalize a string.
 787      * The string will be normalized according to the specified normalization
 788      * mode and options.
 789      * @param src        The string to normalize.
 790      * @param mode       The normalization mode; one of Normalizer.NONE,
 791      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
 792      *                    Normalizer.NFKD, Normalizer.DEFAULT
 793      * @return the normalized string
 794      * @stable ICU 2.8
 795      *
 796      */
 797     public static String normalize(String src,Mode mode) {
 798         return normalize(src, mode, 0);
 799     }
 800     /**
 801      * Normalize a string.
 802      * The string will be normalized according to the specified normalization
 803      * mode and options.
 804      * @param source The char array to normalize.
 805      * @param target A char buffer to receive the normalized text.
 806      * @param mode   The normalization mode; one of Normalizer.NONE,
 807      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
 808      *                Normalizer.NFKD, Normalizer.DEFAULT
 809      * @param options The normalization options, ORed together (0 for no options).
 810      * @return int   The total buffer size needed;if greater than length of
 811      *                result, the output was truncated.
 812      * @exception    IndexOutOfBoundsException if the target capacity is less
 813      *                than the required length
 814      * @stable ICU 2.6
 815      */
 816     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
 817         return normalize(source,0,source.length,target,0,target.length,mode, options);
 818     }
 819
 820     /**
 821      * Normalize a string.
 822      * The string will be normalized according to the specified normalization
 823      * mode and options.
 824      * @param src       The char array to normalize.
 825      * @param srcStart  Start index of the source
 826      * @param srcLimit  Limit index of the source
 827      * @param dest      The char buffer to fill in
 828      * @param destStart Start index of the destination buffer
 829      * @param destLimit End index of the destination buffer
 830      * @param mode      The normalization mode; one of Normalizer.NONE,
 831      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
 832      *                   Normalizer.NFKD, Normalizer.DEFAULT
 833      * @param options The normalization options, ORed together (0 for no options).
 834      * @return int      The total buffer size needed;if greater than length of
 835      *                   result, the output was truncated.
 836      * @exception       IndexOutOfBoundsException if the target capacity is
 837      *                   less than the required length
 838      * @stable ICU 2.6
 839      */
 840     public static int normalize(char[] src,int srcStart, int srcLimit,
 841                                 char[] dest,int destStart, int destLimit,
 842                                 Mode  mode, int options) {
 843         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
 844         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
 845         mode.getNormalizer2(options).normalize(srcBuffer, app);
 846         return app.length();
 847     }
 848
 849     /**
 850      * Normalize a codepoint according to the given mode
 851      * @param char32    The input code point to be normalized.
 852      * @param mode      The normalization mode
 853      * @param options   Options for use with exclusion set and tailored Normalization
 854      *                                   The only option that is currently recognized is UNICODE_3_2
 855      * @return String   The normalized string
 856      * @stable ICU 2.6
 857      * @see #UNICODE_3_2
 858      */
 859     public static String normalize(int char32, Mode mode, int options) {
 860         if(mode == NFD && options == 0) {
 861             String decomposition =
 862                 Norm2AllModes.getNFCInstance().impl.getDecomposition(char32);
 863             if(decomposition == null) {
 864                 decomposition = UTF16.valueOf(char32);
 865             }
 866             return decomposition;
 867         }
 868         return normalize(UTF16.valueOf(char32), mode, options);
 869     }
 870
 871     /**
 872      * Convenience method to normalize a codepoint according to the given mode
 873      * @param char32    The input code point to be normalized.
 874      * @param mode      The normalization mode
 875      * @return String   The normalized string
 876      * @stable ICU 2.6
 877      */
 878     public static String normalize(int char32, Mode mode) {
 879         return normalize(char32, mode, 0);
 880     }
 881
 882     /**
 883      * Convenience method.
 884      *
 885      * @param source   string for determining if it is in a normalized format
 886      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
 887      *                  Normalizer.NFKC,Normalizer.NFKD)
 888      * @return         Return code to specify if the text is normalized or not
 889      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
 890      * @stable ICU 2.8
 891      */
 892     public static QuickCheckResult quickCheck(String source, Mode mode) {
 893         return quickCheck(source, mode, 0);
 894     }
 895
 896     /**
 897      * Performing quick check on a string, to quickly determine if the string is
 898      * in a particular normalization format.
 899      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
 900      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
 901      * string is in the desired normalized format, Normalizer.NO determines that
 902      * argument string is not in the desired normalized format. A
 903      * Normalizer.MAYBE result indicates that a more thorough check is required,
 904      * the user may have to put the string in its normalized form and compare
 905      * the results.
 906      *
 907      * @param source   string for determining if it is in a normalized format
 908      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
 909      *                  Normalizer.NFKC,Normalizer.NFKD)
 910      * @param options   Options for use with exclusion set and tailored Normalization
 911      *                                   The only option that is currently recognized is UNICODE_3_2
 912      * @return         Return code to specify if the text is normalized or not
 913      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
 914      * @stable ICU 2.6
 915      */
 916     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
 917         return mode.getNormalizer2(options).quickCheck(source);
 918     }
 919
 920     /**
 921      * Convenience method.
 922      *
 923      * @param source Array of characters for determining if it is in a
 924      *                normalized format
 925      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
 926      *                Normalizer.NFKC,Normalizer.NFKD)
 927      * @param options   Options for use with exclusion set and tailored Normalization
 928      *                                   The only option that is currently recognized is UNICODE_3_2
 929      * @return       Return code to specify if the text is normalized or not
 930      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
 931      * @stable ICU 2.6
 932      */
 933     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
 934         return quickCheck(source, 0, source.length, mode, options);
 935     }
 936
 937     /**
 938      * Performing quick check on a string, to quickly determine if the string is
 939      * in a particular normalization format.
 940      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
 941      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
 942      * string is in the desired normalized format, Normalizer.NO determines that
 943      * argument string is not in the desired normalized format. A
 944      * Normalizer.MAYBE result indicates that a more thorough check is required,
 945      * the user may have to put the string in its normalized form and compare
 946      * the results.
 947      *
 948      * @param source    string for determining if it is in a normalized format
 949      * @param start     the start index of the source
 950      * @param limit     the limit (exclusive end) index of the source, e.g. source.length
 951      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
 952      *                   Normalizer.NFKC,Normalizer.NFKD)
 953      * @param options   Options for use with exclusion set and tailored Normalization
 954      *                                   The only option that is currently recognized is UNICODE_3_2
 955      * @return          Return code to specify if the text is normalized or not
 956      *                   (Normalizer.YES, Normalizer.NO or
 957      *                   Normalizer.MAYBE)
 958      * @stable ICU 2.6
 959      */
 960
 961     public static QuickCheckResult quickCheck(char[] source,int start,
 962                                               int limit, Mode mode,int options) {
 963         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
 964         return mode.getNormalizer2(options).quickCheck(srcBuffer);
 965     }
 966
 967     /**
 968      * Test if a string is in a given normalization form.
 969      * This is semantically equivalent to source.equals(normalize(source, mode)).
 970      *
 971      * Unlike quickCheck(), this function returns a definitive result,
 972      * never a "maybe".
 973      * For NFD, NFKD, and FCD, both functions work exactly the same.
 974      * For NFC and NFKC where quickCheck may return "maybe", this function will
 975      * perform further tests to arrive at a true/false result.
 976      * @param src       The input array of characters to be checked to see if
 977      *                   it is normalized
 978      * @param start     The start index in the source
 979      * @param limit     The limit index in the source
 980      * @param mode      the normalization mode
 981      * @param options   Options for use with exclusion set and tailored Normalization
 982      *                                   The only option that is currently recognized is UNICODE_3_2
 983      * @return Boolean value indicating whether the source string is in the
 984      *         "mode" normalization form
 985      * @stable ICU 2.6
 986      */
 987     public static boolean isNormalized(char[] src,int start,
 988                                        int limit, Mode mode,
 989                                        int options) {
 990         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
 991         return mode.getNormalizer2(options).isNormalized(srcBuffer);
 992     }
 993
 994     /**
 995      * Test if a string is in a given normalization form.
 996      * This is semantically equivalent to source.equals(normalize(source, mode)).
 997      *
 998      * Unlike quickCheck(), this function returns a definitive result,
 999      * never a "maybe".
1000      * For NFD, NFKD, and FCD, both functions work exactly the same.
1001      * For NFC and NFKC where quickCheck may return "maybe", this function will
1002      * perform further tests to arrive at a true/false result.
1003      * @param str       the input string to be checked to see if it is
1004      *                   normalized
1005      * @param mode      the normalization mode
1006      * @param options   Options for use with exclusion set and tailored Normalization
1007      *                  The only option that is currently recognized is UNICODE_3_2
1008      * @see #isNormalized
1009      * @stable ICU 2.6
1010      */
1011     public static boolean isNormalized(String str, Mode mode, int options) {
1012         return mode.getNormalizer2(options).isNormalized(str);
1013     }
1014
1015     /**
1016      * Convenience Method
1017      * @param char32    the input code point to be checked to see if it is
1018      *                   normalized
1019      * @param mode      the normalization mode
1020      * @param options   Options for use with exclusion set and tailored Normalization
1021      *                  The only option that is currently recognized is UNICODE_3_2
1022      *
1023      * @see #isNormalized
1024      * @stable ICU 2.6
1025      */
1026     public static boolean isNormalized(int char32, Mode mode,int options) {
1027         return isNormalized(UTF16.valueOf(char32), mode, options);
1028     }
1029
1030     /**
1031      * Compare two strings for canonical equivalence.
1032      * Further options include case-insensitive comparison and
1033      * code point order (as opposed to code unit order).
1034      *
1035      * Canonical equivalence between two strings is defined as their normalized
1036      * forms (NFD or NFC) being identical.
1037      * This function compares strings incrementally instead of normalizing
1038      * (and optionally case-folding) both strings entirely,
1039      * improving performance significantly.
1040      *
1041      * Bulk normalization is only necessary if the strings do not fulfill the
1042      * FCD conditions. Only in this case, and only if the strings are relatively
1043      * long, is memory allocated temporarily.
1044      * For FCD strings and short non-FCD strings there is no memory allocation.
1045      *
1046      * Semantically, this is equivalent to
1047      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1048      * where code point order and foldCase are all optional.
1049      *
1050      * @param s1        First source character array.
1051      * @param s1Start   start index of source
1052      * @param s1Limit   limit of the source
1053      *
1054      * @param s2        Second source character array.
1055      * @param s2Start   start index of the source
1056      * @param s2Limit   limit of the source
1057      *
1058      * @param options A bit set of options:
1059      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1060      *     Case-sensitive comparison in code unit order, and the input strings
1061      *     are quick-checked for FCD.
1062      *
1063      *   - INPUT_IS_FCD
1064      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1065      *     conditions. If not set, the function will quickCheck for FCD
1066      *     and normalize if necessary.
1067      *
1068      *   - COMPARE_CODE_POINT_ORDER
1069      *     Set to choose code point order instead of code unit order
1070      *
1071      *   - COMPARE_IGNORE_CASE
1072      *     Set to compare strings case-insensitively using case folding,
1073      *     instead of case-sensitively.
1074      *     If set, then the following case folding options are used.
1075      *
1076      *
1077      * @return <0 or 0 or >0 as usual for string comparisons
1078      *
1079      * @see #normalize
1080      * @see #FCD
1081      * @stable ICU 2.8
1082      */
1083     public static int compare(char[] s1, int s1Start, int s1Limit,
1084                               char[] s2, int s2Start, int s2Limit,
1085                               int options) {
1086         if( s1==null || s1Start<0 || s1Limit<0 ||
1087             s2==null || s2Start<0 || s2Limit<0 ||
1088             s1Limit<s1Start || s2Limit<s2Start
1089         ) {
1090             throw new IllegalArgumentException();
1091         }
1092         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1093                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1094                                options);
1095     }
1096
1097     /**
1098      * Compare two strings for canonical equivalence.
1099      * Further options include case-insensitive comparison and
1100      * code point order (as opposed to code unit order).
1101      *
1102      * Canonical equivalence between two strings is defined as their normalized
1103      * forms (NFD or NFC) being identical.
1104      * This function compares strings incrementally instead of normalizing
1105      * (and optionally case-folding) both strings entirely,
1106      * improving performance significantly.
1107      *
1108      * Bulk normalization is only necessary if the strings do not fulfill the
1109      * FCD conditions. Only in this case, and only if the strings are relatively
1110      * long, is memory allocated temporarily.
1111      * For FCD strings and short non-FCD strings there is no memory allocation.
1112      *
1113      * Semantically, this is equivalent to
1114      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1115      * where code point order and foldCase are all optional.
1116      *
1117      * @param s1 First source string.
1118      * @param s2 Second source string.
1119      *
1120      * @param options A bit set of options:
1121      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1122      *     Case-sensitive comparison in code unit order, and the input strings
1123      *     are quick-checked for FCD.
1124      *
1125      *   - INPUT_IS_FCD
1126      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1127      *     conditions. If not set, the function will quickCheck for FCD
1128      *     and normalize if necessary.
1129      *
1130      *   - COMPARE_CODE_POINT_ORDER
1131      *     Set to choose code point order instead of code unit order
1132      *
1133      *   - COMPARE_IGNORE_CASE
1134      *     Set to compare strings case-insensitively using case folding,
1135      *     instead of case-sensitively.
1136      *     If set, then the following case folding options are used.
1137      *
1138      * @return <0 or 0 or >0 as usual for string comparisons
1139      *
1140      * @see #normalize
1141      * @see #FCD
1142      * @stable ICU 2.8
1143      */
1144     public static int compare(String s1, String s2, int options) {
1145         return internalCompare(s1, s2, options);
1146     }
1147
1148     /**
1149      * Compare two strings for canonical equivalence.
1150      * Further options include case-insensitive comparison and
1151      * code point order (as opposed to code unit order).
1152      * Convenience method.
1153      *
1154      * @param s1 First source string.
1155      * @param s2 Second source string.
1156      *
1157      * @param options A bit set of options:
1158      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1159      *     Case-sensitive comparison in code unit order, and the input strings
1160      *     are quick-checked for FCD.
1161      *
1162      *   - INPUT_IS_FCD
1163      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1164      *     conditions. If not set, the function will quickCheck for FCD
1165      *     and normalize if necessary.
1166      *
1167      *   - COMPARE_CODE_POINT_ORDER
1168      *     Set to choose code point order instead of code unit order
1169      *
1170      *   - COMPARE_IGNORE_CASE
1171      *     Set to compare strings case-insensitively using case folding,
1172      *     instead of case-sensitively.
1173      *     If set, then the following case folding options are used.
1174      *
1175      * @return <0 or 0 or >0 as usual for string comparisons
1176      *
1177      * @see #normalize
1178      * @see #FCD
1179      * @stable ICU 2.8
1180      */
1181     public static int compare(char[] s1, char[] s2, int options) {
1182         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1183     }
1184
1185     /**
1186      * Convenience method that can have faster implementation
1187      * by not allocating buffers.
1188      * @param char32a    the first code point to be checked against the second
1189      * @param char32b    the second code point
1190      * @param options    A bit set of options
1191      * @stable ICU 2.8
1192      */
1193     public static int compare(int char32a, int char32b, int options) {
1194         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1195     }
1196
1197     /**
1198      * Convenience method that can have faster implementation
1199      * by not allocating buffers.
1200      * @param char32a   the first code point to be checked against
1201      * @param str2      the second string
1202      * @param options   A bit set of options
1203      * @stable ICU 2.8
1204      */
1205     public static int compare(int char32a, String str2, int options) {
1206         return internalCompare(UTF16.valueOf(char32a), str2, options);
1207     }
1208
1209     /* Concatenation of normalized strings --------------------------------- */
1210     /**
1211      * Concatenate normalized strings, making sure that the result is normalized
1212      * as well.
1213      *
1214      * If both the left and the right strings are in
1215      * the normalization form according to "mode",
1216      * then the result will be
1217      *
1218      * <code>
1219      *     dest=normalize(left+right, mode)
1220      * </code>
1221      *
1222      * With the input strings already being normalized,
1223      * this function will use next() and previous()
1224      * to find the adjacent end pieces of the input strings.
1225      * Only the concatenation of these end pieces will be normalized and
1226      * then concatenated with the remaining parts of the input strings.
1227      *
1228      * It is allowed to have dest==left to avoid copying the entire left string.
1229      *
1230      * @param left Left source array, may be same as dest.
1231      * @param leftStart start in the left array.
1232      * @param leftLimit limit in the left array (==length)
1233      * @param right Right source array.
1234      * @param rightStart start in the right array.
1235      * @param rightLimit limit in the right array (==length)
1236      * @param dest The output buffer; must not be null, otherwise an
1237      *              IllegalArgumentException is thrown.
1238      * @param destStart start in the destination array
1239      * @param destLimit limit in the destination array (==length)
1240      * @param mode The normalization mode.
1241      * @param options The normalization options, ORed together (0 for no options).
1242      * @return Length of output (number of chars) when successful; throws
1243      *          IndexOutOfBoundsException if the destination range is too small
1244      * @exception IndexOutOfBoundsException whose message has the string
1245      *             representation of destination capacity required.
1246      * @see #normalize
1247      * @see #next
1248      * @see #previous
1249      * @exception IndexOutOfBoundsException if target capacity is less than the
1250      *             required length
1251      * @stable ICU 2.8
1252      */
1253     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1254                                   char[] right, int rightStart, int rightLimit,
1255                                   char[] dest,  int destStart,  int destLimit,
1256                                   Normalizer.Mode mode, int options) {
1257         if(dest == null) {
1258             throw new IllegalArgumentException();
1259         }
1260
1261         /* check for overlapping right and destination */
1262         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1263             throw new IllegalArgumentException("overlapping right and dst ranges");
1264         }
1265
1266         /* allow left==dest */
1267         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1268         destBuilder.append(left, leftStart, leftLimit-leftStart);
1269         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
1270         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1271         int destLength=destBuilder.length();
1272         if(destLength<=(destLimit-destStart)) {
1273             destBuilder.getChars(0, destLength, dest, destStart);
1274             return destLength;
1275         } else {
1276             throw new IndexOutOfBoundsException(Integer.toString(destLength));
1277         }
1278     }
1279
1280     /**
1281      * Concatenate normalized strings, making sure that the result is normalized
1282      * as well.
1283      *
1284      * If both the left and the right strings are in
1285      * the normalization form according to "mode",
1286      * then the result will be
1287      *
1288      * <code>
1289      *     dest=normalize(left+right, mode)
1290      * </code>
1291      *
1292      * For details see concatenate
1293      *
1294      * @param left Left source string.
1295      * @param right Right source string.
1296      * @param mode The normalization mode.
1297      * @param options The normalization options, ORed together (0 for no options).
1298      * @return result
1299      *
1300      * @see #concatenate
1301      * @see #normalize
1302      * @see #next
1303      * @see #previous
1304      * @see #concatenate
1305      * @stable ICU 2.8
1306      */
1307     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1308         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1309         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1310     }
1311
1312     /**
1313      * Concatenate normalized strings, making sure that the result is normalized
1314      * as well.
1315      *
1316      * If both the left and the right strings are in
1317      * the normalization form according to "mode",
1318      * then the result will be
1319      *
1320      * <code>
1321      *     dest=normalize(left+right, mode)
1322      * </code>
1323      *
1324      * With the input strings already being normalized,
1325      * this function will use next() and previous()
1326      * to find the adjacent end pieces of the input strings.
1327      * Only the concatenation of these end pieces will be normalized and
1328      * then concatenated with the remaining parts of the input strings.
1329      *
1330      * @param left Left source string.
1331      * @param right Right source string.
1332      * @param mode The normalization mode.
1333      * @param options The normalization options, ORed together (0 for no options).
1334      * @return result
1335      *
1336      * @see #concatenate
1337      * @see #normalize
1338      * @see #next
1339      * @see #previous
1340      * @see #concatenate
1341      * @stable ICU 2.8
1342      */
1343     public static String concatenate(String left, String right, Mode mode, int options) {
1344         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1345         return mode.getNormalizer2(options).append(dest, right).toString();
1346     }
1347
1348     /**
1349      * Gets the FC_NFKC closure value.
1350      * @param c The code point whose closure value is to be retrieved
1351      * @param dest The char array to receive the closure value
1352      * @return the length of the closure value; 0 if there is none
1353      * @stable ICU 3.8
1354      */
1355     public static int getFC_NFKC_Closure(int c,char[] dest) {
1356         String closure=getFC_NFKC_Closure(c);
1357         int length=closure.length();
1358         if(length!=0 && dest!=null && length<=dest.length) {
1359             closure.getChars(0, length, dest, 0);
1360         }
1361         return length;
1362     }
1363     /**
1364      * Gets the FC_NFKC closure value.
1365      * @param c The code point whose closure value is to be retrieved
1366      * @return String representation of the closure value; "" if there is none
1367      * @stable ICU 3.8
1368      */
1369     public static String getFC_NFKC_Closure(int c) {
1370         // Compute the FC_NFKC_Closure on the fly:
1371         // We have the API for complete coverage of Unicode properties, although
1372         // this value by itself is not useful via API.
1373         // (What could be useful is a custom normalization table that combines
1374         // case folding and NFKC.)
1375         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1376         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1377         UCaseProps csp=UCaseProps.INSTANCE;
1378         // first: b = NFKC(Fold(a))
1379         StringBuilder folded=new StringBuilder();
1380         int folded1Length=csp.toFullFolding(c, folded, 0);
1381         if(folded1Length<0) {
1382             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1383             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1384                 return "";  // c does not change at all under CaseFolding+NFKC
1385             }
1386             folded.appendCodePoint(c);
1387         } else {
1388             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1389                 folded.appendCodePoint(folded1Length);
1390             }
1391         }
1392         String kc1=nfkc.normalize(folded);
1393         // second: c = NFKC(Fold(b))
1394         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1395         // if (c != b) add the mapping from a to c
1396         if(kc1.equals(kc2)) {
1397             return "";
1398         } else {
1399             return kc2;
1400         }
1401     }
1402
1403     //-------------------------------------------------------------------------
1404     // Iteration API
1405     //-------------------------------------------------------------------------
1406
1407     /**
1408      * Return the current character in the normalized text.
1409      * @return The codepoint as an int
1410      * @stable ICU 2.8
1411      */
1412     public int current() {
1413         if(bufferPos<buffer.length() || nextNormalize()) {
1414             return buffer.codePointAt(bufferPos);
1415         } else {
1416             return DONE;
1417         }
1418     }
1419
1420     /**
1421      * Return the next character in the normalized text and advance
1422      * the iteration position by one.  If the end
1423      * of the text has already been reached, {@link #DONE} is returned.
1424      * @return The codepoint as an int
1425      * @stable ICU 2.8
1426      */
1427     public int next() {
1428         if(bufferPos<buffer.length() ||  nextNormalize()) {
1429             int c=buffer.codePointAt(bufferPos);
1430             bufferPos+=Character.charCount(c);
1431             return c;
1432         } else {
1433             return DONE;
1434         }
1435     }
1436
1437
1438     /**
1439      * Return the previous character in the normalized text and decrement
1440      * the iteration position by one.  If the beginning
1441      * of the text has already been reached, {@link #DONE} is returned.
1442      * @return The codepoint as an int
1443      * @stable ICU 2.8
1444      */
1445     public int previous() {
1446         if(bufferPos>0 || previousNormalize()) {
1447             int c=buffer.codePointBefore(bufferPos);
1448             bufferPos-=Character.charCount(c);
1449             return c;
1450         } else {
1451             return DONE;
1452         }
1453     }
1454
1455     /**
1456      * Reset the index to the beginning of the text.
1457      * This is equivalent to setIndexOnly(startIndex)).
1458      * @stable ICU 2.8
1459      */
1460     public void reset() {
1461         text.setToStart();
1462         currentIndex=nextIndex=0;
1463         clearBuffer();
1464     }
1465
1466     /**
1467      * Set the iteration position in the input text that is being normalized,
1468      * without any immediate normalization.
1469      * After setIndexOnly(), getIndex() will return the same index that is
1470      * specified here.
1471      *
1472      * @param index the desired index in the input text.
1473      * @stable ICU 2.8
1474      */
1475     public void setIndexOnly(int index) {
1476         text.setIndex(index);  // validates index
1477         currentIndex=nextIndex=index;
1478         clearBuffer();
1479     }
1480
1481     /**
1482      * Set the iteration position in the input text that is being normalized
1483      * and return the first normalized character at that position.
1484      * <p>
1485      * <b>Note:</b> This method sets the position in the <em>input</em> text,
1486      * while {@link #next} and {@link #previous} iterate through characters
1487      * in the normalized <em>output</em>.  This means that there is not
1488      * necessarily a one-to-one correspondence between characters returned
1489      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1490      * returned from <tt>setIndex</tt> and {@link #getIndex}.
1491      * <p>
1492      * @param index the desired index in the input text.
1493      *
1494      * @return   the first normalized character that is the result of iterating
1495      *            forward starting at the given index.
1496      *
1497      * @throws IllegalArgumentException if the given index is less than
1498      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1499      * @deprecated ICU 3.2
1500      * @obsolete ICU 3.2
1501      */
1502      ///CLOVER:OFF
1503      public int setIndex(int index) {
1504          setIndexOnly(index);
1505          return current();
1506      }
1507      ///CLOVER:ON
1508     /**
1509      * Retrieve the index of the start of the input text. This is the begin
1510      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1511      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1512      * @deprecated ICU 2.2. Use startIndex() instead.
1513      * @return The codepoint as an int
1514      * @see #startIndex
1515      */
1516     public int getBeginIndex() {
1517         return 0;
1518     }
1519
1520     /**
1521      * Retrieve the index of the end of the input text.  This is the end index
1522      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1523      * over which this <tt>Normalizer</tt> is iterating
1524      * @deprecated ICU 2.2. Use endIndex() instead.
1525      * @return The codepoint as an int
1526      * @see #endIndex
1527      */
1528     public int getEndIndex() {
1529         return endIndex();
1530     }
1531     /**
1532      * Return the first character in the normalized text.  This resets
1533      * the <tt>Normalizer's</tt> position to the beginning of the text.
1534      * @return The codepoint as an int
1535      * @stable ICU 2.8
1536      */
1537     public int first() {
1538         reset();
1539         return next();
1540     }
1541
1542     /**
1543      * Return the last character in the normalized text.  This resets
1544      * the <tt>Normalizer's</tt> position to be just before the
1545      * the input text corresponding to that normalized character.
1546      * @return The codepoint as an int
1547      * @stable ICU 2.8
1548      */
1549     public int last() {
1550         text.setToLimit();
1551         currentIndex=nextIndex=text.getIndex();
1552         clearBuffer();
1553         return previous();
1554     }
1555
1556     /**
1557      * Retrieve the current iteration position in the input text that is
1558      * being normalized.  This method is useful in applications such as
1559      * searching, where you need to be able to determine the position in
1560      * the input text that corresponds to a given normalized output character.
1561      * <p>
1562      * <b>Note:</b> This method sets the position in the <em>input</em>, while
1563      * {@link #next} and {@link #previous} iterate through characters in the
1564      * <em>output</em>.  This means that there is not necessarily a one-to-one
1565      * correspondence between characters returned by <tt>next</tt> and
1566      * <tt>previous</tt> and the indices passed to and returned from
1567      * <tt>setIndex</tt> and {@link #getIndex}.
1568      * @return The current iteration position
1569      * @stable ICU 2.8
1570      */
1571     public int getIndex() {
1572         if(bufferPos<buffer.length()) {
1573             return currentIndex;
1574         } else {
1575             return nextIndex;
1576         }
1577     }
1578
1579     /**
1580      * Retrieve the index of the start of the input text. This is the begin
1581      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1582      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1583      * @return The current iteration position
1584      * @stable ICU 2.8
1585      */
1586     public int startIndex() {
1587         return 0;
1588     }
1589
1590     /**
1591      * Retrieve the index of the end of the input text.  This is the end index
1592      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1593      * over which this <tt>Normalizer</tt> is iterating
1594      * @return The current iteration position
1595      * @stable ICU 2.8
1596      */
1597     public int endIndex() {
1598         return text.getLength();
1599     }
1600
1601     //-------------------------------------------------------------------------
1602     // Iterator attributes
1603     //-------------------------------------------------------------------------
1604     /**
1605      * Set the normalization mode for this object.
1606      * <p>
1607      * <b>Note:</b>If the normalization mode is changed while iterating
1608      * over a string, calls to {@link #next} and {@link #previous} may
1609      * return previously buffers characters in the old normalization mode
1610      * until the iteration is able to re-sync at the next base character.
1611      * It is safest to call {@link #setText setText()}, {@link #first},
1612      * {@link #last}, etc. after calling <tt>setMode</tt>.
1613      * <p>
1614      * @param newMode the new mode for this <tt>Normalizer</tt>.
1615      * The supported modes are:
1616      * <ul>
1617      *  <li>{@link #NFC}    - Unicode canonical decompositiion
1618      *                        followed by canonical composition.
1619      *  <li>{@link #NFKC}   - Unicode compatibility decompositiion
1620      *                        follwed by canonical composition.
1621      *  <li>{@link #NFD}    - Unicode canonical decomposition
1622      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
1623      *  <li>{@link #NONE}   - Do nothing but return characters
1624      *                        from the underlying input text.
1625      * </ul>
1626      *
1627      * @see #getMode
1628      * @stable ICU 2.8
1629      */
1630     public void setMode(Mode newMode) {
1631         mode = newMode;
1632         norm2 = mode.getNormalizer2(options);
1633     }
1634     /**
1635      * Return the basic operation performed by this <tt>Normalizer</tt>
1636      *
1637      * @see #setMode
1638      * @stable ICU 2.8
1639      */
1640     public Mode getMode() {
1641         return mode;
1642     }
1643     /**
1644      * Set options that affect this <tt>Normalizer</tt>'s operation.
1645      * Options do not change the basic composition or decomposition operation
1646      * that is being performed , but they control whether
1647      * certain optional portions of the operation are done.
1648      * Currently the only available option is:
1649      * <p>
1650      * <ul>
1651      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1652      * </ul>
1653      * <p>
1654      * @param   option  the option whose value is to be set.
1655      * @param   value   the new setting for the option.  Use <tt>true</tt> to
1656      *                  turn the option on and <tt>false</tt> to turn it off.
1657      *
1658      * @see #getOption
1659      * @stable ICU 2.6
1660      */
1661     public void setOption(int option,boolean value) {
1662         if (value) {
1663             options |= option;
1664         } else {
1665             options &= (~option);
1666         }
1667         norm2 = mode.getNormalizer2(options);
1668     }
1669
1670     /**
1671      * Determine whether an option is turned on or off.
1672      * <p>
1673      * @see #setOption
1674      * @stable ICU 2.6
1675      */
1676     public int getOption(int option) {
1677         if((options & option)!=0) {
1678             return 1 ;
1679         } else {
1680             return 0;
1681         }
1682     }
1683
1684     /**
1685      * Gets the underlying text storage
1686      * @param fillIn the char buffer to fill the UTF-16 units.
1687      *         The length of the buffer should be equal to the length of the
1688      *         underlying text storage
1689      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1690      * @see   #getLength
1691      * @stable ICU 2.8
1692      */
1693     public int getText(char[] fillIn) {
1694         return text.getText(fillIn);
1695     }
1696
1697     /**
1698      * Gets the length of underlying text storage
1699      * @return the length
1700      * @stable ICU 2.8
1701      */
1702     public int getLength() {
1703         return text.getLength();
1704     }
1705
1706     /**
1707      * Returns the text under iteration as a string
1708      * @return a copy of the text under iteration.
1709      * @stable ICU 2.8
1710      */
1711     public String getText() {
1712         return text.getText();
1713     }
1714
1715     /**
1716      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1717      * The iteration position is set to the beginning of the input text.
1718      * @param newText   The new string to be normalized.
1719      * @stable ICU 2.8
1720      */
1721     public void setText(StringBuffer newText) {
1722         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1723         if (newIter == null) {
1724             throw new IllegalStateException("Could not create a new UCharacterIterator");
1725         }
1726         text = newIter;
1727         reset();
1728     }
1729
1730     /**
1731      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1732      * The iteration position is set to the beginning of the input text.
1733      * @param newText   The new string to be normalized.
1734      * @stable ICU 2.8
1735      */
1736     public void setText(char[] newText) {
1737         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1738         if (newIter == null) {
1739             throw new IllegalStateException("Could not create a new UCharacterIterator");
1740         }
1741         text = newIter;
1742         reset();
1743     }
1744
1745     /**
1746      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1747      * The iteration position is set to the beginning of the input text.
1748      * @param newText   The new string to be normalized.
1749      * @stable ICU 2.8
1750      */
1751     public void setText(String newText) {
1752         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1753         if (newIter == null) {
1754             throw new IllegalStateException("Could not create a new UCharacterIterator");
1755         }
1756         text = newIter;
1757         reset();
1758     }
1759
1760     /**
1761      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1762      * The iteration position is set to the beginning of the input text.
1763      * @param newText   The new string to be normalized.
1764      * @stable ICU 2.8
1765      */
1766     public void setText(CharacterIterator newText) {
1767         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1768         if (newIter == null) {
1769             throw new IllegalStateException("Could not create a new UCharacterIterator");
1770         }
1771         text = newIter;
1772         reset();
1773     }
1774
1775     /**
1776      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1777      * The iteration position is set to the beginning of the string.
1778      * @param newText   The new string to be normalized.
1779      * @stable ICU 2.8
1780      */
1781     public void setText(UCharacterIterator newText) {
1782         try{
1783             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1784             if (newIter == null) {
1785                 throw new IllegalStateException("Could not create a new UCharacterIterator");
1786             }
1787             text = newIter;
1788             reset();
1789         }catch(CloneNotSupportedException e) {
1790             throw new IllegalStateException("Could not clone the UCharacterIterator");
1791         }
1792     }
1793
1794     private void clearBuffer() {
1795         buffer.setLength(0);
1796         bufferPos=0;
1797     }
1798
1799     private boolean nextNormalize() {
1800         clearBuffer();
1801         currentIndex=nextIndex;
1802         text.setIndex(nextIndex);
1803         // Skip at least one character so we make progress.
1804         int c=text.nextCodePoint();
1805         if(c<0) {
1806             return false;
1807         }
1808         StringBuilder segment=new StringBuilder().appendCodePoint(c);
1809         while((c=text.nextCodePoint())>=0) {
1810             if(norm2.hasBoundaryBefore(c)) {
1811                 text.moveCodePointIndex(-1);
1812                 break;
1813             }
1814             segment.appendCodePoint(c);
1815         }
1816         nextIndex=text.getIndex();
1817         norm2.normalize(segment, buffer);
1818         return buffer.length()!=0;
1819     }
1820
1821     private boolean previousNormalize() {
1822         clearBuffer();
1823         nextIndex=currentIndex;
1824         text.setIndex(currentIndex);
1825         StringBuilder segment=new StringBuilder();
1826         int c;
1827         while((c=text.previousCodePoint())>=0) {
1828             if(c<=0xffff) {
1829                 segment.insert(0, (char)c);
1830             } else {
1831                 segment.insert(0, Character.toChars(c));
1832             }
1833             if(norm2.hasBoundaryBefore(c)) {
1834                 break;
1835             }
1836         }
1837         currentIndex=text.getIndex();
1838         norm2.normalize(segment, buffer);
1839         bufferPos=buffer.length();
1840         return buffer.length()!=0;
1841     }
1842
1843     /* compare canonically equivalent ------------------------------------------- */
1844
1845     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
1846     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
1847         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
1848         options|= COMPARE_EQUIV;
1849
1850         /*
1851          * UAX #21 Case Mappings, as fixed for Unicode version 4
1852          * (see Jitterbug 2021), defines a canonical caseless match as
1853          *
1854          * A string X is a canonical caseless match
1855          * for a string Y if and only if
1856          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1857          *
1858          * For better performance, we check for FCD (or let the caller tell us that
1859          * both strings are in FCD) for the inner normalization.
1860          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
1861          * case-folding preserves the FCD-ness of a string.
1862          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
1863          * when there is a difference.
1864          *
1865          * Exception: When using the Turkic case-folding option, we do perform
1866          * full NFD first. This is because in the Turkic case precomposed characters
1867          * with 0049 capital I or 0069 small i fold differently whether they
1868          * are first decomposed or not, so an FCD check - a check only for
1869          * canonical order - is not sufficient.
1870          */
1871         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1872             Normalizer2 n2;
1873             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1874                 n2=NFD.getNormalizer2(normOptions);
1875             } else {
1876                 n2=FCD.getNormalizer2(normOptions);
1877             }
1878
1879             // check if s1 and/or s2 fulfill the FCD conditions
1880             int spanQCYes1=n2.spanQuickCheckYes(s1);
1881             int spanQCYes2=n2.spanQuickCheckYes(s2);
1882
1883             /*
1884              * ICU 2.4 had a further optimization:
1885              * If both strings were not in FCD, then they were both NFD'ed,
1886              * and the COMPARE_EQUIV option was turned off.
1887              * It is not entirely clear that this is valid with the current
1888              * definition of the canonical caseless match.
1889              * Therefore, ICU 2.6 removes that optimization.
1890              */
1891
1892             if(spanQCYes1<s1.length()) {
1893                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
1894                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
1895             }
1896             if(spanQCYes2<s2.length()) {
1897                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
1898                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
1899             }
1900         }
1901
1902         return cmpEquivFold(s1, s2, options);
1903     }
1904
1905     /*
1906      * Compare two strings for canonical equivalence.
1907      * Further options include case-insensitive comparison and
1908      * code point order (as opposed to code unit order).
1909      *
1910      * In this function, canonical equivalence is optional as well.
1911      * If canonical equivalence is tested, then both strings must fulfill
1912      * the FCD check.
1913      *
1914      * Semantically, this is equivalent to
1915      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
1916      * where code point order, NFD and foldCase are all optional.
1917      *
1918      * String comparisons almost always yield results before processing both strings
1919      * completely.
1920      * They are generally more efficient working incrementally instead of
1921      * performing the sub-processing (strlen, normalization, case-folding)
1922      * on the entire strings first.
1923      *
     * It is also unnecessary to normalize identical characters.
1925      *
1926      * This function works in principle as follows:
1927      *
1928      * loop {
1929      *   get one code unit c1 from s1 (-1 if end of source)
1930      *   get one code unit c2 from s2 (-1 if end of source)
1931      *
1932      *   if(either string finished) {
1933      *     return result;
1934      *   }
1935      *   if(c1==c2) {
1936      *     continue;
1937      *   }
1938      *
1939      *   // c1!=c2
1940      *   try to decompose/case-fold c1/c2, and continue if one does;
1941      *
1942      *   // still c1!=c2 and neither decomposes/case-folds, return result
1943      *   return c1-c2;
1944      * }
1945      *
1946      * When a character decomposes, then the pointer for that source changes to
1947      * the decomposition, pushing the previous pointer onto a stack.
1948      * When the end of the decomposition is reached, then the code unit reader
1949      * pops the previous source from the stack.
1950      * (Same for case-folding.)
1951      *
1952      * This is complicated further by operating on variable-width UTF-16.
1953      * The top part of the loop works on code units, while lookups for decomposition
1954      * and case-folding need code points.
1955      * Code points are assembled after the equality/end-of-source part.
1956      * The source pointer is only advanced beyond all code units when the code point
1957      * actually decomposes/case-folds.
1958      *
1959      * If we were on a trail surrogate unit when assembling a code point,
1960      * and the code point decomposes/case-folds, then the decomposition/folding
1961      * result must be compared with the part of the other string that corresponds to
1962      * this string's lead surrogate.
1963      * Since we only assemble a code point when hitting a trail unit when the
1964      * preceding lead units were identical, we back up the other string by one unit
1965      * in such a case.
1966      *
1967      * The optional code point order comparison at the end works with
1968      * the same fix-up as the other code point order comparison functions.
1969      * See ustring.c and the comment near the end of this function.
1970      *
1971      * Assumption: A decomposition or case-folding result string never contains
1972      * a single surrogate. This is a safe assumption in the Unicode Standard.
1973      * Therefore, we do not need to check for surrogate pairs across
1974      * decomposition/case-folding boundaries.
1975      *
1976      * Further assumptions (see verifications tstnorm.cpp):
1977      * The API function checks for FCD first, while the core function
1978      * first case-folds and then decomposes. This requires that case-folding does not
1979      * un-FCD any strings.
1980      *
1981      * The API function may also NFD the input and turn off decomposition.
1982      * This requires that case-folding does not un-NFD strings either.
1983      *
1984      * TODO If any of the above two assumptions is violated,
1985      * then this entire code must be re-thought.
1986      * If this happens, then a simple solution is to case-fold both strings up front
1987      * and to turn off UNORM_INPUT_IS_FCD.
1988      * We already do this when not both strings are in FCD because makeFCD
1989      * would be a partial NFD before the case folding, which does not work.
1990      * Note that all of this is only a problem when case-folding _and_
1991      * canonical equivalence come together.
1992      * (Comments in unorm_compare() are more up to date than this TODO.)
1993      */
1994
1995     /* stack element for previous-level source/decomposition pointers */
1996     private static final class CmpEquivLevel {
1997         CharSequence cs;
1998         int s;
1999     };
2000     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2001         return new CmpEquivLevel[] {
2002             new CmpEquivLevel(), new CmpEquivLevel()
2003         };
2004     }
2005
2006     /**
2007      * Internal option for unorm_cmpEquivFold() for decomposing.
2008      * If not set, just do strcasecmp().
2009      */
2010     private static final int COMPARE_EQUIV=0x80000;
2011
2012     /* internal function; package visibility for use by UTF16.StringComparator */
2013     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2014         Normalizer2Impl nfcImpl;
2015         UCaseProps csp;
2016
2017         /* current-level start/limit - s1/s2 as current */
2018         int s1, s2, limit1, limit2;
2019
2020         /* decomposition and case folding variables */
2021         int length;
2022
2023         /* stacks of previous-level start/current/limit */
2024         CmpEquivLevel[] stack1=null, stack2=null;
2025
2026         /* buffers for algorithmic decompositions */
2027         String decomp1, decomp2;
2028
2029         /* case folding buffers, only use current-level start/limit */
2030         StringBuilder fold1, fold2;
2031
2032         /* track which is the current level per string */
2033         int level1, level2;
2034
2035         /* current code units, and code points for lookups */
2036         int c1, c2, cp1, cp2;
2037
2038         /* no argument error checking because this itself is not an API */
2039
2040         /*
2041          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2042          * otherwise this function must behave exactly as uprv_strCompare()
2043          * not checking for that here makes testing this function easier
2044          */
2045
2046         /* normalization/properties data loaded? */
2047         if((options&COMPARE_EQUIV)!=0) {
2048             nfcImpl=Norm2AllModes.getNFCInstance().impl;
2049         } else {
2050             nfcImpl=null;
2051         }
2052         if((options&COMPARE_IGNORE_CASE)!=0) {
2053             csp=UCaseProps.INSTANCE;
2054             fold1=new StringBuilder();
2055             fold2=new StringBuilder();
2056         } else {
2057             csp=null;
2058             fold1=fold2=null;
2059         }
2060
2061         /* initialize */
2062         s1=0;
2063         limit1=cs1.length();
2064         s2=0;
2065         limit2=cs2.length();
2066
2067         level1=level2=0;
2068         c1=c2=-1;
2069
2070         /* comparison loop */
2071         for(;;) {
2072             /*
2073              * here a code unit value of -1 means "get another code unit"
2074              * below it will mean "this source is finished"
2075              */
2076
2077             if(c1<0) {
2078                 /* get next code unit from string 1, post-increment */
2079                 for(;;) {
2080                     if(s1==limit1) {
2081                         if(level1==0) {
2082                             c1=-1;
2083                             break;
2084                         }
2085                     } else {
2086                         c1=cs1.charAt(s1++);
2087                         break;
2088                     }
2089
2090                     /* reached end of level buffer, pop one level */
2091                     do {
2092                         --level1;
2093                         cs1=stack1[level1].cs;
2094                     } while(cs1==null);
2095                     s1=stack1[level1].s;
2096                     limit1=cs1.length();
2097                 }
2098             }
2099
2100             if(c2<0) {
2101                 /* get next code unit from string 2, post-increment */
2102                 for(;;) {
2103                     if(s2==limit2) {
2104                         if(level2==0) {
2105                             c2=-1;
2106                             break;
2107                         }
2108                     } else {
2109                         c2=cs2.charAt(s2++);
2110                         break;
2111                     }
2112
2113                     /* reached end of level buffer, pop one level */
2114                     do {
2115                         --level2;
2116                         cs2=stack2[level2].cs;
2117                     } while(cs2==null);
2118                     s2=stack2[level2].s;
2119                     limit2=cs2.length();
2120                 }
2121             }
2122
2123             /*
2124              * compare c1 and c2
2125              * either variable c1, c2 is -1 only if the corresponding string is finished
2126              */
2127             if(c1==c2) {
2128                 if(c1<0) {
2129                     return 0;   /* c1==c2==-1 indicating end of strings */
2130                 }
2131                 c1=c2=-1;       /* make us fetch new code units */
2132                 continue;
2133             } else if(c1<0) {
2134                 return -1;      /* string 1 ends before string 2 */
2135             } else if(c2<0) {
2136                 return 1;       /* string 2 ends before string 1 */
2137             }
2138             /* c1!=c2 && c1>=0 && c2>=0 */
2139
2140             /* get complete code points for c1, c2 for lookups if either is a surrogate */
2141             cp1=c1;
2142             if(UTF16.isSurrogate((char)c1)) {
2143                 char c;
2144
2145                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2146                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2147                         /* advance ++s1; only below if cp1 decomposes/case-folds */
2148                         cp1=Character.toCodePoint((char)c1, c);
2149                     }
2150                 } else /* isTrail(c1) */ {
2151                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2152                         cp1=Character.toCodePoint(c, (char)c1);
2153                     }
2154                 }
2155             }
2156
2157             cp2=c2;
2158             if(UTF16.isSurrogate((char)c2)) {
2159                 char c;
2160
2161                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2162                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2163                         /* advance ++s2; only below if cp2 decomposes/case-folds */
2164                         cp2=Character.toCodePoint((char)c2, c);
2165                     }
2166                 } else /* isTrail(c2) */ {
2167                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2168                         cp2=Character.toCodePoint(c, (char)c2);
2169                     }
2170                 }
2171             }
2172
2173             /*
2174              * go down one level for each string
2175              * continue with the main loop as soon as there is a real change
2176              */
2177
2178             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2179                 (length=csp.toFullFolding(cp1, fold1, options))>=0
2180             ) {
2181                 /* cp1 case-folds to the code point "length" or to p[length] */
2182                 if(UTF16.isSurrogate((char)c1)) {
2183                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2184                         /* advance beyond source surrogate pair if it case-folds */
2185                         ++s1;
2186                     } else /* isTrail(c1) */ {
2187                         /*
2188                          * we got a supplementary code point when hitting its trail surrogate,
2189                          * therefore the lead surrogate must have been the same as in the other string;
2190                          * compare this decomposition with the lead surrogate in the other string
2191                          * remember that this simulates bulk text replacement:
2192                          * the decomposition would replace the entire code point
2193                          */
2194                         --s2;
2195                         c2=cs2.charAt(s2-1);
2196                     }
2197                 }
2198
2199                 /* push current level pointers */
2200                 if(stack1==null) {
2201                     stack1=createCmpEquivLevelStack();
2202                 }
2203                 stack1[0].cs=cs1;
2204                 stack1[0].s=s1;
2205                 ++level1;
2206
2207                 /* copy the folding result to fold1[] */
2208                 /* Java: the buffer was probably not empty, remove the old contents */
2209                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2210                     fold1.delete(0, fold1.length()-length);
2211                 } else {
2212                     fold1.setLength(0);
2213                     fold1.appendCodePoint(length);
2214                 }
2215
2216                 /* set next level pointers to case folding */
2217                 cs1=fold1;
2218                 s1=0;
2219                 limit1=fold1.length();
2220
2221                 /* get ready to read from decomposition, continue with loop */
2222                 c1=-1;
2223                 continue;
2224             }
2225
2226             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2227                 (length=csp.toFullFolding(cp2, fold2, options))>=0
2228             ) {
2229                 /* cp2 case-folds to the code point "length" or to p[length] */
2230                 if(UTF16.isSurrogate((char)c2)) {
2231                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2232                         /* advance beyond source surrogate pair if it case-folds */
2233                         ++s2;
2234                     } else /* isTrail(c2) */ {
2235                         /*
2236                          * we got a supplementary code point when hitting its trail surrogate,
2237                          * therefore the lead surrogate must have been the same as in the other string;
2238                          * compare this decomposition with the lead surrogate in the other string
2239                          * remember that this simulates bulk text replacement:
2240                          * the decomposition would replace the entire code point
2241                          */
2242                         --s1;
2243                         c1=cs1.charAt(s1-1);
2244                     }
2245                 }
2246
2247                 /* push current level pointers */
2248                 if(stack2==null) {
2249                     stack2=createCmpEquivLevelStack();
2250                 }
2251                 stack2[0].cs=cs2;
2252                 stack2[0].s=s2;
2253                 ++level2;
2254
2255                 /* copy the folding result to fold2[] */
2256                 /* Java: the buffer was probably not empty, remove the old contents */
2257                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2258                     fold2.delete(0, fold2.length()-length);
2259                 } else {
2260                     fold2.setLength(0);
2261                     fold2.appendCodePoint(length);
2262                 }
2263
2264                 /* set next level pointers to case folding */
2265                 cs2=fold2;
2266                 s2=0;
2267                 limit2=fold2.length();
2268
2269                 /* get ready to read from decomposition, continue with loop */
2270                 c2=-1;
2271                 continue;
2272             }
2273
2274             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2275                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2276             ) {
2277                 /* cp1 decomposes into p[length] */
2278                 if(UTF16.isSurrogate((char)c1)) {
2279                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2280                         /* advance beyond source surrogate pair if it decomposes */
2281                         ++s1;
2282                     } else /* isTrail(c1) */ {
2283                         /*
2284                          * we got a supplementary code point when hitting its trail surrogate,
2285                          * therefore the lead surrogate must have been the same as in the other string;
2286                          * compare this decomposition with the lead surrogate in the other string
2287                          * remember that this simulates bulk text replacement:
2288                          * the decomposition would replace the entire code point
2289                          */
2290                         --s2;
2291                         c2=cs2.charAt(s2-1);
2292                     }
2293                 }
2294
2295                 /* push current level pointers */
2296                 if(stack1==null) {
2297                     stack1=createCmpEquivLevelStack();
2298                 }
2299                 stack1[level1].cs=cs1;
2300                 stack1[level1].s=s1;
2301                 ++level1;
2302
2303                 /* set empty intermediate level if skipped */
2304                 if(level1<2) {
2305                     stack1[level1++].cs=null;
2306                 }
2307
2308                 /* set next level pointers to decomposition */
2309                 cs1=decomp1;
2310                 s1=0;
2311                 limit1=decomp1.length();
2312
2313                 /* get ready to read from decomposition, continue with loop */
2314                 c1=-1;
2315                 continue;
2316             }
2317
2318             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2319                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2320             ) {
2321                 /* cp2 decomposes into p[length] */
2322                 if(UTF16.isSurrogate((char)c2)) {
2323                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2324                         /* advance beyond source surrogate pair if it decomposes */
2325                         ++s2;
2326                     } else /* isTrail(c2) */ {
2327                         /*
2328                          * we got a supplementary code point when hitting its trail surrogate,
2329                          * therefore the lead surrogate must have been the same as in the other string;
2330                          * compare this decomposition with the lead surrogate in the other string
2331                          * remember that this simulates bulk text replacement:
2332                          * the decomposition would replace the entire code point
2333                          */
2334                         --s1;
2335                         c1=cs1.charAt(s1-1);
2336                     }
2337                 }
2338
2339                 /* push current level pointers */
2340                 if(stack2==null) {
2341                     stack2=createCmpEquivLevelStack();
2342                 }
2343                 stack2[level2].cs=cs2;
2344                 stack2[level2].s=s2;
2345                 ++level2;
2346
2347                 /* set empty intermediate level if skipped */
2348                 if(level2<2) {
2349                     stack2[level2++].cs=null;
2350                 }
2351
2352                 /* set next level pointers to decomposition */
2353                 cs2=decomp2;
2354                 s2=0;
2355                 limit2=decomp2.length();
2356
2357                 /* get ready to read from decomposition, continue with loop */
2358                 c2=-1;
2359                 continue;
2360             }
2361
2362             /*
2363              * no decomposition/case folding, max level for both sides:
2364              * return difference result
2365              *
2366              * code point order comparison must not just return cp1-cp2
2367              * because when single surrogates are present then the surrogate pairs
2368              * that formed cp1 and cp2 may be from different string indexes
2369              *
2370              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2371              * c1=d800 cp1=10001 c2=dc00 cp2=10000
2372              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2373              *
2374              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2375              * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
2376              * so we have slightly different pointer/start/limit comparisons here
2377              */
2378
2379             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2380                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
2381                 if(
2382                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2383                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2384                 ) {
2385                     /* part of a surrogate pair, leave >=d800 */
2386                 } else {
2387                     /* BMP code point - may be surrogate code point - make <d800 */
2388                     c1-=0x2800;
2389                 }
2390
2391                 if(
2392                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2393                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2394                 ) {
2395                     /* part of a surrogate pair, leave >=d800 */
2396                 } else {
2397                     /* BMP code point - may be surrogate code point - make <d800 */
2398                     c2-=0x2800;
2399                 }
2400             }
2401
2402             return c1-c2;
2403         }
2404     }
2405
2406     /**
2407      * An Appendable that writes into a char array with a capacity that may be
2408      * less than array.length.
2409      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2410      * <p>
2411      * An overflow is only reported at the end, for the old Normalizer API functions that write
2412      * to char arrays.
2413      */
2414     private static final class CharsAppendable implements Appendable {
2415         public CharsAppendable(char[] dest, int destStart, int destLimit) {
2416             chars=dest;
2417             start=offset=destStart;
2418             limit=destLimit;
2419         }
2420         public int length() {
2421             int len=offset-start;
2422             if(offset<=limit) {
2423                 return len;
2424             } else {
2425                 throw new IndexOutOfBoundsException(Integer.toString(len));
2426             }
2427         }
2428         public Appendable append(char c) {
2429             if(offset<limit) {
2430                 chars[offset]=c;
2431             }
2432             ++offset;
2433             return this;
2434         }
2435         public Appendable append(CharSequence s) {
2436             return append(s, 0, s.length());
2437         }
2438         public Appendable append(CharSequence s, int sStart, int sLimit) {
2439             int len=sLimit-sStart;
2440             if(len<=(limit-offset)) {
2441                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
2442                     chars[offset++]=s.charAt(sStart++);
2443                 }
2444             } else {
2445                 offset+=len;
2446             }
2447             return this;
2448         }
2449
2450         private final char[] chars;
2451         private final int start, limit;
2452         private int offset;
2453     }
2454 }