jars/icu4j-4_2_1-src/src/com/ibm/icu/text/UTF16.java

   1 //##header J2SE15
   2 /**
   3  *******************************************************************************
   4  * Copyright (C) 1996-2009, International Business Machines Corporation and    *
   5  * others. All Rights Reserved.                                                *
   6  *******************************************************************************
   7  */
   8
   9 package com.ibm.icu.text;
  10
  11 import com.ibm.icu.impl.UCharacterProperty;
  12 import com.ibm.icu.impl.NormalizerImpl;
  13
  14 /**
  15  * <p>
  16  * Standalone utility class providing UTF16 character conversions and indexing conversions.
  17  * </p>
  18  * <p>
  19  * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
  20  * so searching for strings is a safe operation. Similarly, concatenation is always safe.
  21  * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
  22  * values for start and end are on those boundaries, since they arose from operations like
  23  * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
  24  * </p>
  25  * <strong>Examples:</strong>
  26  * <p>
  27  * The following examples illustrate use of some of these methods.
  28  *
  29  * <pre>
  30  * // iteration forwards: Original
  31  * for (int i = 0; i &lt; s.length(); ++i) {
  32  *     char ch = s.charAt(i);
  33  *     doSomethingWith(ch);
  34  * }
  35  *
  36  * // iteration forwards: Changes for UTF-32
  37  * int ch;
  38  * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
  39  *     ch = UTF16.charAt(s, i);
  40  *     doSomethingWith(ch);
  41  * }
  42  *
  43  * // iteration backwards: Original
  44  * for (int i = s.length() - 1; i &gt;= 0; --i) {
  45  *     char ch = s.charAt(i);
  46  *     doSomethingWith(ch);
  47  * }
  48  *
  49  * // iteration backwards: Changes for UTF-32
  50  * int ch;
  51  * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
  52  *     ch = UTF16.charAt(s, i);
  53  *     doSomethingWith(ch);
  54  * }
  55  * </pre>
  56  *
  57  * <strong>Notes:</strong>
  58  * <ul>
  59  * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
  60  * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
  61  * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
  62  * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
  63  * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
  64  * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
  65  * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
  66  * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
  67  * </li>
  68  * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
  69  * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
  70  * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
  71  * check for validity if desired. </li>
  72  * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
  73  * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
  74  * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
  75  * 5.5). </li>
  76  * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
  77  * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
  78  * percentage of all the text in the world, the singleton case should always be optimized for. </li>
  79  * </ul>
  80  *
  81  * @author Mark Davis, with help from Markus Scherer
  82  * @stable ICU 2.1
  83  */
  84
  85 public final class UTF16 {
  86     // public variables ---------------------------------------------------
  87
  88     /**
  89      * Value returned in <code><a href="#bounds(java.lang.String, int)">
  90      * bounds()</a></code>.
  91      * These values are chosen specifically so that they encode the position of the
  92      * character: [offset16 - (value &gt;&gt; 2), offset16 + (value &amp; 3)]
  93      *
  94      * @stable ICU 2.1
  95      */
  96     public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
  97             TRAIL_SURROGATE_BOUNDARY = 5;
  98
  99     /**
 100      * The lowest Unicode code point value.
 101      *
 102      * @stable ICU 2.1
 103      */
 104     public static final int CODEPOINT_MIN_VALUE = 0;
 105
 106     /**
 107      * The highest Unicode code point value (scalar value) according to the Unicode Standard.
 108      *
 109      * @stable ICU 2.1
 110      */
 111     public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
 112
 113     /**
 114      * The minimum value for Supplementary code points
 115      *
 116      * @stable ICU 2.1
 117      */
 118     public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
 119
 120     /**
 121      * Lead surrogate minimum value
 122      *
 123      * @stable ICU 2.1
 124      */
 125     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
 126
 127     /**
 128      * Trail surrogate minimum value
 129      *
 130      * @stable ICU 2.1
 131      */
 132     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
 133
 134     /**
 135      * Lead surrogate maximum value
 136      *
 137      * @stable ICU 2.1
 138      */
 139     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
 140
 141     /**
 142      * Trail surrogate maximum value
 143      *
 144      * @stable ICU 2.1
 145      */
 146     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
 147
 148     /**
 149      * Surrogate minimum value
 150      *
 151      * @stable ICU 2.1
 152      */
 153     public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
 154
 155     /**
 156      * Maximum surrogate value
 157      *
 158      * @stable ICU 2.1
 159      */
 160     public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
 161
    /**
     * Lead surrogate bitmask: keeps everything except the low 10 bits of a code unit. A unit is
     * a lead surrogate when the masked value equals {@link #LEAD_SURROGATE_BITS}.
     */
    private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;

    /**
     * Trail surrogate bitmask: keeps everything except the low 10 bits of a code unit. A unit is
     * a trail surrogate when the masked value equals {@link #TRAIL_SURROGATE_BITS}.
     */
    private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;

    /**
     * Surrogate bitmask: keeps everything except the low 11 bits of a code unit. A unit is a
     * surrogate (lead or trail) when the masked value equals {@link #SURROGATE_BITS}.
     */
    private static final int SURROGATE_BITMASK = 0xFFFFF800;

    /**
     * Lead surrogate bits: the value a code unit must have after masking with
     * {@link #LEAD_SURROGATE_BITMASK} to be a lead surrogate.
     */
    private static final int LEAD_SURROGATE_BITS = 0xD800;

    /**
     * Trail surrogate bits: the value a code unit must have after masking with
     * {@link #TRAIL_SURROGATE_BITMASK} to be a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_BITS = 0xDC00;

    /**
     * Surrogate bits: the value a code unit must have after masking with
     * {@link #SURROGATE_BITMASK} to be a surrogate of either kind.
     */
    private static final int SURROGATE_BITS = 0xD800;
 191
 192     // constructor --------------------------------------------------------
 193
 194     // /CLOVER:OFF
 195     /**
 196      * Prevent instance from being created.
 197      */
 198     private UTF16() {
 199     }
 200
 201     // /CLOVER:ON
 202     // public method ------------------------------------------------------
 203
 204     /**
 205      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
 206      * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
 207      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
 208      * UCharacter.isLegal()</a></code>
 209      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 210      * character will be returned. If a complete supplementary character is not found the incomplete
 211      * character will be returned
 212      *
 213      * @param source
 214      *            string of UTF-16 chars
 215      * @param offset16
 216      *            UTF-16 offset to the start of the character.
 217      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 218      *         of that codepoint are the same as in <code>bounds32()</code>.
 219      * @exception IndexOutOfBoundsException
 220      *                thrown if offset16 is out of bounds.
 221      * @stable ICU 2.1
 222      */
 223     public static int charAt(String source, int offset16) {
 224         char single = source.charAt(offset16);
 225         if (single < LEAD_SURROGATE_MIN_VALUE) {
 226             return single;
 227         }
 228         return _charAt(source, offset16, single);
 229     }
 230
 231     private static int _charAt(String source, int offset16, char single) {
 232         if (single > TRAIL_SURROGATE_MAX_VALUE) {
 233             return single;
 234         }
 235
 236         // Convert the UTF-16 surrogate pair if necessary.
 237         // For simplicity in usage, and because the frequency of pairs is
 238         // low, look both directions.
 239
 240         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 241             ++offset16;
 242             if (source.length() != offset16) {
 243                 char trail = source.charAt(offset16);
 244                 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
 245                     return UCharacterProperty.getRawSupplementary(single, trail);
 246                 }
 247             }
 248         } else {
 249             --offset16;
 250             if (offset16 >= 0) {
 251                 // single is a trail surrogate so
 252                 char lead = source.charAt(offset16);
 253                 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
 254                     return UCharacterProperty.getRawSupplementary(lead, single);
 255                 }
 256             }
 257         }
 258         return single; // return unmatched surrogate
 259     }
 260
 261 //#if defined(FOUNDATION10) || defined(J2SE13)
 262 //#else
 263     /**
 264      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
 265      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
 266      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
 267      * UCharacter.isLegal()</a></code>
 268      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 269      * character will be returned. If a complete supplementary character is not found the incomplete
 270      * character will be returned
 271      *
 272      * @param source
 273      *            array of UTF-16 chars
 274      * @param offset16
 275      *            UTF-16 offset to the start of the character.
 276      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 277      *         of that codepoint are the same as in <code>bounds32()</code>.
 278      * @exception IndexOutOfBoundsException
 279      *                thrown if offset16 is out of bounds.
 280      * @stable ICU 2.1
 281      */
 282     public static int charAt(CharSequence source, int offset16) {
 283         char single = source.charAt(offset16);
 284         if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
 285             return single;
 286         }
 287         return _charAt(source, offset16, single);
 288     }
 289
 290     private static int _charAt(CharSequence source, int offset16, char single) {
 291         if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 292             return single;
 293         }
 294
 295         // Convert the UTF-16 surrogate pair if necessary.
 296         // For simplicity in usage, and because the frequency of pairs is
 297         // low, look both directions.
 298
 299         if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 300             ++offset16;
 301             if (source.length() != offset16) {
 302                 char trail = source.charAt(offset16);
 303                 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
 304                         && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
 305                     return UCharacterProperty.getRawSupplementary(single, trail);
 306                 }
 307             }
 308         } else {
 309             --offset16;
 310             if (offset16 >= 0) {
 311                 // single is a trail surrogate so
 312                 char lead = source.charAt(offset16);
 313                 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
 314                         && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
 315                     return UCharacterProperty.getRawSupplementary(lead, single);
 316                 }
 317             }
 318         }
 319         return single; // return unmatched surrogate
 320     }
 321
 322 //#endif
 323
 324     /**
 325      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
 326      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
 327      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 328      * </a></code>
 329      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 330      * character will be returned. If a complete supplementary character is not found the incomplete
 331      * character will be returned
 332      *
 333      * @param source
 334      *            UTF-16 chars string buffer
 335      * @param offset16
 336      *            UTF-16 offset to the start of the character.
 337      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 338      *         of that codepoint are the same as in <code>bounds32()</code>.
 339      * @exception IndexOutOfBoundsException
 340      *                thrown if offset16 is out of bounds.
 341      * @stable ICU 2.1
 342      */
 343     public static int charAt(StringBuffer source, int offset16) {
 344         if (offset16 < 0 || offset16 >= source.length()) {
 345             throw new StringIndexOutOfBoundsException(offset16);
 346         }
 347
 348         char single = source.charAt(offset16);
 349         if (!isSurrogate(single)) {
 350             return single;
 351         }
 352
 353         // Convert the UTF-16 surrogate pair if necessary.
 354         // For simplicity in usage, and because the frequency of pairs is
 355         // low, look both directions.
 356
 357         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 358             ++offset16;
 359             if (source.length() != offset16) {
 360                 char trail = source.charAt(offset16);
 361                 if (isTrailSurrogate(trail))
 362                     return UCharacterProperty.getRawSupplementary(single, trail);
 363             }
 364         } else {
 365             --offset16;
 366             if (offset16 >= 0) {
 367                 // single is a trail surrogate so
 368                 char lead = source.charAt(offset16);
 369                 if (isLeadSurrogate(lead)) {
 370                     return UCharacterProperty.getRawSupplementary(lead, single);
 371                 }
 372             }
 373         }
 374         return single; // return unmatched surrogate
 375     }
 376
 377     /**
 378      * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
 379      * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
 380      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 381      * </a></code>
 382      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 383      * character will be returned. If a complete supplementary character is not found the incomplete
 384      * character will be returned
 385      *
 386      * @param source
 387      *            array of UTF-16 chars
 388      * @param start
 389      *            offset to substring in the source array for analyzing
 390      * @param limit
 391      *            offset to substring in the source array for analyzing
 392      * @param offset16
 393      *            UTF-16 offset relative to start
 394      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 395      *         of that codepoint are the same as in <code>bounds32()</code>.
 396      * @exception IndexOutOfBoundsException
 397      *                thrown if offset16 is not within the range of start and limit.
 398      * @stable ICU 2.1
 399      */
 400     public static int charAt(char source[], int start, int limit, int offset16) {
 401         offset16 += start;
 402         if (offset16 < start || offset16 >= limit) {
 403             throw new ArrayIndexOutOfBoundsException(offset16);
 404         }
 405
 406         char single = source[offset16];
 407         if (!isSurrogate(single)) {
 408             return single;
 409         }
 410
 411         // Convert the UTF-16 surrogate pair if necessary.
 412         // For simplicity in usage, and because the frequency of pairs is
 413         // low, look both directions.
 414         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 415             offset16++;
 416             if (offset16 >= limit) {
 417                 return single;
 418             }
 419             char trail = source[offset16];
 420             if (isTrailSurrogate(trail)) {
 421                 return UCharacterProperty.getRawSupplementary(single, trail);
 422             }
 423         } else { // isTrailSurrogate(single), so
 424             if (offset16 == start) {
 425                 return single;
 426             }
 427             offset16--;
 428             char lead = source[offset16];
 429             if (isLeadSurrogate(lead))
 430                 return UCharacterProperty.getRawSupplementary(lead, single);
 431         }
 432         return single; // return unmatched surrogate
 433     }
 434
 435     /**
 436      * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
 437      * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
 438      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
 439      * </a></code>
 440      * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
 441      * character will be returned. If a complete supplementary character is not found the incomplete
 442      * character will be returned
 443      *
 444      * @param source
 445      *            UTF-16 chars string buffer
 446      * @param offset16
 447      *            UTF-16 offset to the start of the character.
 448      * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
 449      *         of that codepoint are the same as in <code>bounds32()</code>.
 450      * @exception IndexOutOfBoundsException
 451      *                thrown if offset16 is out of bounds.
 452      * @stable ICU 2.1
 453      */
 454     public static int charAt(Replaceable source, int offset16) {
 455         if (offset16 < 0 || offset16 >= source.length()) {
 456             throw new StringIndexOutOfBoundsException(offset16);
 457         }
 458
 459         char single = source.charAt(offset16);
 460         if (!isSurrogate(single)) {
 461             return single;
 462         }
 463
 464         // Convert the UTF-16 surrogate pair if necessary.
 465         // For simplicity in usage, and because the frequency of pairs is
 466         // low, look both directions.
 467
 468         if (single <= LEAD_SURROGATE_MAX_VALUE) {
 469             ++offset16;
 470             if (source.length() != offset16) {
 471                 char trail = source.charAt(offset16);
 472                 if (isTrailSurrogate(trail))
 473                     return UCharacterProperty.getRawSupplementary(single, trail);
 474             }
 475         } else {
 476             --offset16;
 477             if (offset16 >= 0) {
 478                 // single is a trail surrogate so
 479                 char lead = source.charAt(offset16);
 480                 if (isLeadSurrogate(lead)) {
 481                     return UCharacterProperty.getRawSupplementary(lead, single);
 482                 }
 483             }
 484         }
 485         return single; // return unmatched surrogate
 486     }
 487
 488     /**
 489      * Determines how many chars this char32 requires. If a validity check is required, use <code>
 490      * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
 491      * on char32 before calling.
 492      *
 493      * @param char32
 494      *            the input codepoint.
 495      * @return 2 if is in supplementary space, otherwise 1.
 496      * @stable ICU 2.1
 497      */
 498     public static int getCharCount(int char32) {
 499         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
 500             return 1;
 501         }
 502         return 2;
 503     }
 504
 505     /**
 506      * Returns the type of the boundaries around the char at offset16. Used for random access.
 507      *
 508      * @param source
 509      *            text to analyse
 510      * @param offset16
 511      *            UTF-16 offset
 512      * @return
 513      *            <ul>
 514      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
 515      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
 516      *            are [offset16, offset16 + 2]
 517      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
 518      *            bounds are [offset16 - 1, offset16 + 1]
 519      *            </ul>
 520      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
 521      *            can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
 522      * @exception IndexOutOfBoundsException
 523      *                if offset16 is out of bounds.
 524      * @stable ICU 2.1
 525      */
 526     public static int bounds(String source, int offset16) {
 527         char ch = source.charAt(offset16);
 528         if (isSurrogate(ch)) {
 529             if (isLeadSurrogate(ch)) {
 530                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
 531                     return LEAD_SURROGATE_BOUNDARY;
 532                 }
 533             } else {
 534                 // isTrailSurrogate(ch), so
 535                 --offset16;
 536                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
 537                     return TRAIL_SURROGATE_BOUNDARY;
 538                 }
 539             }
 540         }
 541         return SINGLE_CHAR_BOUNDARY;
 542     }
 543
 544     /**
 545      * Returns the type of the boundaries around the char at offset16. Used for random access.
 546      *
 547      * @param source
 548      *            string buffer to analyse
 549      * @param offset16
 550      *            UTF16 offset
 551      * @return
 552      *            <ul>
 553      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
 554      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
 555      *            are [offset16, offset16 + 2]
 556      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
 557      *            bounds are [offset16 - 1, offset16 + 1]
 558      *            </ul>
 559      *            For bit-twiddlers, the return values for these are chosen so that the boundaries
 560      *            can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
 561      * @exception IndexOutOfBoundsException
 562      *                if offset16 is out of bounds.
 563      * @stable ICU 2.1
 564      */
 565     public static int bounds(StringBuffer source, int offset16) {
 566         char ch = source.charAt(offset16);
 567         if (isSurrogate(ch)) {
 568             if (isLeadSurrogate(ch)) {
 569                 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
 570                     return LEAD_SURROGATE_BOUNDARY;
 571                 }
 572             } else {
 573                 // isTrailSurrogate(ch), so
 574                 --offset16;
 575                 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
 576                     return TRAIL_SURROGATE_BOUNDARY;
 577                 }
 578             }
 579         }
 580         return SINGLE_CHAR_BOUNDARY;
 581     }
 582
 583     /**
 584      * Returns the type of the boundaries around the char at offset16. Used for random access. Note
 585      * that the boundaries are determined with respect to the subarray, hence the char array
 586      * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
 587      *
 588      * @param source
 589      *            char array to analyse
 590      * @param start
 591      *            offset to substring in the source array for analyzing
 592      * @param limit
 593      *            offset to substring in the source array for analyzing
 594      * @param offset16
 595      *            UTF16 offset relative to start
 596      * @return
 597      *            <ul>
 598      *            <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
 599      *            <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
 600      *            are [offset16, offset16 + 2]
 601      *            <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
 602      *            bounds are [offset16 - 1, offset16 + 1]
 603      *            </ul>
 604      *            For bit-twiddlers, the boundary values for these are chosen so that the boundaries
 605      *            can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
 606      * @exception IndexOutOfBoundsException
 607      *                if offset16 is not within the range of start and limit.
 608      * @stable ICU 2.1
 609      */
 610     public static int bounds(char source[], int start, int limit, int offset16) {
 611         offset16 += start;
 612         if (offset16 < start || offset16 >= limit) {
 613             throw new ArrayIndexOutOfBoundsException(offset16);
 614         }
 615         char ch = source[offset16];
 616         if (isSurrogate(ch)) {
 617             if (isLeadSurrogate(ch)) {
 618                 ++offset16;
 619                 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
 620                     return LEAD_SURROGATE_BOUNDARY;
 621                 }
 622             } else { // isTrailSurrogate(ch), so
 623                 --offset16;
 624                 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
 625                     return TRAIL_SURROGATE_BOUNDARY;
 626                 }
 627             }
 628         }
 629         return SINGLE_CHAR_BOUNDARY;
 630     }
 631
 632     /**
 633      * Determines whether the code value is a surrogate.
 634      *
 635      * @param char16
 636      *            the input character.
 637      * @return true iff the input character is a surrogate.
 638      * @stable ICU 2.1
 639      */
 640     public static boolean isSurrogate(char char16) {
 641         return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
 642     }
 643
 644     /**
 645      * Determines whether the character is a trail surrogate.
 646      *
 647      * @param char16
 648      *            the input character.
 649      * @return true iff the input character is a trail surrogate.
 650      * @stable ICU 2.1
 651      */
 652     public static boolean isTrailSurrogate(char char16) {
 653         return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
 654     }
 655
 656     /**
 657      * Determines whether the character is a lead surrogate.
 658      *
 659      * @param char16
 660      *            the input character.
 661      * @return true iff the input character is a lead surrogate
 662      * @stable ICU 2.1
 663      */
 664     public static boolean isLeadSurrogate(char char16) {
 665         return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
 666     }
 667
 668     /**
 669      * Returns the lead surrogate. If a validity check is required, use
 670      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
 671      * before calling.
 672      *
 673      * @param char32
 674      *            the input character.
 675      * @return lead surrogate if the getCharCount(ch) is 2; <br>
 676      *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 677      * @stable ICU 2.1
 678      */
 679     public static char getLeadSurrogate(int char32) {
 680         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 681             return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
 682         }
 683         return 0;
 684     }
 685
 686     /**
 687      * Returns the trail surrogate. If a validity check is required, use
 688      * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
 689      * before calling.
 690      *
 691      * @param char32
 692      *            the input character.
 693      * @return the trail surrogate if the getCharCount(ch) is 2; <br>
 694      *         otherwise the character itself
 695      * @stable ICU 2.1
 696      */
 697     public static char getTrailSurrogate(int char32) {
 698         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
 699             return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
 700         }
 701         return (char) char32;
 702     }
 703
 704     /**
 705      * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
 706      * containing the UTF-32 value in UTF16 format. If a validity check is required, use <code><a
 707      * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling.
 708      *
 709      * @param char32
 710      *            the input character.
 711      * @return string value of char32 in UTF16 format
 712      * @exception IllegalArgumentException
 713      *                thrown if char32 is an invalid codepoint.
 714      * @stable ICU 2.1
 715      */
 716     public static String valueOf(int char32) {
 717         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
 718             throw new IllegalArgumentException("Illegal codepoint");
 719         }
 720         return toString(char32);
 721     }
 722
 723     /**
 724      * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
 725      * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
 726      * character, the whole supplementary codepoint will be returned. If a validity check is
 727      * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
 728      * codepoint at offset16 before calling. The result returned will be a newly created String
 729      * obtained by calling source.substring(..) with the appropriate indexes.
 730      *
 731      * @param source
 732      *            the input string.
 733      * @param offset16
 734      *            the UTF16 index to the codepoint in source
 735      * @return string value of char32 in UTF16 format
 736      * @stable ICU 2.1
 737      */
 738     public static String valueOf(String source, int offset16) {
 739         switch (bounds(source, offset16)) {
 740         case LEAD_SURROGATE_BOUNDARY:
 741             return source.substring(offset16, offset16 + 2);
 742         case TRAIL_SURROGATE_BOUNDARY:
 743             return source.substring(offset16 - 1, offset16 + 1);
 744         default:
 745             return source.substring(offset16, offset16 + 1);
 746         }
 747     }
 748
 749     /**
 750      * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
 751      * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
 752      * surrogate character, the whole supplementary codepoint will be returned. If a validity check
 753      * is required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on
 754      * the codepoint at offset16 before calling. The result returned will be a newly created String
 755      * obtained by calling source.substring(..) with the appropriate indexes.
 756      *
 757      * @param source
 758      *            the input string buffer.
 759      * @param offset16
 760      *            the UTF16 index to the codepoint in source
 761      * @return string value of char32 in UTF16 format
 762      * @stable ICU 2.1
 763      */
 764     public static String valueOf(StringBuffer source, int offset16) {
 765         switch (bounds(source, offset16)) {
 766         case LEAD_SURROGATE_BOUNDARY:
 767             return source.substring(offset16, offset16 + 2);
 768         case TRAIL_SURROGATE_BOUNDARY:
 769             return source.substring(offset16 - 1, offset16 + 1);
 770         default:
 771             return source.substring(offset16, offset16 + 1);
 772         }
 773     }
 774
 775     /**
 776      * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
 777      * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
 778      * returned, except when either the leading or trailing surrogate character lies out of the
 779      * specified subarray. In the latter case, only the surrogate character within bounds will be
 780      * returned. If a validity check is required, use <a
 781      * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on the codepoint at
 782      * offset16 before calling. The result returned will be a newly created String containing the
 783      * relevant characters.
 784      *
 785      * @param source
 786      *            the input char array.
 787      * @param start
 788      *            start index of the subarray
 789      * @param limit
 790      *            end index of the subarray
 791      * @param offset16
 792      *            the UTF16 index to the codepoint in source relative to start
 793      * @return string value of char32 in UTF16 format
 794      * @stable ICU 2.1
 795      */
 796     public static String valueOf(char source[], int start, int limit, int offset16) {
 797         switch (bounds(source, start, limit, offset16)) {
 798         case LEAD_SURROGATE_BOUNDARY:
 799             return new String(source, start + offset16, 2);
 800         case TRAIL_SURROGATE_BOUNDARY:
 801             return new String(source, start + offset16 - 1, 2);
 802         }
 803         return new String(source, start + offset16, 1);
 804     }
 805
 806     /**
 807      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
 808      * the <a name="_top_">class description</a> for notes on roundtripping.
 809      *
 810      * @param source
 811      *            the UTF-16 string
 812      * @param offset32
 813      *            UTF-32 offset
 814      * @return UTF-16 offset
 815      * @exception IndexOutOfBoundsException
 816      *                if offset32 is out of bounds.
 817      * @stable ICU 2.1
 818      */
 819     public static int findOffsetFromCodePoint(String source, int offset32) {
 820         char ch;
 821         int size = source.length(), result = 0, count = offset32;
 822         if (offset32 < 0 || offset32 > size) {
 823             throw new StringIndexOutOfBoundsException(offset32);
 824         }
 825         while (result < size && count > 0) {
 826             ch = source.charAt(result);
 827             if (isLeadSurrogate(ch) && ((result + 1) < size)
 828                     && isTrailSurrogate(source.charAt(result + 1))) {
 829                 result++;
 830             }
 831
 832             count--;
 833             result++;
 834         }
 835         if (count != 0) {
 836             throw new StringIndexOutOfBoundsException(offset32);
 837         }
 838         return result;
 839     }
 840
 841     /**
 842      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
 843      * the <a name="_top_">class description</a> for notes on roundtripping.
 844      *
 845      * @param source
 846      *            the UTF-16 string buffer
 847      * @param offset32
 848      *            UTF-32 offset
 849      * @return UTF-16 offset
 850      * @exception IndexOutOfBoundsException
 851      *                if offset32 is out of bounds.
 852      * @stable ICU 2.1
 853      */
 854     public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
 855         char ch;
 856         int size = source.length(), result = 0, count = offset32;
 857         if (offset32 < 0 || offset32 > size) {
 858             throw new StringIndexOutOfBoundsException(offset32);
 859         }
 860         while (result < size && count > 0) {
 861             ch = source.charAt(result);
 862             if (isLeadSurrogate(ch) && ((result + 1) < size)
 863                     && isTrailSurrogate(source.charAt(result + 1))) {
 864                 result++;
 865             }
 866
 867             count--;
 868             result++;
 869         }
 870         if (count != 0) {
 871             throw new StringIndexOutOfBoundsException(offset32);
 872         }
 873         return result;
 874     }
 875
 876     /**
 877      * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
 878      * the <a name="_top_">class description</a> for notes on roundtripping.
 879      *
 880      * @param source
 881      *            the UTF-16 char array whose substring is to be analysed
 882      * @param start
 883      *            offset of the substring to be analysed
 884      * @param limit
 885      *            offset of the substring to be analysed
 886      * @param offset32
 887      *            UTF-32 offset relative to start
 888      * @return UTF-16 offset relative to start
 889      * @exception IndexOutOfBoundsException
 890      *                if offset32 is out of bounds.
 891      * @stable ICU 2.1
 892      */
 893     public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
 894         char ch;
 895         int result = start, count = offset32;
 896         if (offset32 > limit - start) {
 897             throw new ArrayIndexOutOfBoundsException(offset32);
 898         }
 899         while (result < limit && count > 0) {
 900             ch = source[result];
 901             if (isLeadSurrogate(ch) && ((result + 1) < limit)
 902                     && isTrailSurrogate(source[result + 1])) {
 903                 result++;
 904             }
 905
 906             count--;
 907             result++;
 908         }
 909         if (count != 0) {
 910             throw new ArrayIndexOutOfBoundsException(offset32);
 911         }
 912         return result - start;
 913     }
 914
 915     /**
 916      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
 917      * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for
 918      * notes on roundtripping.<br>
 919      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
 920      * of the <strong>lead</strong> of the pair is returned. </i>
 921      * <p>
 922      * To find the UTF-32 length of a string, use:
 923      *
 924      * <pre>
 925      * len32 = countCodePoint(source);
 926      * </pre>
 927      *
 928      * </p>
 929      * <p>
 930      *
 931      * @param source
 932      *            text to analyse
 933      * @param offset16
 934      *            UTF-16 offset &lt;= source text length.
 935      * @return UTF-32 offset
 936      * @exception IndexOutOfBoundsException
 937      *                if offset16 is out of bounds.
 938      * @stable ICU 2.1
 939      */
 940     public static int findCodePointOffset(String source, int offset16) {
 941         if (offset16 < 0 || offset16 > source.length()) {
 942             throw new StringIndexOutOfBoundsException(offset16);
 943         }
 944
 945         int result = 0;
 946         char ch;
 947         boolean hadLeadSurrogate = false;
 948
 949         for (int i = 0; i < offset16; ++i) {
 950             ch = source.charAt(i);
 951             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
 952                 hadLeadSurrogate = false; // count valid trail as zero
 953             } else {
 954                 hadLeadSurrogate = isLeadSurrogate(ch);
 955                 ++result; // count others as 1
 956             }
 957         }
 958
 959         if (offset16 == source.length()) {
 960             return result;
 961         }
 962
 963         // end of source being the less significant surrogate character
 964         // shift result back to the start of the supplementary character
 965         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
 966             result--;
 967         }
 968
 969         return result;
 970     }
 971
 972     /**
 973      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
 974      * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
 975      * roundtripping.<br>
 976      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
 977      * of the <strong>lead</strong> of the pair is returned. </i>
 978      * <p>
 979      * To find the UTF-32 length of a string, use:
 980      *
 981      * <pre>
 982      * len32 = countCodePoint(source);
 983      * </pre>
 984      *
 985      * </p>
 986      * <p>
 987      *
 988      * @param source
 989      *            text to analyse
 990      * @param offset16
 991      *            UTF-16 offset &lt;= source text length.
 992      * @return UTF-32 offset
 993      * @exception IndexOutOfBoundsException
 994      *                if offset16 is out of bounds.
 995      * @stable ICU 2.1
 996      */
 997     public static int findCodePointOffset(StringBuffer source, int offset16) {
 998         if (offset16 < 0 || offset16 > source.length()) {
 999             throw new StringIndexOutOfBoundsException(offset16);
1000         }
1001
1002         int result = 0;
1003         char ch;
1004         boolean hadLeadSurrogate = false;
1005
1006         for (int i = 0; i < offset16; ++i) {
1007             ch = source.charAt(i);
1008             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1009                 hadLeadSurrogate = false; // count valid trail as zero
1010             } else {
1011                 hadLeadSurrogate = isLeadSurrogate(ch);
1012                 ++result; // count others as 1
1013             }
1014         }
1015
1016         if (offset16 == source.length()) {
1017             return result;
1018         }
1019
1020         // end of source being the less significant surrogate character
1021         // shift result back to the start of the supplementary character
1022         if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
1023             result--;
1024         }
1025
1026         return result;
1027     }
1028
1029     /**
1030      * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
1031      * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
1032      * roundtripping.<br>
1033      * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
1034      * of the <strong>lead</strong> of the pair is returned. </i>
1035      * <p>
1036      * To find the UTF-32 length of a substring, use:
1037      *
1038      * <pre>
1039      * len32 = countCodePoint(source, start, limit);
1040      * </pre>
1041      *
1042      * </p>
1043      * <p>
1044      *
1045      * @param source
1046      *            text to analyse
1047      * @param start
1048      *            offset of the substring
1049      * @param limit
1050      *            offset of the substring
1051      * @param offset16
1052      *            UTF-16 relative to start
1053      * @return UTF-32 offset relative to start
1054      * @exception IndexOutOfBoundsException
1055      *                if offset16 is not within the range of start and limit.
1056      * @stable ICU 2.1
1057      */
1058     public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
1059         offset16 += start;
1060         if (offset16 > limit) {
1061             throw new StringIndexOutOfBoundsException(offset16);
1062         }
1063
1064         int result = 0;
1065         char ch;
1066         boolean hadLeadSurrogate = false;
1067
1068         for (int i = start; i < offset16; ++i) {
1069             ch = source[i];
1070             if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1071                 hadLeadSurrogate = false; // count valid trail as zero
1072             } else {
1073                 hadLeadSurrogate = isLeadSurrogate(ch);
1074                 ++result; // count others as 1
1075             }
1076         }
1077
1078         if (offset16 == limit) {
1079             return result;
1080         }
1081
1082         // end of source being the less significant surrogate character
1083         // shift result back to the start of the supplementary character
1084         if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1085             result--;
1086         }
1087
1088         return result;
1089     }
1090
1091     /**
1092      * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
1093      * use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on char32 before
1094      * calling.
1095      *
1096      * @param target
1097      *            the buffer to append to
1098      * @param char32
1099      *            value to append.
1100      * @return the updated StringBuffer
1101      * @exception IllegalArgumentException
1102      *                thrown when char32 does not lie within the range of the Unicode codepoints
1103      * @stable ICU 2.1
1104      */
1105     public static StringBuffer append(StringBuffer target, int char32) {
1106         // Check for irregular values
1107         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1108             throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1109         }
1110
1111         // Write the UTF-16 values
1112         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1113             target.append(getLeadSurrogate(char32));
1114             target.append(getTrailSurrogate(char32));
1115         } else {
1116             target.append((char) char32);
1117         }
1118         return target;
1119     }
1120
1121     /**
1122      * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1123      * convenience.
1124      *
1125      * @param target
1126      *            the buffer to append to
1127      * @param cp
1128      *            the code point to append
1129      * @return the updated StringBuffer
1130      * @throws IllegalArgumentException
1131      *             if cp is not a valid code point
1132      * @stable ICU 3.0
1133      */
1134     public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1135         return append(target, cp);
1136     }
1137
1138     /**
1139      * Adds a codepoint to offset16 position of the argument char array.
1140      *
1141      * @param target
1142      *            char array to be append with the new code point
1143      * @param limit
1144      *            UTF16 offset which the codepoint will be appended.
1145      * @param char32
1146      *            code point to be appended
1147      * @return offset after char32 in the array.
1148      * @exception IllegalArgumentException
1149      *                thrown if there is not enough space for the append, or when char32 does not
1150      *                lie within the range of the Unicode codepoints.
1151      * @stable ICU 2.1
1152      */
1153     public static int append(char[] target, int limit, int char32) {
1154         // Check for irregular values
1155         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1156             throw new IllegalArgumentException("Illegal codepoint");
1157         }
1158         // Write the UTF-16 values
1159         if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1160             target[limit++] = getLeadSurrogate(char32);
1161             target[limit++] = getTrailSurrogate(char32);
1162         } else {
1163             target[limit++] = (char) char32;
1164         }
1165         return limit;
1166     }
1167
1168     /**
1169      * Number of codepoints in a UTF16 String
1170      *
1171      * @param source
1172      *            UTF16 string
1173      * @return number of codepoint in string
1174      * @stable ICU 2.1
1175      */
1176     public static int countCodePoint(String source) {
1177         if (source == null || source.length() == 0) {
1178             return 0;
1179         }
1180         return findCodePointOffset(source, source.length());
1181     }
1182
1183     /**
1184      * Number of codepoints in a UTF16 String buffer
1185      *
1186      * @param source
1187      *            UTF16 string buffer
1188      * @return number of codepoint in string
1189      * @stable ICU 2.1
1190      */
1191     public static int countCodePoint(StringBuffer source) {
1192         if (source == null || source.length() == 0) {
1193             return 0;
1194         }
1195         return findCodePointOffset(source, source.length());
1196     }
1197
1198     /**
1199      * Number of codepoints in a UTF16 char array substring
1200      *
1201      * @param source
1202      *            UTF16 char array
1203      * @param start
1204      *            offset of the substring
1205      * @param limit
1206      *            offset of the substring
1207      * @return number of codepoint in the substring
1208      * @exception IndexOutOfBoundsException
1209      *                if start and limit are not valid.
1210      * @stable ICU 2.1
1211      */
1212     public static int countCodePoint(char source[], int start, int limit) {
1213         if (source == null || source.length == 0) {
1214             return 0;
1215         }
1216         return findCodePointOffset(source, start, limit, limit - start);
1217     }
1218
1219     /**
1220      * Set a code point into a UTF16 position. Adjusts target according if we are replacing a
1221      * non-supplementary codepoint with a supplementary and vice versa.
1222      *
1223      * @param target
1224      *            stringbuffer
1225      * @param offset16
1226      *            UTF16 position to insert into
1227      * @param char32
1228      *            code point
1229      * @stable ICU 2.1
1230      */
1231     public static void setCharAt(StringBuffer target, int offset16, int char32) {
1232         int count = 1;
1233         char single = target.charAt(offset16);
1234
1235         if (isSurrogate(single)) {
1236             // pairs of the surrogate with offset16 at the lead char found
1237             if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1238                     && isTrailSurrogate(target.charAt(offset16 + 1))) {
1239                 count++;
1240             } else {
1241                 // pairs of the surrogate with offset16 at the trail char
1242                 // found
1243                 if (isTrailSurrogate(single) && (offset16 > 0)
1244                         && isLeadSurrogate(target.charAt(offset16 - 1))) {
1245                     offset16--;
1246                     count++;
1247                 }
1248             }
1249         }
1250         target.replace(offset16, offset16 + count, valueOf(char32));
1251     }
1252
1253     /**
1254      * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1255      * replacing a non-supplementary codepoint with a supplementary and vice versa.
1256      *
1257      * @param target
1258      *            char array
1259      * @param limit
1260      *            numbers of valid chars in target, different from target.length. limit counts the
1261      *            number of chars in target that represents a string, not the size of array target.
1262      * @param offset16
1263      *            UTF16 position to insert into
1264      * @param char32
1265      *            code point
1266      * @return new number of chars in target that represents a string
1267      * @exception IndexOutOfBoundsException
1268      *                if offset16 is out of range
1269      * @stable ICU 2.1
1270      */
1271     public static int setCharAt(char target[], int limit, int offset16, int char32) {
1272         if (offset16 >= limit) {
1273             throw new ArrayIndexOutOfBoundsException(offset16);
1274         }
1275         int count = 1;
1276         char single = target[offset16];
1277
1278         if (isSurrogate(single)) {
1279             // pairs of the surrogate with offset16 at the lead char found
1280             if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1281                     && isTrailSurrogate(target[offset16 + 1])) {
1282                 count++;
1283             } else {
1284                 // pairs of the surrogate with offset16 at the trail char
1285                 // found
1286                 if (isTrailSurrogate(single) && (offset16 > 0)
1287                         && isLeadSurrogate(target[offset16 - 1])) {
1288                     offset16--;
1289                     count++;
1290                 }
1291             }
1292         }
1293
1294         String str = valueOf(char32);
1295         int result = limit;
1296         int strlength = str.length();
1297         target[offset16] = str.charAt(0);
1298         if (count == strlength) {
1299             if (count == 2) {
1300                 target[offset16 + 1] = str.charAt(1);
1301             }
1302         } else {
1303             // this is not exact match in space, we'll have to do some
1304             // shifting
1305             System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1306                     - (offset16 + count));
1307             if (count < strlength) {
1308                 // char32 is a supplementary character trying to squeeze into
1309                 // a non-supplementary space
1310                 target[offset16 + 1] = str.charAt(1);
1311                 result++;
1312                 if (result < target.length) {
1313                     target[result] = 0;
1314                 }
1315             } else {
1316                 // char32 is a non-supplementary character trying to fill
1317                 // into a supplementary space
1318                 result--;
1319                 target[result] = 0;
1320             }
1321         }
1322         return result;
1323     }
1324
1325     /**
1326      * Shifts offset16 by the argument number of codepoints
1327      *
1328      * @param source
1329      *            string
1330      * @param offset16
1331      *            UTF16 position to shift
1332      * @param shift32
1333      *            number of codepoints to shift
1334      * @return new shifted offset16
1335      * @exception IndexOutOfBoundsException
1336      *                if the new offset16 is out of bounds.
1337      * @stable ICU 2.1
1338      */
1339     public static int moveCodePointOffset(String source, int offset16, int shift32) {
1340         int result = offset16;
1341         int size = source.length();
1342         int count;
1343         char ch;
1344         if (offset16 < 0 || offset16 > size) {
1345             throw new StringIndexOutOfBoundsException(offset16);
1346         }
1347         if (shift32 > 0) {
1348             if (shift32 + offset16 > size) {
1349                 throw new StringIndexOutOfBoundsException(offset16);
1350             }
1351             count = shift32;
1352             while (result < size && count > 0) {
1353                 ch = source.charAt(result);
1354                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1355                         && isTrailSurrogate(source.charAt(result + 1))) {
1356                     result++;
1357                 }
1358                 count--;
1359                 result++;
1360             }
1361         } else {
1362             if (offset16 + shift32 < 0) {
1363                 throw new StringIndexOutOfBoundsException(offset16);
1364             }
1365             for (count = -shift32; count > 0; count--) {
1366                 result--;
1367                 if (result < 0) {
1368                     break;
1369                 }
1370                 ch = source.charAt(result);
1371                 if (isTrailSurrogate(ch) && result > 0
1372                         && isLeadSurrogate(source.charAt(result - 1))) {
1373                     result--;
1374                 }
1375             }
1376         }
1377         if (count != 0) {
1378             throw new StringIndexOutOfBoundsException(shift32);
1379         }
1380         return result;
1381     }
1382
1383     /**
1384      * Shifts offset16 by the argument number of codepoints
1385      *
1386      * @param source
1387      *            string buffer
1388      * @param offset16
1389      *            UTF16 position to shift
1390      * @param shift32
1391      *            number of codepoints to shift
1392      * @return new shifted offset16
1393      * @exception IndexOutOfBoundsException
1394      *                if the new offset16 is out of bounds.
1395      * @stable ICU 2.1
1396      */
1397     public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1398         int result = offset16;
1399         int size = source.length();
1400         int count;
1401         char ch;
1402         if (offset16 < 0 || offset16 > size) {
1403             throw new StringIndexOutOfBoundsException(offset16);
1404         }
1405         if (shift32 > 0) {
1406             if (shift32 + offset16 > size) {
1407                 throw new StringIndexOutOfBoundsException(offset16);
1408             }
1409             count = shift32;
1410             while (result < size && count > 0) {
1411                 ch = source.charAt(result);
1412                 if (isLeadSurrogate(ch) && ((result + 1) < size)
1413                         && isTrailSurrogate(source.charAt(result + 1))) {
1414                     result++;
1415                 }
1416                 count--;
1417                 result++;
1418             }
1419         } else {
1420             if (offset16 + shift32 < 0) {
1421                 throw new StringIndexOutOfBoundsException(offset16);
1422             }
1423             for (count = -shift32; count > 0; count--) {
1424                 result--;
1425                 if (result < 0) {
1426                     break;
1427                 }
1428                 ch = source.charAt(result);
1429                 if (isTrailSurrogate(ch) && result > 0
1430                         && isLeadSurrogate(source.charAt(result - 1))) {
1431                     result--;
1432                 }
1433             }
1434         }
1435         if (count != 0) {
1436             throw new StringIndexOutOfBoundsException(shift32);
1437         }
1438         return result;
1439     }
1440
1441     /**
1442      * Shifts offset16 by the argument number of codepoints within a subarray.
1443      *
1444      * @param source
1445      *            char array
1446      * @param start
1447      *            position of the subarray to be performed on
1448      * @param limit
1449      *            position of the subarray to be performed on
1450      * @param offset16
1451      *            UTF16 position to shift relative to start
1452      * @param shift32
1453      *            number of codepoints to shift
1454      * @return new shifted offset16 relative to start
1455      * @exception IndexOutOfBoundsException
1456      *                if the new offset16 is out of bounds with respect to the subarray or the
1457      *                subarray bounds are out of range.
1458      * @stable ICU 2.1
1459      */
1460     public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1461             int shift32) {
1462         int size = source.length;
1463         int count;
1464         char ch;
1465         int result = offset16 + start;
1466         if (start < 0 || limit < start) {
1467             throw new StringIndexOutOfBoundsException(start);
1468         }
1469         if (limit > size) {
1470             throw new StringIndexOutOfBoundsException(limit);
1471         }
1472         if (offset16 < 0 || result > limit) {
1473             throw new StringIndexOutOfBoundsException(offset16);
1474         }
1475         if (shift32 > 0) {
1476             if (shift32 + result > size) {
1477                 throw new StringIndexOutOfBoundsException(result);
1478             }
1479             count = shift32;
1480             while (result < limit && count > 0) {
1481                 ch = source[result];
1482                 if (isLeadSurrogate(ch) && (result + 1 < limit)
1483                         && isTrailSurrogate(source[result + 1])) {
1484                     result++;
1485                 }
1486                 count--;
1487                 result++;
1488             }
1489         } else {
1490             if (result + shift32 < start) {
1491                 throw new StringIndexOutOfBoundsException(result);
1492             }
1493             for (count = -shift32; count > 0; count--) {
1494                 result--;
1495                 if (result < start) {
1496                     break;
1497                 }
1498                 ch = source[result];
1499                 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1500                     result--;
1501                 }
1502             }
1503         }
1504         if (count != 0) {
1505             throw new StringIndexOutOfBoundsException(shift32);
1506         }
1507         result -= start;
1508         return result;
1509     }
1510
1511     /**
1512      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1513      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1514      * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1515      * otherwise.
1516      * <p>
1517      * The overall effect is exactly as if the argument were converted to a string by the method
1518      * valueOf(char) and the characters in that string were then inserted into target at the
1519      * position indicated by offset16.
1520      * </p>
1521      * <p>
     * The offset argument must be greater than or equal to 0, and less than or equal to the length
     * of target.
1524      *
1525      * @param target
1526      *            string buffer to insert to
1527      * @param offset16
1528      *            offset which char32 will be inserted in
1529      * @param char32
1530      *            codepoint to be inserted
1531      * @return a reference to target
1532      * @exception IndexOutOfBoundsException
1533      *                thrown if offset16 is invalid.
1534      * @stable ICU 2.1
1535      */
1536     public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1537         String str = valueOf(char32);
1538         if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1539             offset16++;
1540         }
1541         target.insert(offset16, str);
1542         return target;
1543     }
1544
1545     /**
1546      * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1547      * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1548      * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1549      * <p>
1550      * The overall effect is exactly as if the argument were converted to a string by the method
1551      * valueOf(char) and the characters in that string were then inserted into target at the
1552      * position indicated by offset16.
1553      * </p>
1554      * <p>
1555      * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1556      *
1557      * @param target
1558      *            char array to insert to
1559      * @param limit
1560      *            end index of the char array, limit <= target.length
1561      * @param offset16
1562      *            offset which char32 will be inserted in
1563      * @param char32
1564      *            codepoint to be inserted
1565      * @return new limit size
1566      * @exception IndexOutOfBoundsException
1567      *                thrown if offset16 is invalid.
1568      * @stable ICU 2.1
1569      */
1570     public static int insert(char target[], int limit, int offset16, int char32) {
1571         String str = valueOf(char32);
1572         if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1573             offset16++;
1574         }
1575         int size = str.length();
1576         if (limit + size > target.length) {
1577             throw new ArrayIndexOutOfBoundsException(offset16 + size);
1578         }
1579         System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1580         target[offset16] = str.charAt(0);
1581         if (size == 2) {
1582             target[offset16 + 1] = str.charAt(1);
1583         }
1584         return limit + size;
1585     }
1586
1587     /**
1588      * Removes the codepoint at the specified position in this target (shortening target by 1
1589      * character if the codepoint is a non-supplementary, 2 otherwise).
1590      *
1591      * @param target
1592      *            string buffer to remove codepoint from
1593      * @param offset16
1594      *            offset which the codepoint will be removed
1595      * @return a reference to target
1596      * @exception IndexOutOfBoundsException
1597      *                thrown if offset16 is invalid.
1598      * @stable ICU 2.1
1599      */
1600     public static StringBuffer delete(StringBuffer target, int offset16) {
1601         int count = 1;
1602         switch (bounds(target, offset16)) {
1603         case LEAD_SURROGATE_BOUNDARY:
1604             count++;
1605             break;
1606         case TRAIL_SURROGATE_BOUNDARY:
1607             count++;
1608             offset16--;
1609             break;
1610         }
1611         target.delete(offset16, offset16 + count);
1612         return target;
1613     }
1614
1615     /**
1616      * Removes the codepoint at the specified position in this target (shortening target by 1
1617      * character if the codepoint is a non-supplementary, 2 otherwise).
1618      *
1619      * @param target
     *            char array to remove codepoint from
1621      * @param limit
1622      *            end index of the char array, limit <= target.length
1623      * @param offset16
1624      *            offset which the codepoint will be removed
1625      * @return a new limit size
1626      * @exception IndexOutOfBoundsException
1627      *                thrown if offset16 is invalid.
1628      * @stable ICU 2.1
1629      */
1630     public static int delete(char target[], int limit, int offset16) {
1631         int count = 1;
1632         switch (bounds(target, 0, limit, offset16)) {
1633         case LEAD_SURROGATE_BOUNDARY:
1634             count++;
1635             break;
1636         case TRAIL_SURROGATE_BOUNDARY:
1637             count++;
1638             offset16--;
1639             break;
1640         }
1641         System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1642         target[limit - count] = 0;
1643         return limit - count;
1644     }
1645
1646     /**
1647      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1648      * the argument codepoint. I.e., the smallest index <code>i</code> such that
1649      * <code>UTF16.charAt(source, i) ==
1650      * char32</code> is true.
1651      * <p>
1652      * If no such character occurs in this string, then -1 is returned.
1653      * </p>
1654      * <p>
1655      * Examples:<br>
1656      * UTF16.indexOf("abc", 'a') returns 0<br>
1657      * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1658      * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1659      * </p>
1660      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1661      * characters to its fullest.
1662      *
1663      * @param source
1664      *            UTF16 format Unicode string that will be searched
1665      * @param char32
1666      *            codepoint to search for
1667      * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1668      *         -1 if the codepoint does not occur.
1669      * @stable ICU 2.6
1670      */
1671     public static int indexOf(String source, int char32) {
1672         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1673             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1674         }
1675         // non-surrogate bmp
1676         if (char32 < LEAD_SURROGATE_MIN_VALUE
1677                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1678             return source.indexOf((char) char32);
1679         }
1680         // surrogate
1681         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1682             int result = source.indexOf((char) char32);
1683             if (result >= 0) {
1684                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1685                         && isTrailSurrogate(source.charAt(result + 1))) {
1686                     return indexOf(source, char32, result + 1);
1687                 }
1688                 // trail surrogate
1689                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1690                     return indexOf(source, char32, result + 1);
1691                 }
1692             }
1693             return result;
1694         }
1695         // supplementary
1696         String char32str = toString(char32);
1697         return source.indexOf(char32str);
1698     }
1699
1700     /**
1701      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1702      * the argument string str. This method is implemented based on codepoints, hence a "lead
     * surrogate character + trail surrogate character" is treated as one entity. Hence if str
     * starts with a trail surrogate character at index 0, an occurrence of str in source that is
     * immediately preceded by a lead surrogate character is not a valid match. Vice versa for a
     * lead surrogate that ends str. See example below.
1707      * <p>
1708      * If no such string str occurs in this source, then -1 is returned.
1709      * </p>
1710      * <p>
1711      * Examples:<br>
1712      * UTF16.indexOf("abc", "ab") returns 0<br>
1713      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1714      * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1715      * </p>
1716      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1717      * characters to its fullest.
1718      *
1719      * @param source
1720      *            UTF16 format Unicode string that will be searched
1721      * @param str
1722      *            UTF16 format Unicode string to search for
     * @return the index of the first occurrence of str in the argument Unicode string, or -1 if
     *         str does not occur.
1725      * @stable ICU 2.6
1726      */
1727     public static int indexOf(String source, String str) {
1728         int strLength = str.length();
1729         // non-surrogate ends
1730         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1731             return source.indexOf(str);
1732         }
1733
1734         int result = source.indexOf(str);
1735         int resultEnd = result + strLength;
1736         if (result >= 0) {
1737             // check last character
1738             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1739                     && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1740                 return indexOf(source, str, resultEnd + 1);
1741             }
1742             // check first character which is a trail surrogate
1743             if (isTrailSurrogate(str.charAt(0)) && result > 0
1744                     && isLeadSurrogate(source.charAt(result - 1))) {
1745                 return indexOf(source, str, resultEnd + 1);
1746             }
1747         }
1748         return result;
1749     }
1750
1751     /**
1752      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1753      * the argument codepoint. I.e., the smallest index i such that: <br>
1754      * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
1755      * <p>
1756      * If no such character occurs in this string, then -1 is returned.
1757      * </p>
1758      * <p>
1759      * Examples:<br>
1760      * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1761      * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1762      * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1763      * </p>
1764      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1765      * characters to its fullest.
1766      *
1767      * @param source
1768      *            UTF16 format Unicode string that will be searched
1769      * @param char32
1770      *            codepoint to search for
1771      * @param fromIndex
1772      *            the index to start the search from.
1773      * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1774      *         or after fromIndex, or -1 if the codepoint does not occur.
1775      * @stable ICU 2.6
1776      */
1777     public static int indexOf(String source, int char32, int fromIndex) {
1778         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1779             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1780         }
1781         // non-surrogate bmp
1782         if (char32 < LEAD_SURROGATE_MIN_VALUE
1783                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1784             return source.indexOf((char) char32, fromIndex);
1785         }
1786         // surrogate
1787         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1788             int result = source.indexOf((char) char32, fromIndex);
1789             if (result >= 0) {
1790                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1791                         && isTrailSurrogate(source.charAt(result + 1))) {
1792                     return indexOf(source, char32, result + 1);
1793                 }
1794                 // trail surrogate
1795                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1796                     return indexOf(source, char32, result + 1);
1797                 }
1798             }
1799             return result;
1800         }
1801         // supplementary
1802         String char32str = toString(char32);
1803         return source.indexOf(char32str, fromIndex);
1804     }
1805
1806     /**
1807      * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1808      * the argument string str. This method is implemented based on codepoints, hence a "lead
     * surrogate character + trail surrogate character" is treated as one entity. Hence if str
     * starts with a trail surrogate character at index 0, an occurrence of str in source that is
     * immediately preceded by a lead surrogate character is not a valid match. Vice versa for a
     * lead surrogate that ends str. See example below.
1813      * <p>
1814      * If no such string str occurs in this source, then -1 is returned.
1815      * </p>
1816      * <p>
1817      * Examples:<br>
1818      * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1819      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1820      * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1821      * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1822      * </p>
1823      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1824      * characters to its fullest.
1825      *
1826      * @param source
1827      *            UTF16 format Unicode string that will be searched
1828      * @param str
1829      *            UTF16 format Unicode string to search for
1830      * @param fromIndex
1831      *            the index to start the search from.
     * @return the index of the first occurrence of str in the argument Unicode string at or after
     *         fromIndex, or -1 if str does not occur.
1834      * @stable ICU 2.6
1835      */
1836     public static int indexOf(String source, String str, int fromIndex) {
1837         int strLength = str.length();
1838         // non-surrogate ends
1839         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1840             return source.indexOf(str, fromIndex);
1841         }
1842
1843         int result = source.indexOf(str, fromIndex);
1844         int resultEnd = result + strLength;
1845         if (result >= 0) {
1846             // check last character
1847             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1848                     && isTrailSurrogate(source.charAt(resultEnd))) {
1849                 return indexOf(source, str, resultEnd + 1);
1850             }
1851             // check first character which is a trail surrogate
1852             if (isTrailSurrogate(str.charAt(0)) && result > 0
1853                     && isLeadSurrogate(source.charAt(result - 1))) {
1854                 return indexOf(source, str, resultEnd + 1);
1855             }
1856         }
1857         return result;
1858     }
1859
1860     /**
1861      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1862      * the argument codepoint. I.e., the index returned is the largest value i such that:
1863      * UTF16.charAt(source, i) == char32 is true.
1864      * <p>
1865      * Examples:<br>
1866      * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1867      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1868      * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1869      * </p>
1870      * <p>
1871      * source is searched backwards starting at the last character.
1872      * </p>
1873      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1874      * characters to its fullest.
1875      *
1876      * @param source
1877      *            UTF16 format Unicode string that will be searched
1878      * @param char32
1879      *            codepoint to search for
1880      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1881      *         does not occur.
1882      * @stable ICU 2.6
1883      */
1884     public static int lastIndexOf(String source, int char32) {
1885         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1886             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1887         }
1888         // non-surrogate bmp
1889         if (char32 < LEAD_SURROGATE_MIN_VALUE
1890                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1891             return source.lastIndexOf((char) char32);
1892         }
1893         // surrogate
1894         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1895             int result = source.lastIndexOf((char) char32);
1896             if (result >= 0) {
1897                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1898                         && isTrailSurrogate(source.charAt(result + 1))) {
1899                     return lastIndexOf(source, char32, result - 1);
1900                 }
1901                 // trail surrogate
1902                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1903                     return lastIndexOf(source, char32, result - 1);
1904                 }
1905             }
1906             return result;
1907         }
1908         // supplementary
1909         String char32str = toString(char32);
1910         return source.lastIndexOf(char32str);
1911     }
1912
1913     /**
1914      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1915      * the argument string str. This method is implemented based on codepoints, hence a "lead
     * surrogate character + trail surrogate character" is treated as one entity. Hence if str
     * starts with a trail surrogate character at index 0, an occurrence of str in source that is
     * immediately preceded by a lead surrogate character is not a valid match. Vice versa for a
     * lead surrogate that ends str. See example below.
1920      * <p>
1921      * Examples:<br>
1922      * UTF16.lastIndexOf("abc", "a") returns 0<br>
1923      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1924      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1925      * </p>
1926      * <p>
1927      * source is searched backwards starting at the last character.
1928      * </p>
1929      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1930      * characters to its fullest.
1931      *
1932      * @param source
1933      *            UTF16 format Unicode string that will be searched
1934      * @param str
1935      *            UTF16 format Unicode string to search for
     * @return the index of the last occurrence of str in source, or -1 if str does not occur.
1938      * @stable ICU 2.6
1939      */
1940     public static int lastIndexOf(String source, String str) {
1941         int strLength = str.length();
1942         // non-surrogate ends
1943         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1944             return source.lastIndexOf(str);
1945         }
1946
1947         int result = source.lastIndexOf(str);
1948         if (result >= 0) {
1949             // check last character
1950             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1951                     && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1952                 return lastIndexOf(source, str, result - 1);
1953             }
1954             // check first character which is a trail surrogate
1955             if (isTrailSurrogate(str.charAt(0)) && result > 0
1956                     && isLeadSurrogate(source.charAt(result - 1))) {
1957                 return lastIndexOf(source, str, result - 1);
1958             }
1959         }
1960         return result;
1961     }
1962
1963     /**
1964      * <p>
1965      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1966      * the argument codepoint, where the result is less than or equals to fromIndex.
1967      * </p>
1968      * <p>
1969      * This method is implemented based on codepoints, hence a single surrogate character will not
1970      * match a supplementary character.
1971      * </p>
1972      * <p>
1973      * source is searched backwards starting at the last character starting at the specified index.
1974      * </p>
1975      * <p>
1976      * Examples:<br>
1977      * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1978      * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1979      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1980      * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
     * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800, 4) returns -1<br>
1982      * </p>
1983      * Note this method is provided as support to jdk 1.3, which does not support supplementary
1984      * characters to its fullest.
1985      *
1986      * @param source
1987      *            UTF16 format Unicode string that will be searched
1988      * @param char32
1989      *            codepoint to search for
1990      * @param fromIndex
1991      *            the index to start the search from. There is no restriction on the value of
1992      *            fromIndex. If it is greater than or equal to the length of this string, it has the
1993      *            same effect as if it were equal to one less than the length of this string: this
1994      *            entire string may be searched. If it is negative, it has the same effect as if it
1995      *            were -1: -1 is returned.
1996      * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1997      *         does not occur.
1998      * @stable ICU 2.6
1999      */
2000     public static int lastIndexOf(String source, int char32, int fromIndex) {
2001         if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
2002             throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
2003         }
2004         // non-surrogate bmp
2005         if (char32 < LEAD_SURROGATE_MIN_VALUE
2006                 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
2007             return source.lastIndexOf((char) char32, fromIndex);
2008         }
2009         // surrogate
2010         if (char32 < SUPPLEMENTARY_MIN_VALUE) {
2011             int result = source.lastIndexOf((char) char32, fromIndex);
2012             if (result >= 0) {
2013                 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
2014                         && isTrailSurrogate(source.charAt(result + 1))) {
2015                     return lastIndexOf(source, char32, result - 1);
2016                 }
2017                 // trail surrogate
2018                 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
2019                     return lastIndexOf(source, char32, result - 1);
2020                 }
2021             }
2022             return result;
2023         }
2024         // supplementary
2025         String char32str = toString(char32);
2026         return source.lastIndexOf(char32str, fromIndex);
2027     }
2028
2029     /**
2030      * <p>
2031      * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
2032      * the argument string str, where the result is less than or equals to fromIndex.
2033      * </p>
2034      * <p>
2035      * This method is implemented based on codepoints, hence a "lead surrogate character + trail
     * surrogate character" is treated as one entity. Hence if str starts with a trail surrogate
     * character at index 0, an occurrence of str in source that is immediately preceded by a lead
     * surrogate character is not a valid match. Vice versa for a lead surrogate that ends str.
2039      * </p>
2040      * See example below.
2041      * <p>
2042      * Examples:<br>
2043      * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
2044      * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
2045      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
2046      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
2047      * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
2048      * </p>
2049      * <p>
2050      * source is searched backwards starting at the last character.
2051      * </p>
2052      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2053      * characters to its fullest.
2054      *
2055      * @param source
2056      *            UTF16 format Unicode string that will be searched
2057      * @param str
2058      *            UTF16 format Unicode string to search for
2059      * @param fromIndex
2060      *            the index to start the search from. There is no restriction on the value of
2061      *            fromIndex. If it is greater than or equal to the length of this string, it has the
2062      *            same effect as if it were equal to one less than the length of this string: this
2063      *            entire string may be searched. If it is negative, it has the same effect as if it
2064      *            were -1: -1 is returned.
     * @return the index of the last occurrence of str in source that is less than or equal to
     *         fromIndex, or -1 if str does not occur.
2067      * @stable ICU 2.6
2068      */
2069     public static int lastIndexOf(String source, String str, int fromIndex) {
2070         int strLength = str.length();
2071         // non-surrogate ends
2072         if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
2073             return source.lastIndexOf(str, fromIndex);
2074         }
2075
2076         int result = source.lastIndexOf(str, fromIndex);
2077         if (result >= 0) {
2078             // check last character
2079             if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
2080                     && isTrailSurrogate(source.charAt(result + strLength))) {
2081                 return lastIndexOf(source, str, result - 1);
2082             }
2083             // check first character which is a trail surrogate
2084             if (isTrailSurrogate(str.charAt(0)) && result > 0
2085                     && isLeadSurrogate(source.charAt(result - 1))) {
2086                 return lastIndexOf(source, str, result - 1);
2087             }
2088         }
2089         return result;
2090     }
2091
2092     /**
2093      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
2094      * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
2095      * format Unicode string source, then source will be returned. Otherwise, a new String object is
2096      * created that represents a codepoint sequence identical to the codepoint sequence represented
2097      * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
2098      * newChar32.
2099      * <p>
2100      * Examples: <br>
2101      * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
2102      * returns "mosquito in your collar"<br>
2103      * UTF16.replace("JonL", 'q', 'x');<br>
2104      * returns "JonL" (no change)<br>
2105      * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
2106      * returns "Supplementary character !"<br>
2107      * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
2108      * returns "Supplementary character \ud800\udc00"<br>
2109      * </p>
2110      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2111      * characters to its fullest.
2112      *
2113      * @param source
2114      *            UTF16 format Unicode string which the codepoint replacements will be based on.
2115      * @param oldChar32
2116      *            non-zero old codepoint to be replaced.
2117      * @param newChar32
2118      *            the new codepoint to replace oldChar32
2119      * @return new String derived from source by replacing every occurrence of oldChar32 with
2120      *         newChar32, unless when no oldChar32 is found in source then source will be returned.
2121      * @stable ICU 2.6
2122      */
2123     public static String replace(String source, int oldChar32, int newChar32) {
2124         if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
2125             throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
2126         }
2127         if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
2128             throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
2129         }
2130
2131         int index = indexOf(source, oldChar32);
2132         if (index == -1) {
2133             return source;
2134         }
2135         String newChar32Str = toString(newChar32);
2136         int oldChar32Size = 1;
2137         int newChar32Size = newChar32Str.length();
2138         StringBuffer result = new StringBuffer(source);
2139         int resultIndex = index;
2140
2141         if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
2142             oldChar32Size = 2;
2143         }
2144
2145         while (index != -1) {
2146             int endResultIndex = resultIndex + oldChar32Size;
2147             result.replace(resultIndex, endResultIndex, newChar32Str);
2148             int lastEndIndex = index + oldChar32Size;
2149             index = indexOf(source, oldChar32, lastEndIndex);
2150             resultIndex += newChar32Size + index - lastEndIndex;
2151         }
2152         return result.toString();
2153     }
2154
2155     /**
2156      * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
2157      * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
2158      * source, then source will be returned. Otherwise, a new String object is created that
2159      * represents a codepoint sequence identical to the codepoint sequence represented by source,
2160      * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2161      * <p>
2162      * Examples: <br>
2163      * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2164      * returns "mosquito in your collar"<br>
2165      * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2166      * returns "cat in your cellar"<br>
2167      * UTF16.replace("JonL", "q", "x");<br>
2168      * returns "JonL" (no change)<br>
     * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", "!"); <br>
     * returns "Supplementary character !"<br>
     * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", "!"); <br>
2172      * returns "Supplementary character \ud800\udc00"<br>
2173      * </p>
2174      * Note this method is provided as support to jdk 1.3, which does not support supplementary
2175      * characters to its fullest.
2176      *
2177      * @param source
2178      *            UTF16 format Unicode string which the replacements will be based on.
2179      * @param oldStr
2180      *            non-zero-length string to be replaced.
2181      * @param newStr
2182      *            the new string to replace oldStr
2183      * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2184      *         When no oldStr is found in source, then source will be returned.
2185      * @stable ICU 2.6
2186      */
2187     public static String replace(String source, String oldStr, String newStr) {
2188         int index = indexOf(source, oldStr);
2189         if (index == -1) {
2190             return source;
2191         }
2192         int oldStrSize = oldStr.length();
2193         int newStrSize = newStr.length();
2194         StringBuffer result = new StringBuffer(source);
2195         int resultIndex = index;
2196
2197         while (index != -1) {
2198             int endResultIndex = resultIndex + oldStrSize;
2199             result.replace(resultIndex, endResultIndex, newStr);
2200             int lastEndIndex = index + oldStrSize;
2201             index = indexOf(source, oldStr, lastEndIndex);
2202             resultIndex += newStrSize + index - lastEndIndex;
2203         }
2204         return result.toString();
2205     }
2206
2207     /**
     * Returns a new StringBuffer holding the content of a UTF16 format Unicode string in reverse
     * order; the source buffer itself is not modified. Surrogate pairs are kept intact (lead
     * surrogate before trail surrogate) instead of blindly reversing every char.
     * <p>
     * Examples:<br>
     * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
     * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
     *
     * @param source
     *            the source StringBuffer that contains UTF16 format Unicode string to be reversed
     * @return a new StringBuffer containing the reversed UTF16 format Unicode string.
2218      * @stable ICU 2.6
2219      */
2220     public static StringBuffer reverse(StringBuffer source) {
2221         int length = source.length();
2222         StringBuffer result = new StringBuffer(length);
2223         for (int i = length; i-- > 0;) {
2224             char ch = source.charAt(i);
2225             if (isTrailSurrogate(ch) && i > 0) {
2226                 char ch2 = source.charAt(i - 1);
2227                 if (isLeadSurrogate(ch2)) {
2228                     result.append(ch2);
2229                     result.append(ch);
2230                     --i;
2231                     continue;
2232                 }
2233             }
2234             result.append(ch);
2235         }
2236         return result;
2237     }
2238
2239     /**
2240      * Check if the string contains more Unicode code points than a certain number. This is more
2241      * efficient than counting all code points in the entire string and comparing that number with a
2242      * threshold. This function may not need to scan the string at all if the length is within a
2243      * certain range, and never needs to count more than 'number + 1' code points. Logically
2244      * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
2245      * code units.
2246      *
2247      * @param source
2248      *            The input string.
2249      * @param number
2250      *            The number of code points in the string is compared against the 'number'
2251      *            parameter.
2252      * @return boolean value for whether the string contains more Unicode code points than 'number'.
2253      * @stable ICU 2.4
2254      */
2255     public static boolean hasMoreCodePointsThan(String source, int number) {
2256         if (number < 0) {
2257             return true;
2258         }
2259         if (source == null) {
2260             return false;
2261         }
2262         int length = source.length();
2263
2264         // length >= 0 known
2265         // source contains at least (length + 1) / 2 code points: <= 2
2266         // chars per cp
2267         if (((length + 1) >> 1) > number) {
2268             return true;
2269         }
2270
2271         // check if source does not even contain enough chars
2272         int maxsupplementary = length - number;
2273         if (maxsupplementary <= 0) {
2274             return false;
2275         }
2276
2277         // there are maxsupplementary = length - number more chars than
2278         // asked-for code points
2279
2280         // count code points until they exceed and also check that there are
2281         // no more than maxsupplementary supplementary code points (char pairs)
2282         int start = 0;
2283         while (true) {
2284             if (length == 0) {
2285                 return false;
2286             }
2287             if (number == 0) {
2288                 return true;
2289             }
2290             if (isLeadSurrogate(source.charAt(start++)) && start != length
2291                     && isTrailSurrogate(source.charAt(start))) {
2292                 start++;
2293                 if (--maxsupplementary <= 0) {
2294                     // too many pairs - too few code points
2295                     return false;
2296                 }
2297             }
2298             --number;
2299         }
2300     }
2301
2302     /**
2303      * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2304      * code points than a certain number. This is more efficient than counting all code points in
2305      * the entire char array range and comparing that number with a threshold. This function may not
2306      * need to scan the char array at all if start and limit is within a certain range, and never
2307      * needs to count more than 'number + 1' code points. Logically equivalent to
2308      * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
2309      * or two code units.
2310      *
2311      * @param source
2312      *            array of UTF-16 chars
2313      * @param start
2314      *            offset to substring in the source array for analyzing
2315      * @param limit
2316      *            offset to substring in the source array for analyzing
2317      * @param number
2318      *            The number of code points in the string is compared against the 'number'
2319      *            parameter.
2320      * @return boolean value for whether the string contains more Unicode code points than 'number'.
     * @exception IndexOutOfBoundsException
     *                thrown when start or limit is negative, or when limit &lt; start
2323      * @stable ICU 2.4
2324      */
2325     public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2326         int length = limit - start;
2327         if (length < 0 || start < 0 || limit < 0) {
2328             throw new IndexOutOfBoundsException(
2329                     "Start and limit indexes should be non-negative and start <= limit");
2330         }
2331         if (number < 0) {
2332             return true;
2333         }
2334         if (source == null) {
2335             return false;
2336         }
2337
2338         // length >= 0 known
2339         // source contains at least (length + 1) / 2 code points: <= 2
2340         // chars per cp
2341         if (((length + 1) >> 1) > number) {
2342             return true;
2343         }
2344
2345         // check if source does not even contain enough chars
2346         int maxsupplementary = length - number;
2347         if (maxsupplementary <= 0) {
2348             return false;
2349         }
2350
2351         // there are maxsupplementary = length - number more chars than
2352         // asked-for code points
2353
2354         // count code points until they exceed and also check that there are
2355         // no more than maxsupplementary supplementary code points (char pairs)
2356         while (true) {
2357             if (length == 0) {
2358                 return false;
2359             }
2360             if (number == 0) {
2361                 return true;
2362             }
2363             if (isLeadSurrogate(source[start++]) && start != limit
2364                     && isTrailSurrogate(source[start])) {
2365                 start++;
2366                 if (--maxsupplementary <= 0) {
2367                     // too many pairs - too few code points
2368                     return false;
2369                 }
2370             }
2371             --number;
2372         }
2373     }
2374
2375     /**
2376      * Check if the string buffer contains more Unicode code points than a certain number. This is
2377      * more efficient than counting all code points in the entire string buffer and comparing that
2378      * number with a threshold. This function may not need to scan the string buffer at all if the
2379      * length is within a certain range, and never needs to count more than 'number + 1' code
2380      * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
2381      * either one or two code units.
2382      *
2383      * @param source
2384      *            The input string buffer.
2385      * @param number
2386      *            The number of code points in the string buffer is compared against the 'number'
2387      *            parameter.
2388      * @return boolean value for whether the string buffer contains more Unicode code points than
2389      *         'number'.
2390      * @stable ICU 2.4
2391      */
2392     public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2393         if (number < 0) {
2394             return true;
2395         }
2396         if (source == null) {
2397             return false;
2398         }
2399         int length = source.length();
2400
2401         // length >= 0 known
2402         // source contains at least (length + 1) / 2 code points: <= 2
2403         // chars per cp
2404         if (((length + 1) >> 1) > number) {
2405             return true;
2406         }
2407
2408         // check if source does not even contain enough chars
2409         int maxsupplementary = length - number;
2410         if (maxsupplementary <= 0) {
2411             return false;
2412         }
2413
2414         // there are maxsupplementary = length - number more chars than
2415         // asked-for code points
2416
2417         // count code points until they exceed and also check that there are
2418         // no more than maxsupplementary supplementary code points (char pairs)
2419         int start = 0;
2420         while (true) {
2421             if (length == 0) {
2422                 return false;
2423             }
2424             if (number == 0) {
2425                 return true;
2426             }
2427             if (isLeadSurrogate(source.charAt(start++)) && start != length
2428                     && isTrailSurrogate(source.charAt(start))) {
2429                 start++;
2430                 if (--maxsupplementary <= 0) {
2431                     // too many pairs - too few code points
2432                     return false;
2433                 }
2434             }
2435             --number;
2436         }
2437     }
2438
2439     /**
2440      * Cover JDK 1.5 API. Create a String from an array of codePoints.
2441      *
2442      * @param codePoints
2443      *            the code array
2444      * @param offset
2445      *            the start of the text in the code point array
2446      * @param count
2447      *            the number of code points
     * @return a String representing the count code points starting at offset
     * @throws IllegalArgumentException
     *             if count is negative or an invalid code point is encountered
     * @throws IndexOutOfBoundsException
     *             if a code point has to be read from outside the codePoints array.
2453      * @stable ICU 3.0
2454      */
2455     public static String newString(int[] codePoints, int offset, int count) {
2456         if (count < 0) {
2457             throw new IllegalArgumentException();
2458         }
2459         char[] chars = new char[count];
2460         int w = 0;
2461         for (int r = offset, e = offset + count; r < e; ++r) {
2462             int cp = codePoints[r];
2463             if (cp < 0 || cp > 0x10ffff) {
2464                 throw new IllegalArgumentException();
2465             }
2466             while (true) {
2467                 try {
2468                     if (cp < 0x010000) {
2469                         chars[w] = (char) cp;
2470                         w++;
2471                     } else {
2472                         chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2473                         chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2474                         w += 2;
2475                     }
2476                     break;
2477                 } catch (IndexOutOfBoundsException ex) {
2478                     int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2479                             / (r - offset + 1)));
2480                     char[] temp = new char[newlen];
2481                     System.arraycopy(chars, 0, temp, 0, w);
2482                     chars = temp;
2483                 }
2484             }
2485         }
2486         return new String(chars, 0, w);
2487     }
2488
2489     /**
2490      * <p>
2491      * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2492      * modes
2493      * </p>
2494      * <ul>
2495      * <li> Code point comparison or code unit comparison
2496      * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2497      * with special handling for character 'i'.
2498      * </ul>
2499      * <p>
2500      * The code unit or code point comparison differ only when comparing supplementary code points
2501      * (&#92;u10000..&#92;u10ffff) to BMP code points near the end of the BMP (i.e.,
2502      * &#92;ue000..&#92;uffff). In code unit comparison, high BMP code points sort after
2503      * supplementary code points because they are stored as pairs of surrogates which are at
2504      * &#92;ud800..&#92;udfff.
2505      * </p>
2506      *
2507      * @see #FOLD_CASE_DEFAULT
2508      * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2509      * @stable ICU 2.1
2510      */
2511     public static final class StringComparator implements java.util.Comparator {
2512         // public constructor ------------------------------------------------
2513
2514         /**
2515          * Default constructor that does code unit comparison and case sensitive comparison.
2516          *
2517          * @stable ICU 2.1
2518          */
2519         public StringComparator() {
2520             this(false, false, FOLD_CASE_DEFAULT);
2521         }
2522
2523         /**
2524          * Constructor that does comparison based on the argument options.
2525          *
2526          * @param codepointcompare
2527          *            flag to indicate true for code point comparison or false for code unit
2528          *            comparison.
2529          * @param ignorecase
2530          *            false for case sensitive comparison, true for case-insensitive comparison
2531          * @param foldcaseoption
2532          *            FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2533          *            when ignorecase is set to true. If ignorecase is false, this option is
2534          *            ignored.
2535          * @see #FOLD_CASE_DEFAULT
2536          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2537          * @throws IllegalArgumentException
2538          *             if foldcaseoption is out of range
2539          * @stable ICU 2.4
2540          */
2541         public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2542             setCodePointCompare(codepointcompare);
2543             m_ignoreCase_ = ignorecase;
2544             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2545                 throw new IllegalArgumentException("Invalid fold case option");
2546             }
2547             m_foldCase_ = foldcaseoption;
2548         }
2549
2550         // public data member ------------------------------------------------
2551
2552         /**
2553          * <p>
2554          * Option value for case folding comparison:
2555          * </p>
2556          * <p>
2557          * Comparison is case insensitive, strings are folded using default mappings defined in
2558          * Unicode data file CaseFolding.txt, before comparison.
2559          * </p>
2560          *
2561          * @stable ICU 2.4
2562          */
2563         public static final int FOLD_CASE_DEFAULT = 0;
2564
2565         /**
2566          * <p>
2567          * Option value for case folding comparison:
2568          * </p>
2569          * <p>
2570          * Comparison is case insensitive, strings are folded using modified mappings defined in
2571          * Unicode data file CaseFolding.txt, before comparison.
2572          * </p>
2573          * <p>
2574          * The modified set of mappings is provided in a Unicode data file CaseFolding.txt to handle
2575          * dotted I and dotless i appropriately for Turkic languages (tr, az).
2576          * </p>
2577          * <p>
2578          * Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that are to be
2579          * included for default mappings and excluded for the Turkic-specific mappings.
2580          * </p>
2581          * <p>
2582          * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that are to be
2583          * excluded for default mappings and included for the Turkic-specific mappings.
2584          * </p>
2585          *
2586          * @stable ICU 2.4
2587          */
2588         public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2589
2590         // public methods ----------------------------------------------------
2591
2592         // public setters ----------------------------------------------------
2593
2594         /**
2595          * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2596          * is set to code unit compare
2597          *
2598          * @param flag
2599          *            true for code point compare, false for code unit compare
2600          * @stable ICU 2.4
2601          */
2602         public void setCodePointCompare(boolean flag) {
2603             if (flag) {
2604                 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2605             } else {
2606                 m_codePointCompare_ = 0;
2607             }
2608         }
2609
2610         /**
2611          * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2612          * case sensitive comparison mode if set to false.
2613          *
2614          * @param ignorecase
2615          *            true for case-insitive comparison, false for case sensitive comparison
2616          * @param foldcaseoption
2617          *            FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2618          *            when ignorecase is set to true. If ignorecase is false, this option is
2619          *            ignored.
2620          * @see #FOLD_CASE_DEFAULT
2621          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2622          * @stable ICU 2.4
2623          */
2624         public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2625             m_ignoreCase_ = ignorecase;
2626             if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2627                 throw new IllegalArgumentException("Invalid fold case option");
2628             }
2629             m_foldCase_ = foldcaseoption;
2630         }
2631
2632         // public getters ----------------------------------------------------
2633
2634         /**
2635          * Checks if the comparison mode is code point compare.
2636          *
2637          * @return true for code point compare, false for code unit compare
2638          * @stable ICU 2.4
2639          */
2640         public boolean getCodePointCompare() {
2641             return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2642         }
2643
2644         /**
2645          * Checks if Comparator is in the case insensitive mode.
2646          *
2647          * @return true if Comparator performs case insensitive comparison, false otherwise
2648          * @stable ICU 2.4
2649          */
2650         public boolean getIgnoreCase() {
2651             return m_ignoreCase_;
2652         }
2653
2654         /**
2655          * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2656          *
2657          * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2658          * @see #FOLD_CASE_DEFAULT
2659          * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2660          * @stable ICU 2.4
2661          */
2662         public int getIgnoreCaseOption() {
2663             return m_foldCase_;
2664         }
2665
2666         // public other methods ----------------------------------------------
2667
2668         /**
2669          * Compare two strings depending on the options selected during construction.
2670          *
2671          * @param a
2672          *            first source string.
2673          * @param b
2674          *            second source string.
2675          * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b,
2676          *         a positive value is returned.
2677          * @exception ClassCastException
2678          *                thrown when either a or b is not a String object
2679          * @stable ICU 2.4
2680          */
2681         public int compare(Object a, Object b) {
2682             String str1 = (String) a;
2683             String str2 = (String) b;
2684
2685             if (str1 == str2) {
2686                 return 0;
2687             }
2688             if (str1 == null) {
2689                 return -1;
2690             }
2691             if (str2 == null) {
2692                 return 1;
2693             }
2694
2695             if (m_ignoreCase_) {
2696                 return compareCaseInsensitive(str1, str2);
2697             }
2698             return compareCaseSensitive(str1, str2);
2699         }
2700
2701         // private data member ----------------------------------------------
2702
2703         /**
2704          * Code unit comparison flag. True if code unit comparison is required. False if code point
2705          * comparison is required.
2706          */
2707         private int m_codePointCompare_;
2708
2709         /**
2710          * Fold case comparison option.
2711          */
2712         private int m_foldCase_;
2713
2714         /**
2715          * Flag indicator if ignore case is to be used during comparison
2716          */
2717         private boolean m_ignoreCase_;
2718
2719         /**
2720          * Code point order offset for surrogate characters
2721          */
2722         private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2723
2724         // private method ---------------------------------------------------
2725
2726         /**
2727          * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2728          * easier.
2729          *
2730          * @param s1
2731          *            first string to compare
2732          * @param s2
2733          *            second string to compare
2734          * @return -1 is s1 &lt; s2, 0 if equals,
2735          */
2736         private int compareCaseInsensitive(String s1, String s2) {
2737             return NormalizerImpl.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2738                     | Normalizer.COMPARE_IGNORE_CASE);
2739         }
2740
2741         /**
2742          * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2743          * easier.
2744          *
2745          * @param s1
2746          *            first string to compare
2747          * @param s2
2748          *            second string to compare
2749          * @return -1 is s1 &lt; s2, 0 if equals,
2750          */
2751         private int compareCaseSensitive(String s1, String s2) {
2752             // compare identical prefixes - they do not need to be fixed up
2753             // limit1 = start1 + min(lenght1, length2)
2754             int length1 = s1.length();
2755             int length2 = s2.length();
2756             int minlength = length1;
2757             int result = 0;
2758             if (length1 < length2) {
2759                 result = -1;
2760             } else if (length1 > length2) {
2761                 result = 1;
2762                 minlength = length2;
2763             }
2764
2765             char c1 = 0;
2766             char c2 = 0;
2767             int index = 0;
2768             for (; index < minlength; index++) {
2769                 c1 = s1.charAt(index);
2770                 c2 = s2.charAt(index);
2771                 // check pseudo-limit
2772                 if (c1 != c2) {
2773                     break;
2774                 }
2775             }
2776
2777             if (index == minlength) {
2778                 return result;
2779             }
2780
2781             boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2782             // if both values are in or above the surrogate range, fix them up
2783             if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2784                     && codepointcompare) {
2785                 // subtract 0x2800 from BMP code points to make them smaller
2786                 // than supplementary ones
2787                 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2788                         || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2789                     // part of a surrogate pair, leave >=d800
2790                 } else {
2791                     // BMP code point - may be surrogate code point - make
2792                     // < d800
2793                     c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2794                 }
2795
2796                 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2797                         || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2798                     // part of a surrogate pair, leave >=d800
2799                 } else {
2800                     // BMP code point - may be surrogate code point - make <d800
2801                     c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2802                 }
2803             }
2804
2805             // now c1 and c2 are in UTF-32-compatible order
2806             return c1 - c2;
2807         }
2808     }
2809
2810     // private data members -------------------------------------------------
2811
    /**
     * Shift value for lead surrogate to form a supplementary character: the number of bits a
     * code point is shifted right to obtain the part that goes into the lead surrogate.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate; it selects the low 10 bits,
     * i.e. the part of a code point that goes into the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Base value for computing lead surrogates: adding (code point >> LEAD_SURROGATE_SHIFT_) to
     * this offset gives the lead surrogate of a supplementary code point.
     */
    private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2827
2828     // private methods ------------------------------------------------------
2829
2830     /**
2831      * <p>
2832      * Converts argument code point and returns a String object representing the code point's value
2833      * in UTF16 format.
2834      * </p>
2835      * <p>
     * This method does not check the validity of the code point; the results are not guaranteed
     * if an invalid code point is passed as argument.
2838      * </p>
2839      * <p>
2840      * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2841      * </p>
2842      *
2843      * @param ch
2844      *            code point
2845      * @return string representation of the code point
2846      */
2847     private static String toString(int ch) {
2848         if (ch < SUPPLEMENTARY_MIN_VALUE) {
2849             return String.valueOf((char) ch);
2850         }
2851
2852         StringBuffer result = new StringBuffer();
2853         result.append(getLeadSurrogate(ch));
2854         result.append(getTrailSurrogate(ch));
2855         return result.toString();
2856     }
2857 }
2858 // eof