3 *******************************************************************************
4 * Copyright (C) 1996-2009, International Business Machines Corporation and *
5 * others. All Rights Reserved. *
6 *******************************************************************************
9 package com.ibm.icu.text;
11 import com.ibm.icu.impl.UCharacterProperty;
12 import com.ibm.icu.impl.NormalizerImpl;
16 * Standalone utility class providing UTF16 character conversions and indexing conversions.
19 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
20 * so searching for strings is a safe operation. Similarly, concatenation is always safe.
21 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
22 * values for start and end are on those boundaries, since they arose from operations like
23 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
25 * <strong>Examples:</strong>
27 * The following examples illustrate use of some of these methods.
30 * // iteration forwards: Original
31 * for (int i = 0; i < s.length(); ++i) {
32 * char ch = s.charAt(i);
33 * doSomethingWith(ch);
36 * // iteration forwards: Changes for UTF-32
38 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
39 * ch = UTF16.charAt(s, i);
40 * doSomethingWith(ch);
43 * // iteration backwards: Original
44 * for (int i = s.length() - 1; i >= 0; --i) {
45 * char ch = s.charAt(i);
46 * doSomethingWith(ch);
49 * // iteration backwards: Changes for UTF-32
51 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
52 * ch = UTF16.charAt(s, i);
53 * doSomethingWith(ch);
57 * <strong>Notes:</strong>
59 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
60 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
61 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
62 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
63 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
64 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
65 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
66 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
68 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
69 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
70 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
71 * check for validity if desired. </li>
72 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
73 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
74 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
76 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
77 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
78 * percentage of all the text in the world, the singleton case should always be optimized for. </li>
81 * @author Mark Davis, with help from Markus Scherer
85 public final class UTF16 {
86 // public variables ---------------------------------------------------
89 * Value returned in <code><a href="#bounds(java.lang.String, int)">
90 * bounds()</a></code>.
91 * These values are chosen specifically so that it actually represents the position of the
92 * character [offset16 - (value >> 2), offset16 + (value & 3)]
// Result codes of bounds(). Each value encodes the extent of the character around offset16 as
// [offset16 - (value >> 2), offset16 + (value & 3)]:
//   1 -> [offset16, offset16 + 1], 2 -> [offset16, offset16 + 2], 5 -> [offset16 - 1, offset16 + 1].
96 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
97 TRAIL_SURROGATE_BOUNDARY = 5;
100 * The lowest Unicode code point value.
// Inclusive limits of the code point range accepted by valueOf(int) and append().
104 public static final int CODEPOINT_MIN_VALUE = 0;
107 * The highest Unicode code point value (scalar value) according to the Unicode Standard.
111 public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
114 * The minimum value for Supplementary code points
// Code points at or above this value are written as a lead/trail surrogate pair by append().
118 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
121 * Lead surrogate minimum value
125 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
128 * Trail surrogate minimum value
132 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
135 * Lead surrogate maximum value
139 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
142 * Trail surrogate maximum value
146 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
149 * Surrogate minimum value
// The combined surrogate range runs from the first lead value to the last trail value.
153 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
156 * Maximum surrogate value
160 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
163 * Lead surrogate bitmask
// Clears the low 10 bits; isLeadSurrogate() compares the remainder with LEAD_SURROGATE_BITS.
165 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
168 * Trail surrogate bitmask
// Same mask as for leads; isTrailSurrogate() compares the remainder with TRAIL_SURROGATE_BITS.
170 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
// Clears the low 11 bits, so lead and trail surrogates reduce to the same value (SURROGATE_BITS).
175 private static final int SURROGATE_BITMASK = 0xFFFFF800;
178 * Lead surrogate bits
180 private static final int LEAD_SURROGATE_BITS = 0xD800;
183 * Trail surrogate bits
185 private static final int TRAIL_SURROGATE_BITS = 0xDC00;
// Value of (char16 & SURROGATE_BITMASK) for every code unit in 0xD800..0xDFFF; used by isSurrogate().
190 private static final int SURROGATE_BITS = 0xD800;
192 // constructor --------------------------------------------------------
196 * Prevent instance from being created.
202 // public method ------------------------------------------------------
205 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
206 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
207 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
208 * UCharacter.isLegal()</a></code>
209 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
210 * character will be returned. If a complete supplementary character is not found the incomplete
211 * character will be returned
214 * array of UTF-16 chars
216 * UTF-16 offset to the start of the character.
217 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
218 * of that codepoint are the same as in <code>bounds32()</code>.
219 * @exception IndexOutOfBoundsException
220 * thrown if offset16 is out of bounds.
// Returns the code point that contains the UTF-16 code unit at offset16 of a String.
// The unit is fetched with String.charAt, so an out-of-range offset16 fails inside that call.
223 public static int charAt(String source, int offset16) {
224 char single = source.charAt(offset16);
// Fast path: a unit below 0xD800 cannot be a surrogate.
// NOTE(review): the statement inside this branch is not visible in this listing — presumably it
// returns single; confirm against the full file.
225 if (single < LEAD_SURROGATE_MIN_VALUE) {
// Surrogates and units above the surrogate range are resolved by the private helper.
228 return _charAt(source, offset16, single);
// Slow path of charAt(String, int): resolves the code point for a unit that is not below 0xD800.
// NOTE(review): several short lines of this method (index adjustments, an early return, closing
// braces) are not visible in this listing; the notes below mark where that matters.
231 private static int _charAt(String source, int offset16, char single) {
// Units above 0xDFFF are not surrogates.
// NOTE(review): the body of this branch is not shown — presumably it returns single; confirm.
232 if (single > TRAIL_SURROGATE_MAX_VALUE) {
236 // Convert the UTF-16 surrogate pair if necessary.
237 // For simplicity in usage, and because the frequency of pairs is
238 // low, look both directions.
// single is a surrogate at this point; a value at or below 0xDBFF makes it a lead surrogate.
240 if (single <= LEAD_SURROGATE_MAX_VALUE) {
// Guards the read of the unit that should follow the lead surrogate.
// NOTE(review): this test and the read below only address the *next* unit if offset16 was
// advanced by one just before; that statement is not visible here — confirm.
242 if (source.length() != offset16) {
243 char trail = source.charAt(offset16);
244 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
// Lead followed by trail: the pair is handed to UCharacterProperty.getRawSupplementary.
245 return UCharacterProperty.getRawSupplementary(single, trail);
251 // single is a trail surrogate so
// NOTE(review): presumably offset16 has been stepped back to the preceding unit (with a
// lower-bound guard) before this read; those lines are not visible — confirm.
252 char lead = source.charAt(offset16);
253 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
254 return UCharacterProperty.getRawSupplementary(lead, single);
258 return single; // return unmatched surrogate
261 //#if defined(FOUNDATION10) || defined(J2SE13)
264 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
265 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
266 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
267 * UCharacter.isLegal()</a></code>
268 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
269 * character will be returned. If a complete supplementary character is not found the incomplete
270 * character will be returned
273 * array of UTF-16 chars
275 * UTF-16 offset to the start of the character.
276 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
277 * of that codepoint are the same as in <code>bounds32()</code>.
278 * @exception IndexOutOfBoundsException
279 * thrown if offset16 is out of bounds.
// CharSequence overload: returns the code point that contains the code unit at offset16.
// The unit is fetched with CharSequence.charAt, so range checking is left to that call.
282 public static int charAt(CharSequence source, int offset16) {
283 char single = source.charAt(offset16);
// Fast path: a unit below 0xD800 cannot be a surrogate.
// NOTE(review): the statement inside this branch is not visible in this listing — presumably it
// returns single; confirm against the full file.
284 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// Surrogates and units above the surrogate range are resolved by the private helper.
287 return _charAt(source, offset16, single);
// Slow path of charAt(CharSequence, int): resolves the code point for a unit not below 0xD800.
// NOTE(review): several short lines of this method (index adjustments, an early return, closing
// braces) are not visible in this listing; the notes below mark where that matters.
290 private static int _charAt(CharSequence source, int offset16, char single) {
// Units above 0xDFFF are not surrogates.
// NOTE(review): the body of this branch is not shown — presumably it returns single; confirm.
291 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
295 // Convert the UTF-16 surrogate pair if necessary.
296 // For simplicity in usage, and because the frequency of pairs is
297 // low, look both directions.
// single is a surrogate at this point; a value at or below 0xDBFF makes it a lead surrogate.
299 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
// Guards the read of the unit that should follow the lead surrogate.
// NOTE(review): this only addresses the *next* unit if offset16 was advanced by one just
// before; that statement is not visible here — confirm.
301 if (source.length() != offset16) {
302 char trail = source.charAt(offset16);
303 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
304 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
// Lead followed by trail: the pair is handed to UCharacterProperty.getRawSupplementary.
305 return UCharacterProperty.getRawSupplementary(single, trail);
311 // single is a trail surrogate so
// NOTE(review): presumably offset16 has been stepped back to the preceding unit (with a
// lower-bound guard) before this read; those lines are not visible — confirm.
312 char lead = source.charAt(offset16);
313 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
314 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
315 return UCharacterProperty.getRawSupplementary(lead, single);
319 return single; // return unmatched surrogate
325 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
326 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
327 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
329 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
330 * character will be returned. If a complete supplementary character is not found the incomplete
331 * character will be returned
334 * UTF-16 chars string buffer
336 * UTF-16 offset to the start of the character.
337 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
338 * of that codepoint are the same as in <code>bounds32()</code>.
339 * @exception IndexOutOfBoundsException
340 * thrown if offset16 is out of bounds.
// StringBuffer overload: returns the code point that contains the code unit at offset16.
343 public static int charAt(StringBuffer source, int offset16) {
// Explicit range check; valid offsets are 0 .. source.length() - 1.
344 if (offset16 < 0 || offset16 >= source.length()) {
345 throw new StringIndexOutOfBoundsException(offset16);
348 char single = source.charAt(offset16);
// NOTE(review): the body of this branch is not visible — presumably a non-surrogate unit is
// returned as is; confirm.
349 if (!isSurrogate(single)) {
353 // Convert the UTF-16 surrogate pair if necessary.
354 // For simplicity in usage, and because the frequency of pairs is
355 // low, look both directions.
// single is a surrogate; a value at or below 0xDBFF makes it a lead surrogate.
357 if (single <= LEAD_SURROGATE_MAX_VALUE) {
// Guards the read of the unit that should follow the lead surrogate.
// NOTE(review): this only addresses the *next* unit if offset16 was advanced by one just
// before; that statement is not visible here — confirm.
359 if (source.length() != offset16) {
360 char trail = source.charAt(offset16);
361 if (isTrailSurrogate(trail))
// Lead followed by trail: the pair is handed to UCharacterProperty.getRawSupplementary.
362 return UCharacterProperty.getRawSupplementary(single, trail);
367 // single is a trail surrogate so
// NOTE(review): presumably offset16 has been stepped back to the preceding unit (with a
// lower-bound guard) before this read; those lines are not visible — confirm.
368 char lead = source.charAt(offset16);
369 if (isLeadSurrogate(lead)) {
370 return UCharacterProperty.getRawSupplementary(lead, single);
374 return single; // return unmatched surrogate
378 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
379 * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
380 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
382 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
383 * character will be returned. If a complete supplementary character is not found the incomplete
384 * character will be returned
387 * array of UTF-16 chars
389 * offset to substring in the source array for analyzing
391 * offset to substring in the source array for analyzing
393 * UTF-16 offset relative to start
394 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
395 * of that codepoint are the same as in <code>bounds32()</code>.
396 * @exception IndexOutOfBoundsException
397 * thrown if offset16 is not within the range of start and limit.
// char[] overload: returns the code point containing the code unit at offset16, looking for the
// other half of a surrogate pair only inside the sub-array [start, limit).
400 public static int charAt(char source[], int start, int limit, int offset16) {
// NOTE(review): the Javadoc describes offset16 as relative to start, while this check compares
// it with start and limit as an absolute index; presumably a line that adds start to offset16
// precedes it but is not visible in this listing — confirm.
402 if (offset16 < start || offset16 >= limit) {
403 throw new ArrayIndexOutOfBoundsException(offset16);
406 char single = source[offset16];
// NOTE(review): the body of this branch is not visible — presumably a non-surrogate unit is
// returned as is; confirm.
407 if (!isSurrogate(single)) {
411 // Convert the UTF-16 surrogate pair if necessary.
412 // For simplicity in usage, and because the frequency of pairs is
413 // low, look both directions.
414 if (single <= LEAD_SURROGATE_MAX_VALUE) {
// A lead surrogate whose successor would fall at or beyond limit has no partner in range.
// NOTE(review): offset16 is presumably advanced by one before this test, and the branch
// presumably returns single; neither line is visible — confirm.
416 if (offset16 >= limit) {
419 char trail = source[offset16];
420 if (isTrailSurrogate(trail)) {
421 return UCharacterProperty.getRawSupplementary(single, trail);
423 } else { // isTrailSurrogate(single), so
// A trail surrogate sitting on the first element of the sub-array has no predecessor in range.
// NOTE(review): the body of this branch and the step back to the previous unit are not
// visible — confirm.
424 if (offset16 == start) {
428 char lead = source[offset16];
429 if (isLeadSurrogate(lead))
430 return UCharacterProperty.getRawSupplementary(lead, single);
432 return single; // return unmatched surrogate
436 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
437 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
438 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
440 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
441 * character will be returned. If a complete supplementary character is not found the incomplete
442 * character will be returned
445 * UTF-16 chars string buffer
447 * UTF-16 offset to the start of the character.
448 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
449 * of that codepoint are the same as in <code>bounds32()</code>.
450 * @exception IndexOutOfBoundsException
451 * thrown if offset16 is out of bounds.
// Replaceable overload: returns the code point that contains the code unit at offset16.
454 public static int charAt(Replaceable source, int offset16) {
// Explicit range check; valid offsets are 0 .. source.length() - 1.
455 if (offset16 < 0 || offset16 >= source.length()) {
456 throw new StringIndexOutOfBoundsException(offset16);
459 char single = source.charAt(offset16);
// NOTE(review): the body of this branch is not visible — presumably a non-surrogate unit is
// returned as is; confirm.
460 if (!isSurrogate(single)) {
464 // Convert the UTF-16 surrogate pair if necessary.
465 // For simplicity in usage, and because the frequency of pairs is
466 // low, look both directions.
// single is a surrogate; a value at or below 0xDBFF makes it a lead surrogate.
468 if (single <= LEAD_SURROGATE_MAX_VALUE) {
// Guards the read of the unit that should follow the lead surrogate.
// NOTE(review): this only addresses the *next* unit if offset16 was advanced by one just
// before; that statement is not visible here — confirm.
470 if (source.length() != offset16) {
471 char trail = source.charAt(offset16);
472 if (isTrailSurrogate(trail))
// Lead followed by trail: the pair is handed to UCharacterProperty.getRawSupplementary.
473 return UCharacterProperty.getRawSupplementary(single, trail);
478 // single is a trail surrogate so
// NOTE(review): presumably offset16 has been stepped back to the preceding unit (with a
// lower-bound guard) before this read; those lines are not visible — confirm.
479 char lead = source.charAt(offset16);
480 if (isLeadSurrogate(lead)) {
481 return UCharacterProperty.getRawSupplementary(lead, single);
485 return single; // return unmatched surrogate
489 * Determines how many chars this char32 requires. If a validity check is required, use <code>
490 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
491 * on char32 before calling.
494 * the input codepoint.
495 * @return 2 if is in supplementary space, otherwise 1.
// Reports how many UTF-16 code units are needed to represent char32.
498 public static int getCharCount(int char32) {
// Code points below 0x10000 (SUPPLEMENTARY_MIN_VALUE) take this branch.
// NOTE(review): both return statements are not visible in this listing; per the Javadoc the
// result is 1 here and 2 for supplementary code points — confirm.
499 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
506 * Returns the type of the boundaries around the char at offset16. Used for random access.
514 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
515 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
516 * are [offset16, offset16 + 2]
517 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
518 * bounds are [offset16 - 1, offset16 + 1]
520 * For bit-twiddlers, the return values for these are chosen so that the boundaries
521 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
522 * @exception IndexOutOfBoundsException
523 * if offset16 is out of bounds.
// Classifies the code unit at offset16 of a String as a stand-alone unit, the lead half of a
// surrogate pair, or the trail half of a surrogate pair.
// The unit is fetched with String.charAt, so range checking is left to that call.
526 public static int bounds(String source, int offset16) {
527 char ch = source.charAt(offset16);
528 if (isSurrogate(ch)) {
529 if (isLeadSurrogate(ch)) {
// Pre-increment moves to the following unit; the length test keeps the read in range, so a lead
// surrogate at the very end of the string is not reported as a pair.
530 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
531 return LEAD_SURROGATE_BOUNDARY;
534 // isTrailSurrogate(ch), so
// NOTE(review): this test only inspects the *preceding* unit if offset16 was decremented just
// before; that statement is not visible in this listing — confirm.
536 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
537 return TRAIL_SURROGATE_BOUNDARY;
// Non-surrogates and unmatched surrogates both end up here.
541 return SINGLE_CHAR_BOUNDARY;
545 * Returns the type of the boundaries around the char at offset16. Used for random access.
548 * string buffer to analyse
553 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
554 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
555 * are [offset16, offset16 + 2]
556 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
557 * bounds are [offset16 - 1, offset16 + 1]
559 * For bit-twiddlers, the return values for these are chosen so that the boundaries
560 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
561 * @exception IndexOutOfBoundsException
562 * if offset16 is out of bounds.
// StringBuffer overload: classifies the code unit at offset16 as a stand-alone unit, the lead
// half of a surrogate pair, or the trail half of a surrogate pair.
// The unit is fetched with StringBuffer.charAt, so range checking is left to that call.
565 public static int bounds(StringBuffer source, int offset16) {
566 char ch = source.charAt(offset16);
567 if (isSurrogate(ch)) {
568 if (isLeadSurrogate(ch)) {
// Pre-increment moves to the following unit; the length test keeps the read in range, so a lead
// surrogate at the very end of the buffer is not reported as a pair.
569 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
570 return LEAD_SURROGATE_BOUNDARY;
573 // isTrailSurrogate(ch), so
// NOTE(review): this test only inspects the *preceding* unit if offset16 was decremented just
// before; that statement is not visible in this listing — confirm.
575 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
576 return TRAIL_SURROGATE_BOUNDARY;
// Non-surrogates and unmatched surrogates both end up here.
580 return SINGLE_CHAR_BOUNDARY;
584 * Returns the type of the boundaries around the char at offset16. Used for random access. Note
585 * that the boundaries are determined with respect to the subarray, hence the char array
586 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
589 * char array to analyse
591 * offset to substring in the source array for analyzing
593 * offset to substring in the source array for analyzing
595 * UTF16 offset relative to start
598 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
599 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
600 * are [offset16, offset16 + 2]
601 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
602 * bounds are [offset16 - 1, offset16 + 1]
604 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries
605 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
606 * @exception IndexOutOfBoundsException
607 * if offset16 is not within the range of start and limit.
// char[] overload: classifies the code unit at offset16, pairing surrogates only when both halves
// lie inside the sub-array [start, limit).
610 public static int bounds(char source[], int start, int limit, int offset16) {
// NOTE(review): the Javadoc describes offset16 as relative to start, while this check compares
// it with start and limit as an absolute index; presumably a line that adds start to offset16
// precedes it but is not visible in this listing — confirm.
612 if (offset16 < start || offset16 >= limit) {
613 throw new ArrayIndexOutOfBoundsException(offset16);
615 char ch = source[offset16];
616 if (isSurrogate(ch)) {
617 if (isLeadSurrogate(ch)) {
// The limit test keeps a lead surrogate on the last element of the sub-array from being paired
// with a unit outside it.
// NOTE(review): offset16 is presumably advanced by one before this test; not visible — confirm.
619 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
620 return LEAD_SURROGATE_BOUNDARY;
622 } else { // isTrailSurrogate(ch), so
// NOTE(review): offset16 is presumably decremented before this test so that it inspects the
// preceding unit; not visible — confirm.
624 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
625 return TRAIL_SURROGATE_BOUNDARY;
// Non-surrogates and surrogates without a partner inside the sub-array both end up here.
629 return SINGLE_CHAR_BOUNDARY;
633 * Determines whether the code value is a surrogate.
636 * the input character.
637 * @return true iff the input character is a surrogate.
// True when char16 lies in 0xD800..0xDFFF: with the low 11 bits cleared it must equal 0xD800.
640 public static boolean isSurrogate(char char16) {
641 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
645 * Determines whether the character is a trail surrogate.
648 * the input character.
649 * @return true iff the input character is a trail surrogate.
// True when char16 lies in 0xDC00..0xDFFF: with the low 10 bits cleared it must equal 0xDC00.
652 public static boolean isTrailSurrogate(char char16) {
653 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
657 * Determines whether the character is a lead surrogate.
660 * the input character.
661 * @return true iff the input character is a lead surrogate
// True when char16 lies in 0xD800..0xDBFF: with the low 10 bits cleared it must equal 0xD800.
664 public static boolean isLeadSurrogate(char char16) {
665 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
669 * Returns the lead surrogate. If a validity check is required, use
670 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
674 * the input character.
675 * @return lead surrogate if the getCharCount(ch) is 2; <br>
676 * and 0 otherwise (note: 0 is not a valid lead surrogate).
// Computes the lead (first) surrogate of the pair that encodes char32.
679 public static char getLeadSurrogate(int char32) {
// Only code points at or above 0x10000 are encoded with a surrogate pair.
680 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
// NOTE(review): LEAD_SURROGATE_OFFSET_ and LEAD_SURROGATE_SHIFT_ are declared outside this
// listing; their values cannot be checked from here.
681 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
// NOTE(review): the result for non-supplementary input is not visible in this listing; the
// Javadoc states it is 0 — confirm.
687 * Returns the trail surrogate. If a validity check is required, use
688 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
692 * the input character.
693 * @return the trail surrogate if the getCharCount(ch) is 2; <br>
694 * otherwise the character itself
// Computes the trail (second) surrogate of the pair that encodes char32.
697 public static char getTrailSurrogate(int char32) {
// Only code points at or above 0x10000 are encoded with a surrogate pair.
698 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
// NOTE(review): TRAIL_SURROGATE_MASK_ is declared outside this listing; presumably it selects
// the low 10 bits of char32 — confirm.
699 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
// Non-supplementary input is handed back unchanged, narrowed to a char.
701 return (char) char32;
705 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
706 * containing the UTF-32 value in UTF16 format. If a validity check is required, use <code><a
707 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling.
710 * the input character.
711 * @return string value of char32 in UTF16 format
712 * @exception IllegalArgumentException
713 * thrown if char32 is an invalid codepoint.
// Converts a single code point to its UTF-16 String form after validating its range.
716 public static String valueOf(int char32) {
// Rejects anything outside 0 .. 0x10FFFF.
717 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
718 throw new IllegalArgumentException("Illegal codepoint");
// NOTE(review): toString(int) is a helper defined outside this listing; it presumably performs
// the actual conversion without repeating the range check — confirm.
720 return toString(char32);
724 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
725 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
726 * character, the whole supplementary codepoint will be returned. If a validity check is
727 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
728 * codepoint at offset16 before calling. The result returned will be a newly created String
729 * obtained by calling source.substring(..) with the appropriate indexes.
734 * the UTF16 index to the codepoint in source
735 * @return string value of char32 in UTF16 format
// Returns the one- or two-unit substring of source holding the code point that contains offset16.
738 public static String valueOf(String source, int offset16) {
// bounds() says whether offset16 is a stand-alone unit or which half of a surrogate pair it is.
739 switch (bounds(source, offset16)) {
740 case LEAD_SURROGATE_BOUNDARY:
// offset16 is the lead: take it together with the following trail.
741 return source.substring(offset16, offset16 + 2);
742 case TRAIL_SURROGATE_BOUNDARY:
// offset16 is the trail: start one unit earlier so the lead is included.
743 return source.substring(offset16 - 1, offset16 + 1);
// NOTE(review): the label introducing this last return is not visible in this listing —
// presumably the switch's default case (SINGLE_CHAR_BOUNDARY); confirm.
745 return source.substring(offset16, offset16 + 1);
750 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
751 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
752 * surrogate character, the whole supplementary codepoint will be returned. If a validity check
753 * is required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
754 * the codepoint at offset16 before calling. The result returned will be a newly created String
755 * obtained by calling source.substring(..) with the appropriate indexes.
758 * the input string buffer.
760 * the UTF16 index to the codepoint in source
761 * @return string value of char32 in UTF16 format
// StringBuffer overload: returns the one- or two-unit substring holding the code point that
// contains offset16.
764 public static String valueOf(StringBuffer source, int offset16) {
// bounds() says whether offset16 is a stand-alone unit or which half of a surrogate pair it is.
765 switch (bounds(source, offset16)) {
766 case LEAD_SURROGATE_BOUNDARY:
// offset16 is the lead: take it together with the following trail.
767 return source.substring(offset16, offset16 + 2);
768 case TRAIL_SURROGATE_BOUNDARY:
// offset16 is the trail: start one unit earlier so the lead is included.
769 return source.substring(offset16 - 1, offset16 + 1);
// NOTE(review): the label introducing this last return is not visible in this listing —
// presumably the switch's default case (SINGLE_CHAR_BOUNDARY); confirm.
771 return source.substring(offset16, offset16 + 1);
776 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
777 * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
778 * returned, except when either the leading or trailing surrogate character lies out of the
779 * specified subarray. In the latter case, only the surrogate character within bounds will be
780 * returned. If a validity check is required, use <code><a
781 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the codepoint at
782 * offset16 before calling. The result returned will be a newly created String containing the
783 * relevant characters.
786 * the input char array.
788 * start index of the subarray
790 * end index of the subarray
792 * the UTF16 index to the codepoint in source relative to start
793 * @return string value of char32 in UTF16 format
// char[] overload: builds a new one- or two-unit String for the code point at offset16, pairing
// surrogates only when bounds() finds both halves inside the given sub-array.
796 public static String valueOf(char source[], int start, int limit, int offset16) {
797 switch (bounds(source, start, limit, offset16)) {
798 case LEAD_SURROGATE_BOUNDARY:
// offset16 is relative to start here, hence start + offset16 as the absolute array index.
799 return new String(source, start + offset16, 2);
800 case TRAIL_SURROGATE_BOUNDARY:
// offset16 is the trail: begin one element earlier so the lead is included.
801 return new String(source, start + offset16 - 1, 2);
// NOTE(review): the label introducing this last return is not visible in this listing —
// presumably the switch's default case (SINGLE_CHAR_BOUNDARY); confirm.
803 return new String(source, start + offset16, 1);
807 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
808 * the <a name="_top_">class description</a> for notes on roundtripping.
814 * @return UTF-16 offset
815 * @exception IndexOutOfBoundsException
816 * if offset32 is out of bounds.
// Converts a code point index (offset32) into the matching UTF-16 index of a String by walking
// the string from its start.
819 public static int findOffsetFromCodePoint(String source, int offset32) {
// result is the UTF-16 position reached so far; count is the number of code points still to skip.
821 int size = source.length(), result = 0, count = offset32;
// Cheap rejection: a string of size units can never hold more than size code points.
822 if (offset32 < 0 || offset32 > size) {
823 throw new StringIndexOutOfBoundsException(offset32);
825 while (result < size && count > 0) {
826 ch = source.charAt(result);
// A lead surrogate directly followed by a trail surrogate is one code point spanning two units.
// NOTE(review): the declaration of ch and the statements that advance result and decrement
// count are not visible in this listing — confirm.
827 if (isLeadSurrogate(ch) && ((result + 1) < size)
828 && isTrailSurrogate(source.charAt(result + 1))) {
// NOTE(review): the condition guarding this throw is not visible; presumably it fires when the
// string ended before offset32 code points were skipped — confirm.
836 throw new StringIndexOutOfBoundsException(offset32);
842 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
843 * the <a name="_top_">class description</a> for notes on roundtripping.
846 * the UTF-16 string buffer
849 * @return UTF-16 offset
850 * @exception IndexOutOfBoundsException
851 * if offset32 is out of bounds.
// StringBuffer overload: converts a code point index (offset32) into the matching UTF-16 index
// by walking the buffer from its start.
854 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
// result is the UTF-16 position reached so far; count is the number of code points still to skip.
856 int size = source.length(), result = 0, count = offset32;
// Cheap rejection: a buffer of size units can never hold more than size code points.
857 if (offset32 < 0 || offset32 > size) {
858 throw new StringIndexOutOfBoundsException(offset32);
860 while (result < size && count > 0) {
861 ch = source.charAt(result);
// A lead surrogate directly followed by a trail surrogate is one code point spanning two units.
// NOTE(review): the declaration of ch and the statements that advance result and decrement
// count are not visible in this listing — confirm.
862 if (isLeadSurrogate(ch) && ((result + 1) < size)
863 && isTrailSurrogate(source.charAt(result + 1))) {
// NOTE(review): the condition guarding this throw is not visible; presumably it fires when the
// buffer ended before offset32 code points were skipped — confirm.
871 throw new StringIndexOutOfBoundsException(offset32);
877 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
878 * the <a name="_top_">class description</a> for notes on roundtripping.
881 * the UTF-16 char array whose substring is to be analysed
883 * offset of the substring to be analysed
885 * offset of the substring to be analysed
887 * UTF-32 offset relative to start
888 * @return UTF-16 offset relative to start
889 * @exception IndexOutOfBoundsException
890 * if offset32 is out of bounds.
// char[] overload: converts a code point index relative to start into a UTF-16 index relative to
// start, walking the sub-array [start, limit).
893 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
// result is an absolute array index; count is the number of code points still to skip.
895 int result = start, count = offset32;
// Cheap rejection: the sub-array cannot hold more code points than it has elements.
// NOTE(review): unlike the String and StringBuffer overloads, no test for a negative offset32
// is visible here — confirm whether that is intentional.
896 if (offset32 > limit - start) {
897 throw new ArrayIndexOutOfBoundsException(offset32);
899 while (result < limit && count > 0) {
// A lead surrogate directly followed by a trail surrogate (inside the sub-array) is one code point.
// NOTE(review): the declaration/assignment of ch and the statements that advance result and
// decrement count are not visible in this listing — confirm.
901 if (isLeadSurrogate(ch) && ((result + 1) < limit)
902 && isTrailSurrogate(source[result + 1])) {
// NOTE(review): the condition guarding this throw is not visible; presumably it fires when the
// sub-array ended before offset32 code points were skipped — confirm.
910 throw new ArrayIndexOutOfBoundsException(offset32);
// Convert the absolute array index back into an offset relative to start.
912 return result - start;
916 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
917 * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for
918 * notes on roundtripping.<br>
919 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
920 * of the <strong>lead</strong> of the pair is returned. </i>
922 * To find the UTF-32 length of a string, use:
925 * len32 = countCodePoint(source);
934 * UTF-16 offset, at most the source text length.
935 * @return UTF-32 offset
936 * @exception IndexOutOfBoundsException
937 * if offset16 is out of bounds.
// Converts a UTF-16 index of a String into a code point index by counting the code points that
// start before offset16.
940 public static int findCodePointOffset(String source, int offset16) {
// offset16 == source.length() is accepted, so the method can be used to count a whole string.
941 if (offset16 < 0 || offset16 > source.length()) {
942 throw new StringIndexOutOfBoundsException(offset16);
// Remembers whether the previous unit was a lead surrogate still waiting for its trail.
// NOTE(review): the declarations of result and ch are not visible in this listing — confirm.
947 boolean hadLeadSurrogate = false;
949 for (int i = 0; i < offset16; ++i) {
950 ch = source.charAt(i);
951 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
952 hadLeadSurrogate = false; // count valid trail as zero
954 hadLeadSurrogate = isLeadSurrogate(ch);
955 ++result; // count others as 1
// At the very end there is no unit at offset16 to inspect.
// NOTE(review): the body of this branch is not visible — presumably it returns result; confirm.
959 if (offset16 == source.length()) {
963 // end of source being the less significant surrogate character
964 // shift result back to the start of the supplementary character
// offset16 points at the trail half of a pair whose lead was just counted.
// NOTE(review): the adjustment made in this branch and the final return are not visible —
// presumably result is decremented so that the lead's index is reported; confirm.
965 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
973 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
974 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
976 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
977 * of the <strong>lead</strong> of the pair is returned. </i>
979 * To find the UTF-32 length of a string, use:
982 * len32 = countCodePoint(source);
991 * UTF-16 offset, at most the source text length.
992 * @return UTF-32 offset
993 * @exception IndexOutOfBoundsException
994 * if offset16 is out of bounds.
// StringBuffer overload: converts a UTF-16 index into a code point index by counting the code
// points that start before offset16.
997 public static int findCodePointOffset(StringBuffer source, int offset16) {
// offset16 == source.length() is accepted, so the method can be used to count a whole buffer.
998 if (offset16 < 0 || offset16 > source.length()) {
999 throw new StringIndexOutOfBoundsException(offset16);
// Remembers whether the previous unit was a lead surrogate still waiting for its trail.
// NOTE(review): the declarations of result and ch are not visible in this listing — confirm.
1004 boolean hadLeadSurrogate = false;
1006 for (int i = 0; i < offset16; ++i) {
1007 ch = source.charAt(i);
1008 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1009 hadLeadSurrogate = false; // count valid trail as zero
1011 hadLeadSurrogate = isLeadSurrogate(ch);
1012 ++result; // count others as 1
// At the very end there is no unit at offset16 to inspect.
// NOTE(review): the body of this branch is not visible — presumably it returns result; confirm.
1016 if (offset16 == source.length()) {
1020 // end of source being the less significant surrogate character
1021 // shift result back to the start of the supplementary character
// offset16 points at the trail half of a pair whose lead was just counted.
// NOTE(review): the adjustment made in this branch and the final return are not visible —
// presumably result is decremented so that the lead's index is reported; confirm.
1022 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
1030 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
1031 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
1032 * roundtripping.<br>
1033 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
1034 * of the <strong>lead</strong> of the pair is returned. </i>
1036 * To find the UTF-32 length of a substring, use:
1039 * len32 = countCodePoint(source, start, limit);
1048 * offset of the substring
1050 * offset of the substring
1052 * UTF-16 relative to start
1053 * @return UTF-32 offset relative to start
1054 * @exception IndexOutOfBoundsException
1055 * if offset16 is not within the range of start and limit.
// char[] overload: converts a UTF-16 index into a code point index, counting only code points
// inside the sub-array that starts at start.
1058 public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
// NOTE(review): the Javadoc describes offset16 as relative to start, while it is compared with
// limit here and used as an absolute loop bound below; presumably a line that adds start to
// offset16 precedes this check but is not visible — confirm.
// NOTE(review): this array overload throws StringIndexOutOfBoundsException whereas the other
// char[] methods in this class throw ArrayIndexOutOfBoundsException, and no lower-bound test is
// visible — confirm whether both are intentional.
1060 if (offset16 > limit) {
1061 throw new StringIndexOutOfBoundsException(offset16);
// Remembers whether the previous unit was a lead surrogate still waiting for its trail.
// NOTE(review): the declarations of result and ch are not visible in this listing — confirm.
1066 boolean hadLeadSurrogate = false;
1068 for (int i = start; i < offset16; ++i) {
// NOTE(review): the assignment of ch from the array is not visible in this listing.
1070 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
1071 hadLeadSurrogate = false; // count valid trail as zero
1073 hadLeadSurrogate = isLeadSurrogate(ch);
1074 ++result; // count others as 1
// At the end of the sub-array there is no element at offset16 to inspect.
// NOTE(review): the body of this branch is not visible — presumably it returns result; confirm.
1078 if (offset16 == limit) {
1082 // end of source being the less significant surrogate character
1083 // shift result back to the start of the supplementary character
// offset16 points at the trail half of a pair whose lead was just counted.
// NOTE(review): the adjustment made in this branch and the final return are not visible —
// presumably result is decremented so that the lead's index is reported; confirm.
1084 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1092 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
1093 * use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before
1097 * the buffer to append to
1100 * @return the updated StringBuffer
1101 * @exception IllegalArgumentException
1102 * thrown when char32 does not lie within the range of the Unicode codepoints
// Appends the UTF-16 encoding of char32 (one or two code units) to target.
1105 public static StringBuffer append(StringBuffer target, int char32) {
1106 // Check for irregular values
// Only 0 .. 0x10FFFF is accepted; the offending value is reported in hexadecimal.
1107 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1108 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1111 // Write the UTF-16 values
// Supplementary code points are written as lead surrogate followed by trail surrogate.
1112 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1113 target.append(getLeadSurrogate(char32));
1114 target.append(getTrailSurrogate(char32));
// NOTE(review): the else that separates the two cases and the final return are not visible in
// this listing; per the Javadoc the updated target is returned — confirm.
1116 target.append((char) char32);
1122 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1126 * the buffer to append to
1128 * the code point to append
1129 * @return the updated StringBuffer
1130 * @throws IllegalArgumentException
1131 * if cp is not a valid code point
1134 public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1135 return append(target, cp);
1139 * Adds a codepoint to offset16 position of the argument char array.
1142 * char array to be append with the new code point
1144 * UTF16 offset which the codepoint will be appended.
1146 * code point to be appended
1147 * @return offset after char32 in the array.
1148 * @exception IllegalArgumentException
1149 * thrown if there is not enough space for the append, or when char32 does not
1150 * lie within the range of the Unicode codepoints.
1153 public static int append(char[] target, int limit, int char32) {
1154 // Check for irregular values
1155 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1156 throw new IllegalArgumentException("Illegal codepoint");
1158 // Write the UTF-16 values
1159 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1160 target[limit++] = getLeadSurrogate(char32);
1161 target[limit++] = getTrailSurrogate(char32);
1163 target[limit++] = (char) char32;
1169 * Number of codepoints in a UTF16 String
1173 * @return number of codepoint in string
1176 public static int countCodePoint(String source) {
1177 if (source == null || source.length() == 0) {
1180 return findCodePointOffset(source, source.length());
1184 * Number of codepoints in a UTF16 String buffer
1187 * UTF16 string buffer
1188 * @return number of codepoint in string
1191 public static int countCodePoint(StringBuffer source) {
1192 if (source == null || source.length() == 0) {
1195 return findCodePointOffset(source, source.length());
1199 * Number of codepoints in a UTF16 char array substring
1204 * offset of the substring
1206 * offset of the substring
1207 * @return number of codepoint in the substring
1208 * @exception IndexOutOfBoundsException
1209 * if start and limit are not valid.
1212 public static int countCodePoint(char source[], int start, int limit) {
1213 if (source == null || source.length == 0) {
1216 return findCodePointOffset(source, start, limit, limit - start);
1220 * Set a code point into a UTF16 position. Adjusts target according if we are replacing a
1221 * non-supplementary codepoint with a supplementary and vice versa.
1226 * UTF16 position to insert into
1231 public static void setCharAt(StringBuffer target, int offset16, int char32) {
1233 char single = target.charAt(offset16);
1235 if (isSurrogate(single)) {
1236 // pairs of the surrogate with offset16 at the lead char found
1237 if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1238 && isTrailSurrogate(target.charAt(offset16 + 1))) {
1241 // pairs of the surrogate with offset16 at the trail char
1243 if (isTrailSurrogate(single) && (offset16 > 0)
1244 && isLeadSurrogate(target.charAt(offset16 - 1))) {
1250 target.replace(offset16, offset16 + count, valueOf(char32));
1254 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1255 * replacing a non-supplementary codepoint with a supplementary and vice versa.
1260 * numbers of valid chars in target, different from target.length. limit counts the
1261 * number of chars in target that represents a string, not the size of array target.
1263 * UTF16 position to insert into
1266 * @return new number of chars in target that represents a string
1267 * @exception IndexOutOfBoundsException
1268 * if offset16 is out of range
1271 public static int setCharAt(char target[], int limit, int offset16, int char32) {
1272 if (offset16 >= limit) {
1273 throw new ArrayIndexOutOfBoundsException(offset16);
1276 char single = target[offset16];
1278 if (isSurrogate(single)) {
1279 // pairs of the surrogate with offset16 at the lead char found
1280 if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1281 && isTrailSurrogate(target[offset16 + 1])) {
1284 // pairs of the surrogate with offset16 at the trail char
1286 if (isTrailSurrogate(single) && (offset16 > 0)
1287 && isLeadSurrogate(target[offset16 - 1])) {
1294 String str = valueOf(char32);
1296 int strlength = str.length();
1297 target[offset16] = str.charAt(0);
1298 if (count == strlength) {
1300 target[offset16 + 1] = str.charAt(1);
1303 // this is not exact match in space, we'll have to do some
1305 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1306 - (offset16 + count));
1307 if (count < strlength) {
1308 // char32 is a supplementary character trying to squeeze into
1309 // a non-supplementary space
1310 target[offset16 + 1] = str.charAt(1);
1312 if (result < target.length) {
1316 // char32 is a non-supplementary character trying to fill
1317 // into a supplementary space
1326 * Shifts offset16 by the argument number of codepoints
1331 * UTF16 position to shift
1333 * number of codepoints to shift
1334 * @return new shifted offset16
1335 * @exception IndexOutOfBoundsException
1336 * if the new offset16 is out of bounds.
1339 public static int moveCodePointOffset(String source, int offset16, int shift32) {
1340 int result = offset16;
1341 int size = source.length();
1344 if (offset16 < 0 || offset16 > size) {
1345 throw new StringIndexOutOfBoundsException(offset16);
1348 if (shift32 + offset16 > size) {
1349 throw new StringIndexOutOfBoundsException(offset16);
1352 while (result < size && count > 0) {
1353 ch = source.charAt(result);
1354 if (isLeadSurrogate(ch) && ((result + 1) < size)
1355 && isTrailSurrogate(source.charAt(result + 1))) {
1362 if (offset16 + shift32 < 0) {
1363 throw new StringIndexOutOfBoundsException(offset16);
1365 for (count = -shift32; count > 0; count--) {
1370 ch = source.charAt(result);
1371 if (isTrailSurrogate(ch) && result > 0
1372 && isLeadSurrogate(source.charAt(result - 1))) {
1378 throw new StringIndexOutOfBoundsException(shift32);
1384 * Shifts offset16 by the argument number of codepoints
1389 * UTF16 position to shift
1391 * number of codepoints to shift
1392 * @return new shifted offset16
1393 * @exception IndexOutOfBoundsException
1394 * if the new offset16 is out of bounds.
1397 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1398 int result = offset16;
1399 int size = source.length();
1402 if (offset16 < 0 || offset16 > size) {
1403 throw new StringIndexOutOfBoundsException(offset16);
1406 if (shift32 + offset16 > size) {
1407 throw new StringIndexOutOfBoundsException(offset16);
1410 while (result < size && count > 0) {
1411 ch = source.charAt(result);
1412 if (isLeadSurrogate(ch) && ((result + 1) < size)
1413 && isTrailSurrogate(source.charAt(result + 1))) {
1420 if (offset16 + shift32 < 0) {
1421 throw new StringIndexOutOfBoundsException(offset16);
1423 for (count = -shift32; count > 0; count--) {
1428 ch = source.charAt(result);
1429 if (isTrailSurrogate(ch) && result > 0
1430 && isLeadSurrogate(source.charAt(result - 1))) {
1436 throw new StringIndexOutOfBoundsException(shift32);
1442 * Shifts offset16 by the argument number of codepoints within a subarray.
1447 * position of the subarray to be performed on
1449 * position of the subarray to be performed on
1451 * UTF16 position to shift relative to start
1453 * number of codepoints to shift
1454 * @return new shifted offset16 relative to start
1455 * @exception IndexOutOfBoundsException
1456 * if the new offset16 is out of bounds with respect to the subarray or the
1457 * subarray bounds are out of range.
1460 public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1462 int size = source.length;
1465 int result = offset16 + start;
1466 if (start < 0 || limit < start) {
1467 throw new StringIndexOutOfBoundsException(start);
1470 throw new StringIndexOutOfBoundsException(limit);
1472 if (offset16 < 0 || result > limit) {
1473 throw new StringIndexOutOfBoundsException(offset16);
1476 if (shift32 + result > size) {
1477 throw new StringIndexOutOfBoundsException(result);
1480 while (result < limit && count > 0) {
1481 ch = source[result];
1482 if (isLeadSurrogate(ch) && (result + 1 < limit)
1483 && isTrailSurrogate(source[result + 1])) {
1490 if (result + shift32 < start) {
1491 throw new StringIndexOutOfBoundsException(result);
1493 for (count = -shift32; count > 0; count--) {
1495 if (result < start) {
1498 ch = source[result];
1499 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1505 throw new StringIndexOutOfBoundsException(shift32);
1512 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1513 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1514 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1517 * The overall effect is exactly as if the argument were converted to a string by the method
1518 * valueOf(char) and the characters in that string were then inserted into target at the
1519 * position indicated by offset16.
1522 * The offset argument must be greater than or equal to 0, and less than or equal to the length
1526 * string buffer to insert to
1528 * offset which char32 will be inserted in
1530 * codepoint to be inserted
1531 * @return a reference to target
1532 * @exception IndexOutOfBoundsException
1533 * thrown if offset16 is invalid.
1536 public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1537 String str = valueOf(char32);
1538 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1541 target.insert(offset16, str);
1546 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1547 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1548 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1550 * The overall effect is exactly as if the argument were converted to a string by the method
1551 * valueOf(char) and the characters in that string were then inserted into target at the
1552 * position indicated by offset16.
1555 * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1558 * char array to insert to
1560 * end index of the char array, limit <= target.length
1562 * offset which char32 will be inserted in
1564 * codepoint to be inserted
1565 * @return new limit size
1566 * @exception IndexOutOfBoundsException
1567 * thrown if offset16 is invalid.
1570 public static int insert(char target[], int limit, int offset16, int char32) {
1571 String str = valueOf(char32);
1572 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1575 int size = str.length();
1576 if (limit + size > target.length) {
1577 throw new ArrayIndexOutOfBoundsException(offset16 + size);
1579 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1580 target[offset16] = str.charAt(0);
1582 target[offset16 + 1] = str.charAt(1);
1584 return limit + size;
1588 * Removes the codepoint at the specified position in this target (shortening target by 1
1589 * character if the codepoint is a non-supplementary, 2 otherwise).
1592 * string buffer to remove codepoint from
1594 * offset which the codepoint will be removed
1595 * @return a reference to target
1596 * @exception IndexOutOfBoundsException
1597 * thrown if offset16 is invalid.
1600 public static StringBuffer delete(StringBuffer target, int offset16) {
1602 switch (bounds(target, offset16)) {
1603 case LEAD_SURROGATE_BOUNDARY:
1606 case TRAIL_SURROGATE_BOUNDARY:
1611 target.delete(offset16, offset16 + count);
1616 * Removes the codepoint at the specified position in this target (shortening target by 1
1617 * character if the codepoint is a non-supplementary, 2 otherwise).
1620 * string buffer to remove codepoint from
1622 * end index of the char array, limit <= target.length
1624 * offset which the codepoint will be removed
1625 * @return a new limit size
1626 * @exception IndexOutOfBoundsException
1627 * thrown if offset16 is invalid.
1630 public static int delete(char target[], int limit, int offset16) {
1632 switch (bounds(target, 0, limit, offset16)) {
1633 case LEAD_SURROGATE_BOUNDARY:
1636 case TRAIL_SURROGATE_BOUNDARY:
1641 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1642 target[limit - count] = 0;
1643 return limit - count;
1647 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1648 * the argument codepoint. I.e., the smallest index <code>i</code> such that
1649 * <code>UTF16.charAt(source, i) ==
1650 * char32</code> is true.
1652 * If no such character occurs in this string, then -1 is returned.
1656 * UTF16.indexOf("abc", 'a') returns 0<br>
1657 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1658 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1660 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1661 * characters to its fullest.
1664 * UTF16 format Unicode string that will be searched
1666 * codepoint to search for
1667 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1668 * -1 if the codepoint does not occur.
1671 public static int indexOf(String source, int char32) {
1672 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1673 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1675 // non-surrogate bmp
1676 if (char32 < LEAD_SURROGATE_MIN_VALUE
1677 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1678 return source.indexOf((char) char32);
1681 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1682 int result = source.indexOf((char) char32);
1684 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1685 && isTrailSurrogate(source.charAt(result + 1))) {
1686 return indexOf(source, char32, result + 1);
1689 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1690 return indexOf(source, char32, result + 1);
1696 String char32str = toString(char32);
1697 return source.indexOf(char32str);
1701 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1702 * the argument string str. This method is implemented based on codepoints, hence a "lead
1703 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1704 * starts with trail surrogate character at index 0, a source with a leading a surrogate
1705 * character before str found at in source will not have a valid match. Vice versa for lead
1706 * surrogates that ends str. See example below.
1708 * If no such string str occurs in this source, then -1 is returned.
1712 * UTF16.indexOf("abc", "ab") returns 0<br>
1713 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1714 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1716 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1717 * characters to its fullest.
1720 * UTF16 format Unicode string that will be searched
1722 * UTF16 format Unicode string to search for
1723 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1724 * -1 if the codepoint does not occur.
1727 public static int indexOf(String source, String str) {
1728 int strLength = str.length();
1729 // non-surrogate ends
1730 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1731 return source.indexOf(str);
1734 int result = source.indexOf(str);
1735 int resultEnd = result + strLength;
1737 // check last character
1738 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1739 && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1740 return indexOf(source, str, resultEnd + 1);
1742 // check first character which is a trail surrogate
1743 if (isTrailSurrogate(str.charAt(0)) && result > 0
1744 && isLeadSurrogate(source.charAt(result - 1))) {
1745 return indexOf(source, str, resultEnd + 1);
1752 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1753 * the argument codepoint. I.e., the smallest index i such that: <br>
1754 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
1756 * If no such character occurs in this string, then -1 is returned.
1760 * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1761 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1762 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1764 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1765 * characters to its fullest.
1768 * UTF16 format Unicode string that will be searched
1770 * codepoint to search for
1772 * the index to start the search from.
1773 * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1774 * or after fromIndex, or -1 if the codepoint does not occur.
1777 public static int indexOf(String source, int char32, int fromIndex) {
1778 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1779 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1781 // non-surrogate bmp
1782 if (char32 < LEAD_SURROGATE_MIN_VALUE
1783 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1784 return source.indexOf((char) char32, fromIndex);
1787 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1788 int result = source.indexOf((char) char32, fromIndex);
1790 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1791 && isTrailSurrogate(source.charAt(result + 1))) {
1792 return indexOf(source, char32, result + 1);
1795 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1796 return indexOf(source, char32, result + 1);
1802 String char32str = toString(char32);
1803 return source.indexOf(char32str, fromIndex);
1807 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1808 * the argument string str. This method is implemented based on codepoints, hence a "lead
1809 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1810 * starts with trail surrogate character at index 0, a source with a leading a surrogate
1811 * character before str found at in source will not have a valid match. Vice versa for lead
1812 * surrogates that ends str. See example below.
1814 * If no such string str occurs in this source, then -1 is returned.
1818 * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1819 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1820 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1821 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1823 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1824 * characters to its fullest.
1827 * UTF16 format Unicode string that will be searched
1829 * UTF16 format Unicode string to search for
1831 * the index to start the search from.
1832 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1833 * -1 if the codepoint does not occur.
1836 public static int indexOf(String source, String str, int fromIndex) {
1837 int strLength = str.length();
1838 // non-surrogate ends
1839 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1840 return source.indexOf(str, fromIndex);
1843 int result = source.indexOf(str, fromIndex);
1844 int resultEnd = result + strLength;
1846 // check last character
1847 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1848 && isTrailSurrogate(source.charAt(resultEnd))) {
1849 return indexOf(source, str, resultEnd + 1);
1851 // check first character which is a trail surrogate
1852 if (isTrailSurrogate(str.charAt(0)) && result > 0
1853 && isLeadSurrogate(source.charAt(result - 1))) {
1854 return indexOf(source, str, resultEnd + 1);
1861 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1862 * the argument codepoint. I.e., the index returned is the largest value i such that:
1863 * UTF16.charAt(source, i) == char32 is true.
1866 * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1867 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1868 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1871 * source is searched backwards starting at the last character.
1873 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1874 * characters to its fullest.
1877 * UTF16 format Unicode string that will be searched
1879 * codepoint to search for
1880 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1884 public static int lastIndexOf(String source, int char32) {
1885 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1886 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1888 // non-surrogate bmp
1889 if (char32 < LEAD_SURROGATE_MIN_VALUE
1890 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1891 return source.lastIndexOf((char) char32);
1894 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1895 int result = source.lastIndexOf((char) char32);
1897 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1898 && isTrailSurrogate(source.charAt(result + 1))) {
1899 return lastIndexOf(source, char32, result - 1);
1902 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1903 return lastIndexOf(source, char32, result - 1);
1909 String char32str = toString(char32);
1910 return source.lastIndexOf(char32str);
1914 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1915 * the argument string str. This method is implemented based on codepoints, hence a "lead
1916 * surrogate character + trail surrogate character" is treated as one entity.e Hence if the str
1917 * starts with trail surrogate character at index 0, a source with a leading a surrogate
1918 * character before str found at in source will not have a valid match. Vice versa for lead
1919 * surrogates that ends str. See example below.
1922 * UTF16.lastIndexOf("abc", "a") returns 0<br>
1923 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1924 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1927 * source is searched backwards starting at the last character.
1929 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1930 * characters to its fullest.
1933 * UTF16 format Unicode string that will be searched
1935 * UTF16 format Unicode string to search for
1936 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1940 public static int lastIndexOf(String source, String str) {
1941 int strLength = str.length();
1942 // non-surrogate ends
1943 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1944 return source.lastIndexOf(str);
1947 int result = source.lastIndexOf(str);
1949 // check last character
1950 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1951 && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1952 return lastIndexOf(source, str, result - 1);
1954 // check first character which is a trail surrogate
1955 if (isTrailSurrogate(str.charAt(0)) && result > 0
1956 && isLeadSurrogate(source.charAt(result - 1))) {
1957 return lastIndexOf(source, str, result - 1);
1965 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1966 * the argument codepoint, where the result is less than or equals to fromIndex.
1969 * This method is implemented based on codepoints, hence a single surrogate character will not
1970 * match a supplementary character.
1973 * source is searched backwards starting at the last character starting at the specified index.
1977 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1978 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1979 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1980 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1981 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1983 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1984 * characters to its fullest.
1987 * UTF16 format Unicode string that will be searched
1989 * codepoint to search for
1991 * the index to start the search from. There is no restriction on the value of
1992 * fromIndex. If it is greater than or equal to the length of this string, it has the
1993 * same effect as if it were equal to one less than the length of this string: this
1994 * entire string may be searched. If it is negative, it has the same effect as if it
1995 * were -1: -1 is returned.
1996 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
2000 public static int lastIndexOf(String source, int char32, int fromIndex) {
2001 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
2002 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
2004 // non-surrogate bmp
2005 if (char32 < LEAD_SURROGATE_MIN_VALUE
2006 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
2007 return source.lastIndexOf((char) char32, fromIndex);
2010 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
2011 int result = source.lastIndexOf((char) char32, fromIndex);
2013 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
2014 && isTrailSurrogate(source.charAt(result + 1))) {
2015 return lastIndexOf(source, char32, result - 1);
2018 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
2019 return lastIndexOf(source, char32, result - 1);
2025 String char32str = toString(char32);
2026 return source.lastIndexOf(char32str, fromIndex);
2031 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
2032 * the argument string str, where the result is less than or equals to fromIndex.
2035 * This method is implemented based on codepoints, hence a "lead surrogate character + trail
2036 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
2037 * character at index 0, a source with a leading a surrogate character before str found at in
2038 * source will not have a valid match. Vice versa for lead surrogates that ends str.
2040 * See example below.
2043 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
2044 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
2045 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
2046 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
2047 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
2050 * source is searched backwards starting at the last character.
2052 * Note this method is provided as support to jdk 1.3, which does not support supplementary
2053 * characters to its fullest.
2056 * UTF16 format Unicode string that will be searched
2058 * UTF16 format Unicode string to search for
2060 * the index to start the search from. There is no restriction on the value of
2061 * fromIndex. If it is greater than or equal to the length of this string, it has the
2062 * same effect as if it were equal to one less than the length of this string: this
2063 * entire string may be searched. If it is negative, it has the same effect as if it
2064 * were -1: -1 is returned.
2065 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
2069 public static int lastIndexOf(String source, String str, int fromIndex) {
2070 int strLength = str.length();
2071 // non-surrogate ends
2072 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
2073 return source.lastIndexOf(str, fromIndex);
2076 int result = source.lastIndexOf(str, fromIndex);
2078 // check last character
2079 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
2080 && isTrailSurrogate(source.charAt(result + strLength))) {
2081 return lastIndexOf(source, str, result - 1);
2083 // check first character which is a trail surrogate
2084 if (isTrailSurrogate(str.charAt(0)) && result > 0
2085 && isLeadSurrogate(source.charAt(result - 1))) {
2086 return lastIndexOf(source, str, result - 1);
2093 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
2094 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
2095 * format Unicode string source, then source will be returned. Otherwise, a new String object is
2096 * created that represents a codepoint sequence identical to the codepoint sequence represented
2097 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
2101 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
2102 * returns "mosquito in your collar"<br>
2103 * UTF16.replace("JonL", 'q', 'x');<br>
2104 * returns "JonL" (no change)<br>
2105 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
2106 * returns "Supplementary character !"<br>
2107 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
2108 * returns "Supplementary character \ud800\udc00"<br>
2110 * Note this method is provided as support to jdk 1.3, which does not support supplementary
2111 * characters to its fullest.
2114 * UTF16 format Unicode string which the codepoint replacements will be based on.
2116 * non-zero old codepoint to be replaced.
2118 * the new codepoint to replace oldChar32
2119 * @return new String derived from source by replacing every occurrence of oldChar32 with
2120 * newChar32, unless when no oldChar32 is found in source then source will be returned.
2123 public static String replace(String source, int oldChar32, int newChar32) {
2124 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
2125 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
2127 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
2128 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
2131 int index = indexOf(source, oldChar32);
2135 String newChar32Str = toString(newChar32);
2136 int oldChar32Size = 1;
2137 int newChar32Size = newChar32Str.length();
2138 StringBuffer result = new StringBuffer(source);
2139 int resultIndex = index;
2141 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
2145 while (index != -1) {
2146 int endResultIndex = resultIndex + oldChar32Size;
2147 result.replace(resultIndex, endResultIndex, newChar32Str);
2148 int lastEndIndex = index + oldChar32Size;
2149 index = indexOf(source, oldChar32, lastEndIndex);
2150 resultIndex += newChar32Size + index - lastEndIndex;
2152 return result.toString();
2156 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
2157 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
2158 * source, then source will be returned. Otherwise, a new String object is created that
2159 * represents a codepoint sequence identical to the codepoint sequence represented by source,
2160 * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2163 * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2164 * returns "mosquito in your collar"<br>
2165 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2166 * returns "cat in your cellar"<br>
2167 * UTF16.replace("JonL", "q", "x");<br>
2168 * returns "JonL" (no change)<br>
2169 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br>
2170 * returns "Supplementary character !"<br>
2171 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br>
2172 * returns "Supplementary character \ud800\udc00"<br>
2174 * Note this method is provided as support to jdk 1.3, which does not support supplementary
2175 * characters to its fullest.
2178 * UTF16 format Unicode string which the replacements will be based on.
2180 * non-zero-length string to be replaced.
2182 * the new string to replace oldStr
2183 * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2184 * When no oldStr is found in source, then source will be returned.
2187 public static String replace(String source, String oldStr, String newStr) {
2188 int index = indexOf(source, oldStr);
2192 int oldStrSize = oldStr.length();
2193 int newStrSize = newStr.length();
2194 StringBuffer result = new StringBuffer(source);
2195 int resultIndex = index;
2197 while (index != -1) {
2198 int endResultIndex = resultIndex + oldStrSize;
2199 result.replace(resultIndex, endResultIndex, newStr);
2200 int lastEndIndex = index + oldStrSize;
2201 index = indexOf(source, oldStr, lastEndIndex);
2202 resultIndex += newStrSize + index - lastEndIndex;
2204 return result.toString();
2208 * Reverses a UTF16 format Unicode string and replaces source's content with it. This method
2209 * will reverse surrogate characters correctly, instead of blindly reversing every character.
2212 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
2213 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
2216 * the source StringBuffer that contains UTF16 format Unicode string to be reversed
2217 * @return a modified source with reversed UTF16 format Unicode string.
2220 public static StringBuffer reverse(StringBuffer source) {
2221 int length = source.length();
2222 StringBuffer result = new StringBuffer(length);
2223 for (int i = length; i-- > 0;) {
2224 char ch = source.charAt(i);
2225 if (isTrailSurrogate(ch) && i > 0) {
2226 char ch2 = source.charAt(i - 1);
2227 if (isLeadSurrogate(ch2)) {
2240 * Check if the string contains more Unicode code points than a certain number. This is more
2241 * efficient than counting all code points in the entire string and comparing that number with a
2242 * threshold. This function may not need to scan the string at all if the length is within a
2243 * certain range, and never needs to count more than 'number + 1' code points. Logically
2244 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
2250 * The number of code points in the string is compared against the 'number'
2252 * @return boolean value for whether the string contains more Unicode code points than 'number'.
2255 public static boolean hasMoreCodePointsThan(String source, int number) {
2259 if (source == null) {
2262 int length = source.length();
2264 // length >= 0 known
2265 // source contains at least (length + 1) / 2 code points: <= 2
2267 if (((length + 1) >> 1) > number) {
2271 // check if source does not even contain enough chars
2272 int maxsupplementary = length - number;
2273 if (maxsupplementary <= 0) {
2277 // there are maxsupplementary = length - number more chars than
2278 // asked-for code points
2280 // count code points until they exceed and also check that there are
2281 // no more than maxsupplementary supplementary code points (char pairs)
2290 if (isLeadSurrogate(source.charAt(start++)) && start != length
2291 && isTrailSurrogate(source.charAt(start))) {
2293 if (--maxsupplementary <= 0) {
2294 // too many pairs - too few code points
2303 * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2304 * code points than a certain number. This is more efficient than counting all code points in
2305 * the entire char array range and comparing that number with a threshold. This function may not
2306 * need to scan the char array at all if start and limit is within a certain range, and never
2307 * needs to count more than 'number + 1' code points. Logically equivalent to
2308 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
2309 * or two code units.
2312 * array of UTF-16 chars
2314 * offset to substring in the source array for analyzing
2316 * offset to substring in the source array for analyzing
2318 * The number of code points in the string is compared against the 'number'
2320 * @return boolean value for whether the string contains more Unicode code points than 'number'.
2321 * @exception IndexOutOfBoundsException
2322 * thrown when limit < start
2325 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2326 int length = limit - start;
2327 if (length < 0 || start < 0 || limit < 0) {
2328 throw new IndexOutOfBoundsException(
2329 "Start and limit indexes should be non-negative and start <= limit");
2334 if (source == null) {
2338 // length >= 0 known
2339 // source contains at least (length + 1) / 2 code points: <= 2
2341 if (((length + 1) >> 1) > number) {
2345 // check if source does not even contain enough chars
2346 int maxsupplementary = length - number;
2347 if (maxsupplementary <= 0) {
2351 // there are maxsupplementary = length - number more chars than
2352 // asked-for code points
2354 // count code points until they exceed and also check that there are
2355 // no more than maxsupplementary supplementary code points (char pairs)
2363 if (isLeadSurrogate(source[start++]) && start != limit
2364 && isTrailSurrogate(source[start])) {
2366 if (--maxsupplementary <= 0) {
2367 // too many pairs - too few code points
2376 * Check if the string buffer contains more Unicode code points than a certain number. This is
2377 * more efficient than counting all code points in the entire string buffer and comparing that
2378 * number with a threshold. This function may not need to scan the string buffer at all if the
2379 * length is within a certain range, and never needs to count more than 'number + 1' code
2380 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
2381 * either one or two code units.
2384 * The input string buffer.
2386 * The number of code points in the string buffer is compared against the 'number'
2388 * @return boolean value for whether the string buffer contains more Unicode code points than
2392 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2396 if (source == null) {
2399 int length = source.length();
2401 // length >= 0 known
2402 // source contains at least (length + 1) / 2 code points: <= 2
2404 if (((length + 1) >> 1) > number) {
2408 // check if source does not even contain enough chars
2409 int maxsupplementary = length - number;
2410 if (maxsupplementary <= 0) {
2414 // there are maxsupplementary = length - number more chars than
2415 // asked-for code points
2417 // count code points until they exceed and also check that there are
2418 // no more than maxsupplementary supplementary code points (char pairs)
2427 if (isLeadSurrogate(source.charAt(start++)) && start != length
2428 && isTrailSurrogate(source.charAt(start))) {
2430 if (--maxsupplementary <= 0) {
2431 // too many pairs - too few code points
2440 * Cover JDK 1.5 API. Create a String from an array of codePoints.
2445 * the start of the text in the code point array
2447 * the number of code points
2448 * @return a String representing the code points between offset and count
2449 * @throws IllegalArgumentException
2450 * if an invalid code point is encountered
2451 * @throws IndexOutOfBoundsException
2452 * if the offset or count are out of bounds.
2455 public static String newString(int[] codePoints, int offset, int count) {
2457 throw new IllegalArgumentException();
2459 char[] chars = new char[count];
2461 for (int r = offset, e = offset + count; r < e; ++r) {
2462 int cp = codePoints[r];
2463 if (cp < 0 || cp > 0x10ffff) {
2464 throw new IllegalArgumentException();
2468 if (cp < 0x010000) {
2469 chars[w] = (char) cp;
2472 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2473 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2477 } catch (IndexOutOfBoundsException ex) {
2478 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2479 / (r - offset + 1)));
2480 char[] temp = new char[newlen];
2481 System.arraycopy(chars, 0, temp, 0, w);
2486 return new String(chars, 0, w);
2491 * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2495 * <li> Code point comparison or code unit comparison
2496 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2497 * with special handling for character 'i'.
2500 * The code unit or code point comparison differ only when comparing supplementary code points
2501 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e.,
2502 * \ue000..\uffff). In code unit comparison, high BMP code points sort after
2503 * supplementary code points because they are stored as pairs of surrogates which are at
2504 * \ud800..\udfff.
2507 * @see #FOLD_CASE_DEFAULT
2508 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2511 public static final class StringComparator implements java.util.Comparator {
2512 // public constructor ------------------------------------------------
2515 * Default constructor that does code unit comparison and case sensitive comparison.
2519 public StringComparator() {
2520 this(false, false, FOLD_CASE_DEFAULT);
2524 * Constructor that does comparison based on the argument options.
2526 * @param codepointcompare
2527 * flag to indicate true for code point comparison or false for code unit
2530 * false for case sensitive comparison, true for case-insensitive comparison
2531 * @param foldcaseoption
2532 * FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2533 * when ignorecase is set to true. If ignorecase is false, this option is
2535 * @see #FOLD_CASE_DEFAULT
2536 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2537 * @throws IllegalArgumentException
2538 * if foldcaseoption is out of range
2541 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2542 setCodePointCompare(codepointcompare);
2543 m_ignoreCase_ = ignorecase;
2544 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2545 throw new IllegalArgumentException("Invalid fold case option");
2547 m_foldCase_ = foldcaseoption;
2550 // public data member ------------------------------------------------
2554 * Option value for case folding comparison:
2557 * Comparison is case insensitive, strings are folded using default mappings defined in
2558 * Unicode data file CaseFolding.txt, before comparison.
2563 public static final int FOLD_CASE_DEFAULT = 0;
2567 * Option value for case folding comparison:
2570 * Comparison is case insensitive, strings are folded using modified mappings defined in
2571 * Unicode data file CaseFolding.txt, before comparison.
2574 * The modified set of mappings is provided in a Unicode data file CaseFolding.txt to handle
2575 * dotted I and dotless i appropriately for Turkic languages (tr, az).
2578 * Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that are to be
2579 * included for default mappings and excluded for the Turkic-specific mappings.
2582 * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that are to be
2583 * excluded for default mappings and included for the Turkic-specific mappings.
2588 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2590 // public methods ----------------------------------------------------
2592 // public setters ----------------------------------------------------
2595 * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2596 * is set to code unit compare
2599 * true for code point compare, false for code unit compare
2602 public void setCodePointCompare(boolean flag) {
2604 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2606 m_codePointCompare_ = 0;
2611 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2612 * case sensitive comparison mode if set to false.
2615 * true for case-insitive comparison, false for case sensitive comparison
2616 * @param foldcaseoption
2617 * FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2618 * when ignorecase is set to true. If ignorecase is false, this option is
2620 * @see #FOLD_CASE_DEFAULT
2621 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2624 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2625 m_ignoreCase_ = ignorecase;
2626 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2627 throw new IllegalArgumentException("Invalid fold case option");
2629 m_foldCase_ = foldcaseoption;
2632 // public getters ----------------------------------------------------
2635 * Checks if the comparison mode is code point compare.
2637 * @return true for code point compare, false for code unit compare
2640 public boolean getCodePointCompare() {
2641 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2645 * Checks if Comparator is in the case insensitive mode.
2647 * @return true if Comparator performs case insensitive comparison, false otherwise
2650 public boolean getIgnoreCase() {
2651 return m_ignoreCase_;
2655 * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2657 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2658 * @see #FOLD_CASE_DEFAULT
2659 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2662 public int getIgnoreCaseOption() {
2666 // public other methods ----------------------------------------------
2669 * Compare two strings depending on the options selected during construction.
2672 * first source string.
2674 * second source string.
2675 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b,
2676 * a positive value is returned.
2677 * @exception ClassCastException
2678 * thrown when either a or b is not a String object
2681 public int compare(Object a, Object b) {
2682 String str1 = (String) a;
2683 String str2 = (String) b;
2695 if (m_ignoreCase_) {
2696 return compareCaseInsensitive(str1, str2);
2698 return compareCaseSensitive(str1, str2);
2701 // private data member ----------------------------------------------
2704 * Code unit comparison flag. True if code unit comparison is required. False if code point
2705 * comparison is required.
2707 private int m_codePointCompare_;
2710 * Fold case comparison option.
2712 private int m_foldCase_;
2715 * Flag indicator if ignore case is to be used during comparison
2717 private boolean m_ignoreCase_;
2720 * Code point order offset for surrogate characters
2722 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2724 // private method ---------------------------------------------------
2727 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2731 * first string to compare
2733 * second string to compare
2734 * @return -1 is s1 < s2, 0 if equals,
2736 private int compareCaseInsensitive(String s1, String s2) {
2737 return NormalizerImpl.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2738 | Normalizer.COMPARE_IGNORE_CASE);
2742 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2746 * first string to compare
2748 * second string to compare
2749 * @return -1 is s1 < s2, 0 if equals,
2751 private int compareCaseSensitive(String s1, String s2) {
2752 // compare identical prefixes - they do not need to be fixed up
2753 // limit1 = start1 + min(lenght1, length2)
2754 int length1 = s1.length();
2755 int length2 = s2.length();
2756 int minlength = length1;
2758 if (length1 < length2) {
2760 } else if (length1 > length2) {
2762 minlength = length2;
2768 for (; index < minlength; index++) {
2769 c1 = s1.charAt(index);
2770 c2 = s2.charAt(index);
2771 // check pseudo-limit
2777 if (index == minlength) {
2781 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2782 // if both values are in or above the surrogate range, fix them up
2783 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2784 && codepointcompare) {
2785 // subtract 0x2800 from BMP code points to make them smaller
2786 // than supplementary ones
2787 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2788 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2789 // part of a surrogate pair, leave >=d800
2791 // BMP code point - may be surrogate code point - make
2793 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2796 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2797 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2798 // part of a surrogate pair, leave >=d800
2800 // BMP code point - may be surrogate code point - make <d800
2801 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2805 // now c1 and c2 are in UTF-32-compatible order
2810 // private data members -------------------------------------------------
2813 * Shift value for lead surrogate to form a supplementary character.
2815 private static final int LEAD_SURROGATE_SHIFT_ = 10;
2818 * Mask to retrieve the significant value from a trail surrogate.
2820 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
2823 * Value that all lead surrogate starts with
2825 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
2826 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2828 // private methods ------------------------------------------------------
2832 * Converts argument code point and returns a String object representing the code point's value
2836 * This method does not check for the validity of the codepoint, the results are not guaranteed
2837 * if a invalid codepoint is passed as argument.
2840 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2845 * @return string representation of the code point
2847 private static String toString(int ch) {
2848 if (ch < SUPPLEMENTARY_MIN_VALUE) {
2849 return String.valueOf((char) ch);
2852 StringBuffer result = new StringBuffer();
2853 result.append(getLeadSurrogate(ch));
2854 result.append(getTrailSurrogate(ch));
2855 return result.toString();