2 *******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 package com.ibm.icu.text;
10 import com.ibm.icu.impl.UCharacterProperty;
14 * Standalone utility class providing UTF16 character conversions and indexing conversions.
17 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
18 * so searching for strings is a safe operation. Similarly, concatenation is always safe.
19 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
20 * values for start and end are on those boundaries, since they arose from operations like
21 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
23 * <strong>Examples:</strong>
25 * The following examples illustrate use of some of these methods.
28 * // iteration forwards: Original
29 * for (int i = 0; i < s.length(); ++i) {
30 * char ch = s.charAt(i);
31 * doSomethingWith(ch);
34 * // iteration forwards: Changes for UTF-32
36 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
37 * ch = UTF16.charAt(s, i);
38 * doSomethingWith(ch);
41 * // iteration backwards: Original
42 * for (int i = s.length() - 1; i >= 0; --i) {
43 * char ch = s.charAt(i);
44 * doSomethingWith(ch);
47 * // iteration backwards: Changes for UTF-32
49 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
50 * ch = UTF16.charAt(s, i);
51 * doSomethingWith(ch);
55 * <strong>Notes:</strong>
57 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
58 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
59 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
60 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
61 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
62 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
63 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
64 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
66 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
67 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
68 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
69 * check for validity if desired. </li>
70 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
71 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
72 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
74 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
75 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
76 * percentage of all the text in the world, the singleton case should always be optimized for. </li>
79 * @author Mark Davis, with help from Markus Scherer
83 public final class UTF16 {
84 // public variables ---------------------------------------------------
87 * Value returned in <code><a href="#bounds(java.lang.String, int)">
88 * bounds()</a></code>.
89 * These values are chosen specifically so that it actually represents the position of the
90 * character [offset16 - (value >> 2), offset16 + (value & 3)]
94 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
95 TRAIL_SURROGATE_BOUNDARY = 5;
98 * The lowest Unicode code point value.
102 public static final int CODEPOINT_MIN_VALUE = 0;
105 * The highest Unicode code point value (scalar value) according to the Unicode Standard.
109 public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
112 * The minimum value for Supplementary code points
116 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
119 * Lead surrogate minimum value
123 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
126 * Trail surrogate minimum value
130 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
133 * Lead surrogate maximum value
137 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
140 * Trail surrogate maximum value
144 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
147 * Surrogate minimum value
151 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
154 * Maximum surrogate value
158 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
161 * Lead surrogate bitmask
163 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
166 * Trail surrogate bitmask
168 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
173 private static final int SURROGATE_BITMASK = 0xFFFFF800;
176 * Lead surrogate bits
178 private static final int LEAD_SURROGATE_BITS = 0xD800;
181 * Trail surrogate bits
183 private static final int TRAIL_SURROGATE_BITS = 0xDC00;
188 private static final int SURROGATE_BITS = 0xD800;
190 // constructor --------------------------------------------------------
194 * Prevent instance from being created.
200 // public method ------------------------------------------------------
203 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
204 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
205 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
206 * UCharacter.isLegal()</a></code>
207 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
208 * character will be returned. If a complete supplementary character is not found the incomplete
209 * character will be returned
211 * @param source Array of UTF-16 chars
212 * @param offset16 UTF-16 offset to the start of the character.
213 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
214 * of that codepoint are the same as in <code>bounds32()</code>.
215 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
218 public static int charAt(String source, int offset16) {
219 char single = source.charAt(offset16);
220 if (single < LEAD_SURROGATE_MIN_VALUE) {
223 return _charAt(source, offset16, single);
226 private static int _charAt(String source, int offset16, char single) {
227 if (single > TRAIL_SURROGATE_MAX_VALUE) {
231 // Convert the UTF-16 surrogate pair if necessary.
232 // For simplicity in usage, and because the frequency of pairs is
233 // low, look both directions.
235 if (single <= LEAD_SURROGATE_MAX_VALUE) {
237 if (source.length() != offset16) {
238 char trail = source.charAt(offset16);
239 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
240 return UCharacterProperty.getRawSupplementary(single, trail);
246 // single is a trail surrogate so
247 char lead = source.charAt(offset16);
248 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
249 return UCharacterProperty.getRawSupplementary(lead, single);
253 return single; // return unmatched surrogate
257 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
258 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
259 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
260 * UCharacter.isLegal()</a></code>
261 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
262 * character will be returned. If a complete supplementary character is not found the incomplete
263 * character will be returned
265 * @param source Array of UTF-16 chars
266 * @param offset16 UTF-16 offset to the start of the character.
267 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
268 * of that codepoint are the same as in <code>bounds32()</code>.
269 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
272 public static int charAt(CharSequence source, int offset16) {
273 char single = source.charAt(offset16);
274 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
277 return _charAt(source, offset16, single);
280 private static int _charAt(CharSequence source, int offset16, char single) {
281 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
285 // Convert the UTF-16 surrogate pair if necessary.
286 // For simplicity in usage, and because the frequency of pairs is
287 // low, look both directions.
289 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
291 if (source.length() != offset16) {
292 char trail = source.charAt(offset16);
293 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
294 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
295 return UCharacterProperty.getRawSupplementary(single, trail);
301 // single is a trail surrogate so
302 char lead = source.charAt(offset16);
303 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
304 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
305 return UCharacterProperty.getRawSupplementary(lead, single);
309 return single; // return unmatched surrogate
313 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
314 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
315 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
317 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
318 * character will be returned. If a complete supplementary character is not found the incomplete
319 * character will be returned
321 * @param source UTF-16 chars string buffer
322 * @param offset16 UTF-16 offset to the start of the character.
323 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
324 * of that codepoint are the same as in <code>bounds32()</code>.
325 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
328 public static int charAt(StringBuffer source, int offset16) {
329 if (offset16 < 0 || offset16 >= source.length()) {
330 throw new StringIndexOutOfBoundsException(offset16);
333 char single = source.charAt(offset16);
334 if (!isSurrogate(single)) {
338 // Convert the UTF-16 surrogate pair if necessary.
339 // For simplicity in usage, and because the frequency of pairs is
340 // low, look both directions.
342 if (single <= LEAD_SURROGATE_MAX_VALUE) {
344 if (source.length() != offset16) {
345 char trail = source.charAt(offset16);
346 if (isTrailSurrogate(trail))
347 return UCharacterProperty.getRawSupplementary(single, trail);
352 // single is a trail surrogate so
353 char lead = source.charAt(offset16);
354 if (isLeadSurrogate(lead)) {
355 return UCharacterProperty.getRawSupplementary(lead, single);
359 return single; // return unmatched surrogate
363 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
364 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
365 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
367 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
368 * character will be returned. If a complete supplementary character is not found the incomplete
369 * character will be returned
371 * @param source Array of UTF-16 chars
372 * @param start Offset to substring in the source array for analyzing
373 * @param limit Offset to substring in the source array for analyzing
374 * @param offset16 UTF-16 offset relative to start
375 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
376 * of that codepoint are the same as in <code>bounds32()</code>.
377 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
380 public static int charAt(char source[], int start, int limit, int offset16) {
382 if (offset16 < start || offset16 >= limit) {
383 throw new ArrayIndexOutOfBoundsException(offset16);
386 char single = source[offset16];
387 if (!isSurrogate(single)) {
391 // Convert the UTF-16 surrogate pair if necessary.
392 // For simplicity in usage, and because the frequency of pairs is
393 // low, look both directions.
394 if (single <= LEAD_SURROGATE_MAX_VALUE) {
396 if (offset16 >= limit) {
399 char trail = source[offset16];
400 if (isTrailSurrogate(trail)) {
401 return UCharacterProperty.getRawSupplementary(single, trail);
403 } else { // isTrailSurrogate(single), so
404 if (offset16 == start) {
408 char lead = source[offset16];
409 if (isLeadSurrogate(lead))
410 return UCharacterProperty.getRawSupplementary(lead, single);
412 return single; // return unmatched surrogate
416 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
417 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
418 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
420 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
421 * character will be returned. If a complete supplementary character is not found the incomplete
422 * character will be returned
424 * @param source UTF-16 chars string buffer
425 * @param offset16 UTF-16 offset to the start of the character.
426 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
427 * of that codepoint are the same as in <code>bounds32()</code>.
428 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
431 public static int charAt(Replaceable source, int offset16) {
432 if (offset16 < 0 || offset16 >= source.length()) {
433 throw new StringIndexOutOfBoundsException(offset16);
436 char single = source.charAt(offset16);
437 if (!isSurrogate(single)) {
441 // Convert the UTF-16 surrogate pair if necessary.
442 // For simplicity in usage, and because the frequency of pairs is
443 // low, look both directions.
445 if (single <= LEAD_SURROGATE_MAX_VALUE) {
447 if (source.length() != offset16) {
448 char trail = source.charAt(offset16);
449 if (isTrailSurrogate(trail))
450 return UCharacterProperty.getRawSupplementary(single, trail);
455 // single is a trail surrogate so
456 char lead = source.charAt(offset16);
457 if (isLeadSurrogate(lead)) {
458 return UCharacterProperty.getRawSupplementary(lead, single);
462 return single; // return unmatched surrogate
466 * Determines how many chars this char32 requires. If a validity check is required, use <code>
467 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
468 * on char32 before calling.
470 * @param char32 The input codepoint.
471 * @return 2 if is in supplementary space, otherwise 1.
474 public static int getCharCount(int char32) {
475 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
482 * Returns the type of the boundaries around the char at offset16. Used for random access.
484 * @param source Text to analyse
485 * @param offset16 UTF-16 offset
488 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
489 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
490 * are [offset16, offset16 + 2]
491 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
492 * bounds are [offset16 - 1, offset16 + 1]
494 * For bit-twiddlers, the return values for these are chosen so that the boundaries
495 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
496 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
499 public static int bounds(String source, int offset16) {
500 char ch = source.charAt(offset16);
501 if (isSurrogate(ch)) {
502 if (isLeadSurrogate(ch)) {
503 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
504 return LEAD_SURROGATE_BOUNDARY;
507 // isTrailSurrogate(ch), so
509 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
510 return TRAIL_SURROGATE_BOUNDARY;
514 return SINGLE_CHAR_BOUNDARY;
518 * Returns the type of the boundaries around the char at offset16. Used for random access.
520 * @param source String buffer to analyse
521 * @param offset16 UTF16 offset
524 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
525 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
526 * are [offset16, offset16 + 2]
527 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
528 * bounds are [offset16 - 1, offset16 + 1]
530 * For bit-twiddlers, the return values for these are chosen so that the boundaries
531 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
532 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
535 public static int bounds(StringBuffer source, int offset16) {
536 char ch = source.charAt(offset16);
537 if (isSurrogate(ch)) {
538 if (isLeadSurrogate(ch)) {
539 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
540 return LEAD_SURROGATE_BOUNDARY;
543 // isTrailSurrogate(ch), so
545 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
546 return TRAIL_SURROGATE_BOUNDARY;
550 return SINGLE_CHAR_BOUNDARY;
554 * Returns the type of the boundaries around the char at offset16. Used for random access. Note
555 * that the boundaries are determined with respect to the subarray, hence the char array
556 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
558 * @param source Char array to analyse
559 * @param start Offset to substring in the source array for analyzing
560 * @param limit Offset to substring in the source array for analyzing
561 * @param offset16 UTF16 offset relative to start
564 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
565 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
566 * are [offset16, offset16 + 2]
567 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
568 * bounds are [offset16 - 1, offset16 + 1]
570 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries
571 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
572 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
575 public static int bounds(char source[], int start, int limit, int offset16) {
577 if (offset16 < start || offset16 >= limit) {
578 throw new ArrayIndexOutOfBoundsException(offset16);
580 char ch = source[offset16];
581 if (isSurrogate(ch)) {
582 if (isLeadSurrogate(ch)) {
584 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
585 return LEAD_SURROGATE_BOUNDARY;
587 } else { // isTrailSurrogate(ch), so
589 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
590 return TRAIL_SURROGATE_BOUNDARY;
594 return SINGLE_CHAR_BOUNDARY;
598 * Determines whether the code value is a surrogate.
600 * @param char16 The input character.
601 * @return true If the input character is a surrogate.
604 public static boolean isSurrogate(char char16) {
605 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
609 * Determines whether the character is a trail surrogate.
611 * @param char16 The input character.
612 * @return true If the input character is a trail surrogate.
615 public static boolean isTrailSurrogate(char char16) {
616 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
620 * Determines whether the character is a lead surrogate.
622 * @param char16 The input character.
623 * @return true If the input character is a lead surrogate
626 public static boolean isLeadSurrogate(char char16) {
627 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
631 * Returns the lead surrogate. If a validity check is required, use
632 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
635 * @param char32 The input character.
636 * @return lead surrogate if the getCharCount(ch) is 2; <br>
637 * and 0 otherwise (note: 0 is not a valid lead surrogate).
640 public static char getLeadSurrogate(int char32) {
641 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
642 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
648 * Returns the trail surrogate. If a validity check is required, use
649 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
652 * @param char32 The input character.
653 * @return the trail surrogate if the getCharCount(ch) is 2; <br>
654 * otherwise the character itself
657 public static char getTrailSurrogate(int char32) {
658 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
659 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
661 return (char) char32;
665 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
666 * containing the UTF-32 value in UTF16 format. If a validity check is required, use <a
667 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling.
669 * @param char32 The input character.
670 * @return string value of char32 in UTF16 format
671 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint.
674 public static String valueOf(int char32) {
675 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
676 throw new IllegalArgumentException("Illegal codepoint");
678 return toString(char32);
682 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
683 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
684 * character, the whole supplementary codepoint will be returned. If a validity check is
685 * required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
686 * codepoint at offset16 before calling. The result returned will be a newly created String
687 * obtained by calling source.substring(..) with the appropriate indexes.
689 * @param source The input string.
690 * @param offset16 The UTF16 index to the codepoint in source
691 * @return string value of char32 in UTF16 format
694 public static String valueOf(String source, int offset16) {
695 switch (bounds(source, offset16)) {
696 case LEAD_SURROGATE_BOUNDARY:
697 return source.substring(offset16, offset16 + 2);
698 case TRAIL_SURROGATE_BOUNDARY:
699 return source.substring(offset16 - 1, offset16 + 1);
701 return source.substring(offset16, offset16 + 1);
706 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
707 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
708 * surrogate character, the whole supplementary codepoint will be returned. If a validity check
709 * is required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
710 * the codepoint at offset16 before calling. The result returned will be a newly created String
711 * obtained by calling source.substring(..) with the appropriate indexes.
713 * @param source The input string buffer.
714 * @param offset16 The UTF16 index to the codepoint in source
715 * @return string value of char32 in UTF16 format
718 public static String valueOf(StringBuffer source, int offset16) {
719 switch (bounds(source, offset16)) {
720 case LEAD_SURROGATE_BOUNDARY:
721 return source.substring(offset16, offset16 + 2);
722 case TRAIL_SURROGATE_BOUNDARY:
723 return source.substring(offset16 - 1, offset16 + 1);
725 return source.substring(offset16, offset16 + 1);
730 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
731 * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
732 * returned, except when either the leading or trailing surrogate character lies out of the
733 * specified subarray. In the latter case, only the surrogate character within bounds will be
734 * returned. If a validity check is required, use <a
735 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the codepoint at
736 * offset16 before calling. The result returned will be a newly created String containing the
737 * relevant characters.
739 * @param source The input char array.
740 * @param start Start index of the subarray
741 * @param limit End index of the subarray
742 * @param offset16 The UTF16 index to the codepoint in source relative to start
743 * @return string value of char32 in UTF16 format
746 public static String valueOf(char source[], int start, int limit, int offset16) {
747 switch (bounds(source, start, limit, offset16)) {
748 case LEAD_SURROGATE_BOUNDARY:
749 return new String(source, start + offset16, 2);
750 case TRAIL_SURROGATE_BOUNDARY:
751 return new String(source, start + offset16 - 1, 2);
753 return new String(source, start + offset16, 1);
757 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
758 * the <a name="_top_">class description</a> for notes on roundtripping.
760 * @param source The UTF-16 string
761 * @param offset32 UTF-32 offset
762 * @return UTF-16 offset
763 * @exception IndexOutOfBoundsException If offset32 is out of bounds.
766 public static int findOffsetFromCodePoint(String source, int offset32) {
768 int size = source.length(), result = 0, count = offset32;
769 if (offset32 < 0 || offset32 > size) {
770 throw new StringIndexOutOfBoundsException(offset32);
772 while (result < size && count > 0) {
773 ch = source.charAt(result);
774 if (isLeadSurrogate(ch) && ((result + 1) < size)
775 && isTrailSurrogate(source.charAt(result + 1))) {
783 throw new StringIndexOutOfBoundsException(offset32);
789 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
790 * the <a name="_top_">class description</a> for notes on roundtripping.
792 * @param source The UTF-16 string buffer
793 * @param offset32 UTF-32 offset
794 * @return UTF-16 offset
795 * @exception IndexOutOfBoundsException If offset32 is out of bounds.
798 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
800 int size = source.length(), result = 0, count = offset32;
801 if (offset32 < 0 || offset32 > size) {
802 throw new StringIndexOutOfBoundsException(offset32);
804 while (result < size && count > 0) {
805 ch = source.charAt(result);
806 if (isLeadSurrogate(ch) && ((result + 1) < size)
807 && isTrailSurrogate(source.charAt(result + 1))) {
815 throw new StringIndexOutOfBoundsException(offset32);
821 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
822 * the <a name="_top_">class description</a> for notes on roundtripping.
824 * @param source The UTF-16 char array whose substring is to be analysed
825 * @param start Offset of the substring to be analysed
826 * @param limit Offset of the substring to be analysed
827 * @param offset32 UTF-32 offset relative to start
828 * @return UTF-16 offset relative to start
829 * @exception IndexOutOfBoundsException If offset32 is out of bounds.
832 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
834 int result = start, count = offset32;
835 if (offset32 > limit - start) {
836 throw new ArrayIndexOutOfBoundsException(offset32);
838 while (result < limit && count > 0) {
840 if (isLeadSurrogate(ch) && ((result + 1) < limit)
841 && isTrailSurrogate(source[result + 1])) {
849 throw new ArrayIndexOutOfBoundsException(offset32);
851 return result - start;
855 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
856 * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for
857 * notes on roundtripping.<br>
858 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
859 * of the <strong>lead</strong> of the pair is returned. </i>
861 * To find the UTF-32 length of a string, use:
864 * len32 = countCodePoint(source, source.length());
870 * @param source Text to analyse
871 * @param offset16 UTF-16 offset < source text length.
872 * @return UTF-32 offset
873 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
876 public static int findCodePointOffset(String source, int offset16) {
877 if (offset16 < 0 || offset16 > source.length()) {
878 throw new StringIndexOutOfBoundsException(offset16);
883 boolean hadLeadSurrogate = false;
885 for (int i = 0; i < offset16; ++i) {
886 ch = source.charAt(i);
887 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
888 hadLeadSurrogate = false; // count valid trail as zero
890 hadLeadSurrogate = isLeadSurrogate(ch);
891 ++result; // count others as 1
895 if (offset16 == source.length()) {
899 // end of source being the less significant surrogate character
900 // shift result back to the start of the supplementary character
901 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
909 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
910 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
912 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
913 * of the <strong>lead</strong> of the pair is returned. </i>
915 * To find the UTF-32 length of a string, use:
918 * len32 = countCodePoint(source);
924 * @param source Text to analyse
925 * @param offset16 UTF-16 offset < source text length.
926 * @return UTF-32 offset
927 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
930 public static int findCodePointOffset(StringBuffer source, int offset16) {
931 if (offset16 < 0 || offset16 > source.length()) {
932 throw new StringIndexOutOfBoundsException(offset16);
937 boolean hadLeadSurrogate = false;
939 for (int i = 0; i < offset16; ++i) {
940 ch = source.charAt(i);
941 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
942 hadLeadSurrogate = false; // count valid trail as zero
944 hadLeadSurrogate = isLeadSurrogate(ch);
945 ++result; // count others as 1
949 if (offset16 == source.length()) {
953 // end of source being the less significant surrogate character
954 // shift result back to the start of the supplementary character
955 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
963 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
964 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
966 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
967 * of the <strong>lead</strong> of the pair is returned. </i>
969 * To find the UTF-32 length of a substring, use:
972 * len32 = countCodePoint(source, start, limit);
978 * @param source Text to analyse
979 * @param start Offset of the substring
980 * @param limit Offset of the substring
981 * @param offset16 UTF-16 relative to start
982 * @return UTF-32 offset relative to start
983 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
986 public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
988 if (offset16 > limit) {
989 throw new StringIndexOutOfBoundsException(offset16);
994 boolean hadLeadSurrogate = false;
996 for (int i = start; i < offset16; ++i) {
998 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
999 hadLeadSurrogate = false; // count valid trail as zero
1001 hadLeadSurrogate = isLeadSurrogate(ch);
1002 ++result; // count others as 1
1006 if (offset16 == limit) {
1010 // end of source being the less significant surrogate character
1011 // shift result back to the start of the supplementary character
1012 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
1020 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
1021 * use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on char32 before
1024 * @param target The buffer to append to
1025 * @param char32 Value to append.
1026 * @return the updated StringBuffer
1027 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
1030 public static StringBuffer append(StringBuffer target, int char32) {
1031 // Check for irregular values
1032 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1033 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
1036 // Write the UTF-16 values
1037 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1038 target.append(getLeadSurrogate(char32));
1039 target.append(getTrailSurrogate(char32));
1041 target.append((char) char32);
1047 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
1050 * @param target The buffer to append to
1051 * @param cp The code point to append
1052 * @return the updated StringBuffer
1053 * @throws IllegalArgumentException If cp is not a valid code point
1056 public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
1057 return append(target, cp);
1061 * Adds a codepoint to offset16 position of the argument char array.
1063 * @param target Char array to which the new code point is appended
1064 * @param limit UTF16 offset at which the codepoint will be appended.
1065 * @param char32 Code point to be appended
1066 * @return offset after char32 in the array.
1067 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
1068 * lie within the range of the Unicode codepoints.
1071 public static int append(char[] target, int limit, int char32) {
1072 // Check for irregular values
1073 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1074 throw new IllegalArgumentException("Illegal codepoint");
1076 // Write the UTF-16 values
1077 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
1078 target[limit++] = getLeadSurrogate(char32);
1079 target[limit++] = getTrailSurrogate(char32);
1081 target[limit++] = (char) char32;
1087 * Number of codepoints in a UTF16 String
1089 * @param source UTF16 string
1090 * @return number of codepoint in string
1093 public static int countCodePoint(String source) {
1094 if (source == null || source.length() == 0) {
1097 return findCodePointOffset(source, source.length());
1101 * Number of codepoints in a UTF16 String buffer
1103 * @param source UTF16 string buffer
1104 * @return number of codepoint in string
1107 public static int countCodePoint(StringBuffer source) {
1108 if (source == null || source.length() == 0) {
1111 return findCodePointOffset(source, source.length());
1115 * Number of codepoints in a UTF16 char array substring
1117 * @param source UTF16 char array
1118 * @param start Offset of the substring
1119 * @param limit Offset of the substring
1120 * @return number of codepoint in the substring
1121 * @exception IndexOutOfBoundsException If start and limit are not valid.
1124 public static int countCodePoint(char source[], int start, int limit) {
1125 if (source == null || source.length == 0) {
1128 return findCodePointOffset(source, start, limit, limit - start);
1132 * Set a code point into a UTF16 position. Adjusts target accordingly if we are replacing a
1133 * non-supplementary codepoint with a supplementary and vice versa.
1135 * @param target Stringbuffer
1136 * @param offset16 UTF16 position to insert into
1137 * @param char32 Code point
1140 public static void setCharAt(StringBuffer target, int offset16, int char32) {
1142 char single = target.charAt(offset16);
1144 if (isSurrogate(single)) {
1145 // pairs of the surrogate with offset16 at the lead char found
1146 if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
1147 && isTrailSurrogate(target.charAt(offset16 + 1))) {
1150 // pairs of the surrogate with offset16 at the trail char
1152 if (isTrailSurrogate(single) && (offset16 > 0)
1153 && isLeadSurrogate(target.charAt(offset16 - 1))) {
1159 target.replace(offset16, offset16 + count, valueOf(char32));
1163 * Set a code point into a UTF16 position in a char array. Adjusts target according if we are
1164 * replacing a non-supplementary codepoint with a supplementary and vice versa.
1166 * @param target char array
1167 * @param limit numbers of valid chars in target, different from target.length. limit counts the
1168 * number of chars in target that represents a string, not the size of array target.
1169 * @param offset16 UTF16 position to insert into
1170 * @param char32 code point
1171 * @return new number of chars in target that represents a string
1172 * @exception IndexOutOfBoundsException if offset16 is out of range
1175 public static int setCharAt(char target[], int limit, int offset16, int char32) {
1176 if (offset16 >= limit) {
1177 throw new ArrayIndexOutOfBoundsException(offset16);
1180 char single = target[offset16];
1182 if (isSurrogate(single)) {
1183 // pairs of the surrogate with offset16 at the lead char found
1184 if (isLeadSurrogate(single) && (target.length > offset16 + 1)
1185 && isTrailSurrogate(target[offset16 + 1])) {
1188 // pairs of the surrogate with offset16 at the trail char
1190 if (isTrailSurrogate(single) && (offset16 > 0)
1191 && isLeadSurrogate(target[offset16 - 1])) {
1198 String str = valueOf(char32);
1200 int strlength = str.length();
1201 target[offset16] = str.charAt(0);
1202 if (count == strlength) {
1204 target[offset16 + 1] = str.charAt(1);
1207 // this is not exact match in space, we'll have to do some
1209 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
1210 - (offset16 + count));
1211 if (count < strlength) {
1212 // char32 is a supplementary character trying to squeeze into
1213 // a non-supplementary space
1214 target[offset16 + 1] = str.charAt(1);
1216 if (result < target.length) {
1220 // char32 is a non-supplementary character trying to fill
1221 // into a supplementary space
1230 * Shifts offset16 by the argument number of codepoints
1232 * @param source string
1233 * @param offset16 UTF16 position to shift
1234 * @param shift32 number of codepoints to shift
1235 * @return new shifted offset16
1236 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
1239 public static int moveCodePointOffset(String source, int offset16, int shift32) {
1240 int result = offset16;
1241 int size = source.length();
1244 if (offset16 < 0 || offset16 > size) {
1245 throw new StringIndexOutOfBoundsException(offset16);
1248 if (shift32 + offset16 > size) {
1249 throw new StringIndexOutOfBoundsException(offset16);
1252 while (result < size && count > 0) {
1253 ch = source.charAt(result);
1254 if (isLeadSurrogate(ch) && ((result + 1) < size)
1255 && isTrailSurrogate(source.charAt(result + 1))) {
1262 if (offset16 + shift32 < 0) {
1263 throw new StringIndexOutOfBoundsException(offset16);
1265 for (count = -shift32; count > 0; count--) {
1270 ch = source.charAt(result);
1271 if (isTrailSurrogate(ch) && result > 0
1272 && isLeadSurrogate(source.charAt(result - 1))) {
1278 throw new StringIndexOutOfBoundsException(shift32);
1284 * Shifts offset16 by the argument number of codepoints
1286 * @param source String buffer
1287 * @param offset16 UTF16 position to shift
1288 * @param shift32 Number of codepoints to shift
1289 * @return new shifted offset16
1290 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
1293 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
1294 int result = offset16;
1295 int size = source.length();
1298 if (offset16 < 0 || offset16 > size) {
1299 throw new StringIndexOutOfBoundsException(offset16);
1302 if (shift32 + offset16 > size) {
1303 throw new StringIndexOutOfBoundsException(offset16);
1306 while (result < size && count > 0) {
1307 ch = source.charAt(result);
1308 if (isLeadSurrogate(ch) && ((result + 1) < size)
1309 && isTrailSurrogate(source.charAt(result + 1))) {
1316 if (offset16 + shift32 < 0) {
1317 throw new StringIndexOutOfBoundsException(offset16);
1319 for (count = -shift32; count > 0; count--) {
1324 ch = source.charAt(result);
1325 if (isTrailSurrogate(ch) && result > 0
1326 && isLeadSurrogate(source.charAt(result - 1))) {
1332 throw new StringIndexOutOfBoundsException(shift32);
1338 * Shifts offset16 by the argument number of codepoints within a subarray.
1340 * @param source Char array
1341 * @param start Position of the subarray to be performed on
1342 * @param limit Position of the subarray to be performed on
1343 * @param offset16 UTF16 position to shift relative to start
1344 * @param shift32 Number of codepoints to shift
1345 * @return new shifted offset16 relative to start
1346 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
1347 * subarray bounds are out of range.
1350 public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
1352 int size = source.length;
1355 int result = offset16 + start;
1356 if (start < 0 || limit < start) {
1357 throw new StringIndexOutOfBoundsException(start);
1360 throw new StringIndexOutOfBoundsException(limit);
1362 if (offset16 < 0 || result > limit) {
1363 throw new StringIndexOutOfBoundsException(offset16);
1366 if (shift32 + result > size) {
1367 throw new StringIndexOutOfBoundsException(result);
1370 while (result < limit && count > 0) {
1371 ch = source[result];
1372 if (isLeadSurrogate(ch) && (result + 1 < limit)
1373 && isTrailSurrogate(source[result + 1])) {
1380 if (result + shift32 < start) {
1381 throw new StringIndexOutOfBoundsException(result);
1383 for (count = -shift32; count > 0; count--) {
1385 if (result < start) {
1388 ch = source[result];
1389 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
1395 throw new StringIndexOutOfBoundsException(shift32);
1402 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1403 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1404 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
1407 * The overall effect is exactly as if the argument were converted to a string by the method
1408 * valueOf(char) and the characters in that string were then inserted into target at the
1409 * position indicated by offset16.
1412 * The offset argument must be greater than or equal to 0, and less than or equal to the length
1415 * @param target String buffer to insert to
1416 * @param offset16 Offset which char32 will be inserted in
1417 * @param char32 Codepoint to be inserted
1418 * @return a reference to target
1419 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1422 public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
1423 String str = valueOf(char32);
1424 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1427 target.insert(offset16, str);
1432 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
1433 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
1434 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
1436 * The overall effect is exactly as if the argument were converted to a string by the method
1437 * valueOf(char) and the characters in that string were then inserted into target at the
1438 * position indicated by offset16.
1441 * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
1443 * @param target Char array to insert to
1444 * @param limit End index of the char array, limit <= target.length
1445 * @param offset16 Offset which char32 will be inserted in
1446 * @param char32 Codepoint to be inserted
1447 * @return new limit size
1448 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1451 public static int insert(char target[], int limit, int offset16, int char32) {
1452 String str = valueOf(char32);
1453 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
1456 int size = str.length();
1457 if (limit + size > target.length) {
1458 throw new ArrayIndexOutOfBoundsException(offset16 + size);
1460 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
1461 target[offset16] = str.charAt(0);
1463 target[offset16 + 1] = str.charAt(1);
1465 return limit + size;
1469 * Removes the codepoint at the specified position in this target (shortening target by 1
1470 * character if the codepoint is a non-supplementary, 2 otherwise).
1472 * @param target String buffer to remove codepoint from
1473 * @param offset16 Offset which the codepoint will be removed
1474 * @return a reference to target
1475 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1478 public static StringBuffer delete(StringBuffer target, int offset16) {
1480 switch (bounds(target, offset16)) {
1481 case LEAD_SURROGATE_BOUNDARY:
1484 case TRAIL_SURROGATE_BOUNDARY:
1489 target.delete(offset16, offset16 + count);
1494 * Removes the codepoint at the specified position in this target (shortening target by 1
1495 * character if the codepoint is a non-supplementary, 2 otherwise).
1497 * @param target Char array to remove codepoint from
1498 * @param limit End index of the char array, limit <= target.length
1499 * @param offset16 Offset which the codepoint will be removed
1500 * @return a new limit size
1501 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
1504 public static int delete(char target[], int limit, int offset16) {
1506 switch (bounds(target, 0, limit, offset16)) {
1507 case LEAD_SURROGATE_BOUNDARY:
1510 case TRAIL_SURROGATE_BOUNDARY:
1515 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
1516 target[limit - count] = 0;
1517 return limit - count;
1521 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1522 * the argument codepoint. I.e., the smallest index <code>i</code> such that
1523 * <code>UTF16.charAt(source, i) ==
1524 * char32</code> is true.
1526 * If no such character occurs in this string, then -1 is returned.
1530 * UTF16.indexOf("abc", 'a') returns 0<br>
1531 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1532 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1534 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1535 * characters to its fullest.
1537 * @param source UTF16 format Unicode string that will be searched
1538 * @param char32 Codepoint to search for
1539 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1540 * -1 if the codepoint does not occur.
1543 public static int indexOf(String source, int char32) {
1544 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1545 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1547 // non-surrogate bmp
1548 if (char32 < LEAD_SURROGATE_MIN_VALUE
1549 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1550 return source.indexOf((char) char32);
1553 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1554 int result = source.indexOf((char) char32);
1556 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1557 && isTrailSurrogate(source.charAt(result + 1))) {
1558 return indexOf(source, char32, result + 1);
1561 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1562 return indexOf(source, char32, result + 1);
1568 String char32str = toString(char32);
1569 return source.indexOf(char32str);
1573 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1574 * the argument string str. This method is implemented based on codepoints, hence a "lead
1575 * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
1576 * starts with a trail surrogate character at index 0, an occurrence of str in source that is
1577 * immediately preceded by a lead surrogate character will not be a valid match. Vice versa for
1578 * a lead surrogate that ends str. See example below.
1580 * If no such string str occurs in this source, then -1 is returned.
1584 * UTF16.indexOf("abc", "ab") returns 0<br>
1585 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1586 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1588 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1589 * characters to its fullest.
1591 * @param source UTF16 format Unicode string that will be searched
1592 * @param str UTF16 format Unicode string to search for
1593 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1594 * -1 if the codepoint does not occur.
1597 public static int indexOf(String source, String str) {
1598 int strLength = str.length();
1599 // non-surrogate ends
1600 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1601 return source.indexOf(str);
1604 int result = source.indexOf(str);
1605 int resultEnd = result + strLength;
1607 // check last character
1608 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1609 && isTrailSurrogate(source.charAt(resultEnd + 1))) {
1610 return indexOf(source, str, resultEnd + 1);
1612 // check first character which is a trail surrogate
1613 if (isTrailSurrogate(str.charAt(0)) && result > 0
1614 && isLeadSurrogate(source.charAt(result - 1))) {
1615 return indexOf(source, str, resultEnd + 1);
1622 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1623 * the argument codepoint. I.e., the smallest index i such that: <br>
1624 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
1626 * If no such character occurs in this string, then -1 is returned.
1630 * UTF16.indexOf("abc", 'a', 1) returns -1<br>
1631 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
1632 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
1634 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1635 * characters to its fullest.
1637 * @param source UTF16 format Unicode string that will be searched
1638 * @param char32 Codepoint to search for
1639 * @param fromIndex The index to start the search from.
1640 * @return the index of the first occurrence of the codepoint in the argument Unicode string at
1641 * or after fromIndex, or -1 if the codepoint does not occur.
1644 public static int indexOf(String source, int char32, int fromIndex) {
1645 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1646 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1648 // non-surrogate bmp
1649 if (char32 < LEAD_SURROGATE_MIN_VALUE
1650 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1651 return source.indexOf((char) char32, fromIndex);
1654 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1655 int result = source.indexOf((char) char32, fromIndex);
1657 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1658 && isTrailSurrogate(source.charAt(result + 1))) {
1659 return indexOf(source, char32, result + 1);
1662 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1663 return indexOf(source, char32, result + 1);
1669 String char32str = toString(char32);
1670 return source.indexOf(char32str, fromIndex);
1674 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
1675 * the argument string str. This method is implemented based on codepoints, hence a "lead
1676 * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
1677 * starts with a trail surrogate character at index 0, an occurrence of str in source that is
1678 * immediately preceded by a lead surrogate character will not be a valid match. Vice versa for
1679 * a lead surrogate that ends str. See example below.
1681 * If no such string str occurs in this source, then -1 is returned.
1685 * UTF16.indexOf("abc", "ab", 0) returns 0<br>
1686 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
1687 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
1688 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
1690 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1691 * characters to its fullest.
1693 * @param source UTF16 format Unicode string that will be searched
1694 * @param str UTF16 format Unicode string to search for
1695 * @param fromIndex The index to start the search from.
1696 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
1697 * -1 if the codepoint does not occur.
1700 public static int indexOf(String source, String str, int fromIndex) {
1701 int strLength = str.length();
1702 // non-surrogate ends
1703 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1704 return source.indexOf(str, fromIndex);
1707 int result = source.indexOf(str, fromIndex);
1708 int resultEnd = result + strLength;
1710 // check last character
1711 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1712 && isTrailSurrogate(source.charAt(resultEnd))) {
1713 return indexOf(source, str, resultEnd + 1);
1715 // check first character which is a trail surrogate
1716 if (isTrailSurrogate(str.charAt(0)) && result > 0
1717 && isLeadSurrogate(source.charAt(result - 1))) {
1718 return indexOf(source, str, resultEnd + 1);
1725 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1726 * the argument codepoint. I.e., the index returned is the largest value i such that:
1727 * UTF16.charAt(source, i) == char32 is true.
1730 * UTF16.lastIndexOf("abc", 'a') returns 0<br>
1731 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
1732 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1735 * source is searched backwards starting at the last character.
1737 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1738 * characters to its fullest.
1740 * @param source UTF16 format Unicode string that will be searched
1741 * @param char32 Codepoint to search for
1742 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1746 public static int lastIndexOf(String source, int char32) {
1747 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1748 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1750 // non-surrogate bmp
1751 if (char32 < LEAD_SURROGATE_MIN_VALUE
1752 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1753 return source.lastIndexOf((char) char32);
1756 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1757 int result = source.lastIndexOf((char) char32);
1759 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1760 && isTrailSurrogate(source.charAt(result + 1))) {
1761 return lastIndexOf(source, char32, result - 1);
1764 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1765 return lastIndexOf(source, char32, result - 1);
1771 String char32str = toString(char32);
1772 return source.lastIndexOf(char32str);
1776 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1777 * the argument string str. This method is implemented based on codepoints, hence a "lead
1778 * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
1779 * starts with a trail surrogate character at index 0, an occurrence of str in source that is
1780 * immediately preceded by a lead surrogate character will not be a valid match. Vice versa for
1781 * a lead surrogate that ends str. See example below.
1784 * UTF16.lastIndexOf("abc", "a") returns 0<br>
1785 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
1786 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
1789 * source is searched backwards starting at the last character.
1791 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1792 * characters to its fullest.
1794 * @param source UTF16 format Unicode string that will be searched
1795 * @param str UTF16 format Unicode string to search for
1796 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1800 public static int lastIndexOf(String source, String str) {
1801 int strLength = str.length();
1802 // non-surrogate ends
1803 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1804 return source.lastIndexOf(str);
1807 int result = source.lastIndexOf(str);
1809 // check last character
1810 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1811 && isTrailSurrogate(source.charAt(result + strLength + 1))) {
1812 return lastIndexOf(source, str, result - 1);
1814 // check first character which is a trail surrogate
1815 if (isTrailSurrogate(str.charAt(0)) && result > 0
1816 && isLeadSurrogate(source.charAt(result - 1))) {
1817 return lastIndexOf(source, str, result - 1);
1825 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1826 * the argument codepoint, where the result is less than or equals to fromIndex.
1829 * This method is implemented based on codepoints, hence a single surrogate character will not
1830 * match a supplementary character.
1833 * source is searched backwards starting at the last character starting at the specified index.
1837 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
1838 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
1839 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
1840 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
1841 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
1843 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1844 * characters to its fullest.
1846 * @param source UTF16 format Unicode string that will be searched
1847 * @param char32 Codepoint to search for
1848 * @param fromIndex the index to start the search from. There is no restriction on the value of
1849 * fromIndex. If it is greater than or equal to the length of this string, it has the
1850 * same effect as if it were equal to one less than the length of this string: this
1851 * entire string may be searched. If it is negative, it has the same effect as if it
1852 * were -1: -1 is returned.
1853 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1857 public static int lastIndexOf(String source, int char32, int fromIndex) {
1858 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
1859 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
1861 // non-surrogate bmp
1862 if (char32 < LEAD_SURROGATE_MIN_VALUE
1863 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
1864 return source.lastIndexOf((char) char32, fromIndex);
1867 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
1868 int result = source.lastIndexOf((char) char32, fromIndex);
1870 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
1871 && isTrailSurrogate(source.charAt(result + 1))) {
1872 return lastIndexOf(source, char32, result - 1);
1875 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
1876 return lastIndexOf(source, char32, result - 1);
1882 String char32str = toString(char32);
1883 return source.lastIndexOf(char32str, fromIndex);
1888 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
1889 * the argument string str, where the result is less than or equals to fromIndex.
1892 * This method is implemented based on codepoints, hence a "lead surrogate character + trail
1893 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
1894 * character at index 0, a source with a leading a surrogate character before str found at in
1895 * source will not have a valid match. Vice versa for lead surrogates that ends str.
1897 * See example below.
1900 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
1901 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
1902 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
1903 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
1904 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
1907 * source is searched backwards starting at the last character.
1909 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1910 * characters to its fullest.
1912 * @param source UTF16 format Unicode string that will be searched
1913 * @param str UTF16 format Unicode string to search for
1914 * @param fromIndex the index to start the search from. There is no restriction on the value of
1915 * fromIndex. If it is greater than or equal to the length of this string, it has the
1916 * same effect as if it were equal to one less than the length of this string: this
1917 * entire string may be searched. If it is negative, it has the same effect as if it
1918 * were -1: -1 is returned.
1919 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
1923 public static int lastIndexOf(String source, String str, int fromIndex) {
1924 int strLength = str.length();
1925 // non-surrogate ends
1926 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
1927 return source.lastIndexOf(str, fromIndex);
1930 int result = source.lastIndexOf(str, fromIndex);
1932 // check last character
1933 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
1934 && isTrailSurrogate(source.charAt(result + strLength))) {
1935 return lastIndexOf(source, str, result - 1);
1937 // check first character which is a trail surrogate
1938 if (isTrailSurrogate(str.charAt(0)) && result > 0
1939 && isLeadSurrogate(source.charAt(result - 1))) {
1940 return lastIndexOf(source, str, result - 1);
1947 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
1948 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
1949 * format Unicode string source, then source will be returned. Otherwise, a new String object is
1950 * created that represents a codepoint sequence identical to the codepoint sequence represented
1951 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
1955 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
1956 * returns "mosquito in your collar"<br>
1957 * UTF16.replace("JonL", 'q', 'x');<br>
1958 * returns "JonL" (no change)<br>
1959 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
1960 * returns "Supplementary character !"<br>
1961 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
1962 * returns "Supplementary character \ud800\udc00"<br>
1964 * Note this method is provided as support to jdk 1.3, which does not support supplementary
1965 * characters to its fullest.
1967 * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
1968 * @param oldChar32 Non-zero old codepoint to be replaced.
1969 * @param newChar32 The new codepoint to replace oldChar32
 * @return new String derived from source by replacing every occurrence of oldChar32 with
 * newChar32; if oldChar32 does not occur in source, source itself is returned.
1974 public static String replace(String source, int oldChar32, int newChar32) {
1975 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
1976 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
1978 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
1979 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
1982 int index = indexOf(source, oldChar32);
1986 String newChar32Str = toString(newChar32);
1987 int oldChar32Size = 1;
1988 int newChar32Size = newChar32Str.length();
1989 StringBuffer result = new StringBuffer(source);
1990 int resultIndex = index;
1992 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
1996 while (index != -1) {
1997 int endResultIndex = resultIndex + oldChar32Size;
1998 result.replace(resultIndex, endResultIndex, newChar32Str);
1999 int lastEndIndex = index + oldChar32Size;
2000 index = indexOf(source, oldChar32, lastEndIndex);
2001 resultIndex += newChar32Size + index - lastEndIndex;
2003 return result.toString();
2007 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
2008 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
2009 * source, then source will be returned. Otherwise, a new String object is created that
2010 * represents a codepoint sequence identical to the codepoint sequence represented by source,
2011 * except that every occurrence of oldStr is replaced by an occurrence of newStr.
2014 * UTF16.replace("mesquite in your cellar", "e", "o");<br>
2015 * returns "mosquito in your collar"<br>
2016 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
2017 * returns "cat in your cellar"<br>
2018 * UTF16.replace("JonL", "q", "x");<br>
2019 * returns "JonL" (no change)<br>
 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", "!"); <br>
 * returns "Supplementary character !"<br>
 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", "!"); <br>
 * returns "Supplementary character \ud800\udc00"<br>
2025 * Note this method is provided as support to jdk 1.3, which does not support supplementary
2026 * characters to its fullest.
2028 * @param source UTF16 format Unicode string which the replacements will be based on.
2029 * @param oldStr Non-zero-length string to be replaced.
2030 * @param newStr The new string to replace oldStr
2031 * @return new String derived from source by replacing every occurrence of oldStr with newStr.
2032 * When no oldStr is found in source, then source will be returned.
2035 public static String replace(String source, String oldStr, String newStr) {
2036 int index = indexOf(source, oldStr);
2040 int oldStrSize = oldStr.length();
2041 int newStrSize = newStr.length();
2042 StringBuffer result = new StringBuffer(source);
2043 int resultIndex = index;
2045 while (index != -1) {
2046 int endResultIndex = resultIndex + oldStrSize;
2047 result.replace(resultIndex, endResultIndex, newStr);
2048 int lastEndIndex = index + oldStrSize;
2049 index = indexOf(source, oldStr, lastEndIndex);
2050 resultIndex += newStrSize + index - lastEndIndex;
2052 return result.toString();
 * Builds a new StringBuffer that holds the UTF16 format Unicode string of source in reverse
 * order. A lead surrogate followed by a trail surrogate is kept together in that order, instead of blindly reversing every character.
 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
 * @return a StringBuffer containing the reversed UTF16 format Unicode string.
2067 public static StringBuffer reverse(StringBuffer source) {
2068 int length = source.length();
2069 StringBuffer result = new StringBuffer(length);
2070 for (int i = length; i-- > 0;) {
2071 char ch = source.charAt(i);
2072 if (isTrailSurrogate(ch) && i > 0) {
2073 char ch2 = source.charAt(i - 1);
2074 if (isLeadSurrogate(ch2)) {
2087 * Check if the string contains more Unicode code points than a certain number. This is more
2088 * efficient than counting all code points in the entire string and comparing that number with a
2089 * threshold. This function may not need to scan the string at all if the length is within a
2090 * certain range, and never needs to count more than 'number + 1' code points. Logically
2091 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
2094 * @param source The input string.
2095 * @param number The number of code points in the string is compared against the 'number'
2097 * @return boolean value for whether the string contains more Unicode code points than 'number'.
2100 public static boolean hasMoreCodePointsThan(String source, int number) {
2104 if (source == null) {
2107 int length = source.length();
2109 // length >= 0 known
2110 // source contains at least (length + 1) / 2 code points: <= 2
2112 if (((length + 1) >> 1) > number) {
2116 // check if source does not even contain enough chars
2117 int maxsupplementary = length - number;
2118 if (maxsupplementary <= 0) {
2122 // there are maxsupplementary = length - number more chars than
2123 // asked-for code points
2125 // count code points until they exceed and also check that there are
2126 // no more than maxsupplementary supplementary code points (char pairs)
2135 if (isLeadSurrogate(source.charAt(start++)) && start != length
2136 && isTrailSurrogate(source.charAt(start))) {
2138 if (--maxsupplementary <= 0) {
2139 // too many pairs - too few code points
2148 * Check if the sub-range of char array, from argument start to limit, contains more Unicode
2149 * code points than a certain number. This is more efficient than counting all code points in
2150 * the entire char array range and comparing that number with a threshold. This function may not
2151 * need to scan the char array at all if start and limit is within a certain range, and never
2152 * needs to count more than 'number + 1' code points. Logically equivalent to
2153 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
2154 * or two code units.
2156 * @param source Array of UTF-16 chars
2157 * @param start Offset to substring in the source array for analyzing
2158 * @param limit Offset to substring in the source array for analyzing
2159 * @param number The number of code points in the string is compared against the 'number'
2161 * @return boolean value for whether the string contains more Unicode code points than 'number'.
 * @exception IndexOutOfBoundsException Thrown when start or limit is negative, or when limit < start
2165 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
2166 int length = limit - start;
2167 if (length < 0 || start < 0 || limit < 0) {
2168 throw new IndexOutOfBoundsException(
2169 "Start and limit indexes should be non-negative and start <= limit");
2174 if (source == null) {
2178 // length >= 0 known
2179 // source contains at least (length + 1) / 2 code points: <= 2
2181 if (((length + 1) >> 1) > number) {
2185 // check if source does not even contain enough chars
2186 int maxsupplementary = length - number;
2187 if (maxsupplementary <= 0) {
2191 // there are maxsupplementary = length - number more chars than
2192 // asked-for code points
2194 // count code points until they exceed and also check that there are
2195 // no more than maxsupplementary supplementary code points (char pairs)
2203 if (isLeadSurrogate(source[start++]) && start != limit
2204 && isTrailSurrogate(source[start])) {
2206 if (--maxsupplementary <= 0) {
2207 // too many pairs - too few code points
2216 * Check if the string buffer contains more Unicode code points than a certain number. This is
2217 * more efficient than counting all code points in the entire string buffer and comparing that
2218 * number with a threshold. This function may not need to scan the string buffer at all if the
2219 * length is within a certain range, and never needs to count more than 'number + 1' code
2220 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
2221 * either one or two code units.
2223 * @param source The input string buffer.
2224 * @param number The number of code points in the string buffer is compared against the 'number'
2226 * @return boolean value for whether the string buffer contains more Unicode code points than
2230 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
2234 if (source == null) {
2237 int length = source.length();
2239 // length >= 0 known
2240 // source contains at least (length + 1) / 2 code points: <= 2
2242 if (((length + 1) >> 1) > number) {
2246 // check if source does not even contain enough chars
2247 int maxsupplementary = length - number;
2248 if (maxsupplementary <= 0) {
2252 // there are maxsupplementary = length - number more chars than
2253 // asked-for code points
2255 // count code points until they exceed and also check that there are
2256 // no more than maxsupplementary supplementary code points (char pairs)
2265 if (isLeadSurrogate(source.charAt(start++)) && start != length
2266 && isTrailSurrogate(source.charAt(start))) {
2268 if (--maxsupplementary <= 0) {
2269 // too many pairs - too few code points
2278 * Cover JDK 1.5 API. Create a String from an array of codePoints.
2280 * @param codePoints The code array
2281 * @param offset The start of the text in the code point array
2282 * @param count The number of code points
 * @return a String representing the count code points that start at index offset
2284 * @throws IllegalArgumentException If an invalid code point is encountered
2285 * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
2288 public static String newString(int[] codePoints, int offset, int count) {
2290 throw new IllegalArgumentException();
2292 char[] chars = new char[count];
2294 for (int r = offset, e = offset + count; r < e; ++r) {
2295 int cp = codePoints[r];
2296 if (cp < 0 || cp > 0x10ffff) {
2297 throw new IllegalArgumentException();
2301 if (cp < 0x010000) {
2302 chars[w] = (char) cp;
2305 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
2306 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
2310 } catch (IndexOutOfBoundsException ex) {
2311 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
2312 / (r - offset + 1)));
2313 char[] temp = new char[newlen];
2314 System.arraycopy(chars, 0, temp, 0, w);
2319 return new String(chars, 0, w);
2324 * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
2328 * <li> Code point comparison or code unit comparison
2329 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
2330 * with special handling for character 'i'.
 * Code unit comparison and code point comparison differ only when comparing supplementary code points
 * (U+10000..U+10FFFF) to BMP code points near the end of the BMP (i.e.,
 * U+E000..U+FFFF). In code unit comparison, high BMP code points sort after
 * supplementary code points because supplementary code points are stored as pairs of surrogates, which are at
 * U+D800..U+DFFF.
2340 * @see #FOLD_CASE_DEFAULT
2341 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2344 public static final class StringComparator implements java.util.Comparator<String> {
2345 // public constructor ------------------------------------------------
2348 * Default constructor that does code unit comparison and case sensitive comparison.
2352 public StringComparator() {
2353 this(false, false, FOLD_CASE_DEFAULT);
2357 * Constructor that does comparison based on the argument options.
2359 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
2361 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
2362 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2363 * when ignorecase is set to true. If ignorecase is false, this option is
2365 * @see #FOLD_CASE_DEFAULT
2366 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2367 * @throws IllegalArgumentException If foldcaseoption is out of range
2370 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
2371 setCodePointCompare(codepointcompare);
2372 m_ignoreCase_ = ignorecase;
2373 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2374 throw new IllegalArgumentException("Invalid fold case option");
2376 m_foldCase_ = foldcaseoption;
2379 // public data member ------------------------------------------------
2382 * Option value for case folding comparison:
2384 * <p>Comparison is case insensitive, strings are folded using default mappings defined in
2385 * Unicode data file CaseFolding.txt, before comparison.
2389 public static final int FOLD_CASE_DEFAULT = 0;
2392 * Option value for case folding:
2393 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
2394 * and dotless i appropriately for Turkic languages (tr, az).
2396 * <p>Comparison is case insensitive, strings are folded using modified mappings defined in
2397 * Unicode data file CaseFolding.txt, before comparison.
2400 * @see com.ibm.icu.lang.UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
2402 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
2404 // public methods ----------------------------------------------------
2406 // public setters ----------------------------------------------------
2409 * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
2410 * is set to code unit compare
2412 * @param flag True for code point compare, false for code unit compare
2415 public void setCodePointCompare(boolean flag) {
2417 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
2419 m_codePointCompare_ = 0;
2424 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
2425 * case sensitive comparison mode if set to false.
2427 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
2428 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
2429 * when ignorecase is set to true. If ignorecase is false, this option is
2431 * @see #FOLD_CASE_DEFAULT
2432 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2435 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
2436 m_ignoreCase_ = ignorecase;
2437 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
2438 throw new IllegalArgumentException("Invalid fold case option");
2440 m_foldCase_ = foldcaseoption;
2443 // public getters ----------------------------------------------------
2446 * Checks if the comparison mode is code point compare.
2448 * @return true for code point compare, false for code unit compare
2451 public boolean getCodePointCompare() {
2452 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2456 * Checks if Comparator is in the case insensitive mode.
2458 * @return true if Comparator performs case insensitive comparison, false otherwise
2461 public boolean getIgnoreCase() {
2462 return m_ignoreCase_;
2466 * Gets the fold case options set in Comparator to be used with case insensitive comparison.
2468 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
2469 * @see #FOLD_CASE_DEFAULT
2470 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
2473 public int getIgnoreCaseOption() {
2477 // public other methods ----------------------------------------------
2480 * Compare two strings depending on the options selected during construction.
2482 * @param a first source string.
2483 * @param b second source string.
2484 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b,
2485 * a positive value is returned.
2486 * @exception ClassCastException thrown when either a or b is not a String object
2489 public int compare(String a, String b) {
2500 if (m_ignoreCase_) {
2501 return compareCaseInsensitive(a, b);
2503 return compareCaseSensitive(a, b);
2506 // private data member ----------------------------------------------
2509 * Code unit comparison flag. True if code unit comparison is required. False if code point
2510 * comparison is required.
2512 private int m_codePointCompare_;
2515 * Fold case comparison option.
2517 private int m_foldCase_;
2520 * Flag indicator if ignore case is to be used during comparison
2522 private boolean m_ignoreCase_;
2525 * Code point order offset for surrogate characters
2527 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
2529 // private method ---------------------------------------------------
2532 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
2536 * first string to compare
2538 * second string to compare
2539 * @return -1 is s1 < s2, 0 if equals,
2541 private int compareCaseInsensitive(String s1, String s2) {
2542 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
2543 | Normalizer.COMPARE_IGNORE_CASE);
2547 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
2551 * first string to compare
2553 * second string to compare
2554 * @return -1 is s1 < s2, 0 if equals,
2556 private int compareCaseSensitive(String s1, String s2) {
2557 // compare identical prefixes - they do not need to be fixed up
2558 // limit1 = start1 + min(lenght1, length2)
2559 int length1 = s1.length();
2560 int length2 = s2.length();
2561 int minlength = length1;
2563 if (length1 < length2) {
2565 } else if (length1 > length2) {
2567 minlength = length2;
2573 for (; index < minlength; index++) {
2574 c1 = s1.charAt(index);
2575 c2 = s2.charAt(index);
2576 // check pseudo-limit
2582 if (index == minlength) {
2586 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
2587 // if both values are in or above the surrogate range, fix them up
2588 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
2589 && codepointcompare) {
2590 // subtract 0x2800 from BMP code points to make them smaller
2591 // than supplementary ones
2592 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
2593 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
2594 // part of a surrogate pair, leave >=d800
2596 // BMP code point - may be surrogate code point - make
2598 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2601 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
2602 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
2603 // part of a surrogate pair, leave >=d800
2605 // BMP code point - may be surrogate code point - make <d800
2606 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
2610 // now c1 and c2 are in UTF-32-compatible order
2615 // private data members -------------------------------------------------
/**
 * Shift value for lead surrogate to form a supplementary character.
 */
private static final int LEAD_SURROGATE_SHIFT_ = 10;

/**
 * Mask to retrieve the significant value from a trail surrogate.
 */
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

/**
 * Value that all lead surrogates start with: adding (code point >> LEAD_SURROGATE_SHIFT_)
 * to it gives the lead surrogate of a supplementary code point.
 */
private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
        - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
2633 // private methods ------------------------------------------------------
2637 * Converts argument code point and returns a String object representing the code point's value
 * This method does not check the validity of the codepoint; the results are not guaranteed
 * if an invalid codepoint is passed as argument.
2645 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
2650 * @return string representation of the code point
2652 private static String toString(int ch) {
2653 if (ch < SUPPLEMENTARY_MIN_VALUE) {
2654 return String.valueOf((char) ch);
2657 StringBuilder result = new StringBuilder();
2658 result.append(getLeadSurrogate(ch));
2659 result.append(getTrailSurrogate(ch));
2660 return result.toString();