3 *******************************************************************************
\r
4 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
5 * others. All Rights Reserved. *
\r
6 *******************************************************************************
\r
9 package com.ibm.icu.text;
\r
11 import com.ibm.icu.impl.UCharacterProperty;
\r
12 import com.ibm.icu.impl.NormalizerImpl;
\r
16 * Standalone utility class providing UTF16 character conversions and indexing conversions.
\r
19 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
\r
20 * so searching for strings is a safe operation. Similarly, concatenation is always safe.
\r
21 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
\r
22 * values for start and end are on those boundaries, since they arose from operations like
\r
23 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
\r
25 * <strong>Examples:</strong>
\r
27 * The following examples illustrate use of some of these methods.
\r
30 * // iteration forwards: Original
\r
31 * for (int i = 0; i < s.length(); ++i) {
\r
32 * char ch = s.charAt(i);
\r
33 * doSomethingWith(ch);
\r
36 * // iteration forwards: Changes for UTF-32
\r
38 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
\r
39 * ch = UTF16.charAt(s, i);
\r
40 * doSomethingWith(ch);
\r
43 * // iteration backwards: Original
\r
44 * for (int i = s.length() - 1; i >= 0; --i) {
\r
45 * char ch = s.charAt(i);
\r
46 * doSomethingWith(ch);
\r
49 * // iteration backwards: Changes for UTF-32
\r
51 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
\r
52 * ch = UTF16.charAt(s, i);
\r
53 * doSomethingWith(ch);
\r
57 * <strong>Notes:</strong>
\r
59 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
\r
60 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
\r
61 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
\r
62 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
\r
63 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
\r
64 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
\r
65 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
\r
66 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL</code>.
\r
68 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
\r
69 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
\r
70 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
\r
71 * check for validity if desired. </li>
\r
72 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
\r
73 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
\r
74 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
\r
76 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
\r
77 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
\r
78 * percentage of all the text in the world, the singleton case should always be optimized for. </li>
\r
81 * @author Mark Davis, with help from Markus Scherer
\r
85 public final class UTF16 {
\r
86 // public variables ---------------------------------------------------
\r
89 * Value returned in <code><a href="#bounds(java.lang.String, int)">
\r
90 * bounds()</a></code>.
\r
91 * These values are chosen specifically so that it actually represents the position of the
\r
92 * character [offset16 - (value >> 2), offset16 + (value & 3)]
\r
96 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
\r
97 TRAIL_SURROGATE_BOUNDARY = 5;
\r
100 * The lowest Unicode code point value.
\r
104 public static final int CODEPOINT_MIN_VALUE = 0;
\r
107 * The highest Unicode code point value (scalar value) according to the Unicode Standard.
\r
111 public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
\r
114 * The minimum value for Supplementary code points
\r
118 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
\r
121 * Lead surrogate minimum value
\r
125 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
\r
128 * Trail surrogate minimum value
\r
132 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
\r
135 * Lead surrogate maximum value
\r
139 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
\r
142 * Trail surrogate maximum value
\r
146 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
\r
149 * Surrogate minimum value
\r
153 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
\r
156 * Maximum surrogate value
\r
160 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
\r
163 * Lead surrogate bitmask
\r
165 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
\r
168 * Trail surrogate bitmask
\r
170 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
\r
173 * Surrogate bitmask
\r
175 private static final int SURROGATE_BITMASK = 0xFFFFF800;
\r
178 * Lead surrogate bits
\r
180 private static final int LEAD_SURROGATE_BITS = 0xD800;
\r
183 * Trail surrogate bits
\r
185 private static final int TRAIL_SURROGATE_BITS = 0xDC00;
\r
190 private static final int SURROGATE_BITS = 0xD800;
\r
192 // constructor --------------------------------------------------------
\r
196 * Prevent instance from being created.
\r
202 // public method ------------------------------------------------------
\r
205 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
206 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
\r
207 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
\r
208 * UCharacter.isLegal()</a></code>
\r
209 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
210 * character will be returned. If a complete supplementary character is not found the incomplete
\r
211 * character will be returned
\r
214 * array of UTF-16 chars
\r
216 * UTF-16 offset to the start of the character.
\r
217 * @return UTF-32 value of the code point that contains the char at offset16. The boundaries
\r
218 * of that codepoint are the same as in <code>bounds32()</code>.
\r
219 * @exception IndexOutOfBoundsException
\r
220 * thrown if offset16 is out of bounds.
\r
223 public static int charAt(String source, int offset16) {
\r
224 char single = source.charAt(offset16);
\r
225 if (single < LEAD_SURROGATE_MIN_VALUE) {
\r
228 return _charAt(source, offset16, single);
\r
231 private static int _charAt(String source, int offset16, char single) {
\r
232 if (single > TRAIL_SURROGATE_MAX_VALUE) {
\r
236 // Convert the UTF-16 surrogate pair if necessary.
\r
237 // For simplicity in usage, and because the frequency of pairs is
\r
238 // low, look both directions.
\r
240 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
242 if (source.length() != offset16) {
\r
243 char trail = source.charAt(offset16);
\r
244 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
\r
245 return UCharacterProperty.getRawSupplementary(single, trail);
\r
250 if (offset16 >= 0) {
\r
251 // single is a trail surrogate so
\r
252 char lead = source.charAt(offset16);
\r
253 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
\r
254 return UCharacterProperty.getRawSupplementary(lead, single);
\r
258 return single; // return unmatched surrogate
\r
261 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
264 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
265 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
\r
266 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
\r
267 * UCharacter.isLegal()</a></code>
\r
268 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
269 * character will be returned. If a complete supplementary character is not found the incomplete
\r
270 * character will be returned
\r
273 * array of UTF-16 chars
\r
275 * UTF-16 offset to the start of the character.
\r
276 * @return UTF-32 value of the code point that contains the char at offset16. The boundaries
\r
277 * of that codepoint are the same as in <code>bounds32()</code>.
\r
278 * @exception IndexOutOfBoundsException
\r
279 * thrown if offset16 is out of bounds.
\r
282 public static int charAt(CharSequence source, int offset16) {
\r
283 char single = source.charAt(offset16);
\r
284 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
287 return _charAt(source, offset16, single);
\r
290 private static int _charAt(CharSequence source, int offset16, char single) {
\r
291 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
295 // Convert the UTF-16 surrogate pair if necessary.
\r
296 // For simplicity in usage, and because the frequency of pairs is
\r
297 // low, look both directions.
\r
299 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
301 if (source.length() != offset16) {
\r
302 char trail = source.charAt(offset16);
\r
303 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
\r
304 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
305 return UCharacterProperty.getRawSupplementary(single, trail);
\r
310 if (offset16 >= 0) {
\r
311 // single is a trail surrogate so
\r
312 char lead = source.charAt(offset16);
\r
313 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
\r
314 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
315 return UCharacterProperty.getRawSupplementary(lead, single);
\r
319 return single; // return unmatched surrogate
\r
325 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
326 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
\r
327 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()</a></code>
\r
329 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
330 * character will be returned. If a complete supplementary character is not found the incomplete
\r
331 * character will be returned
\r
334 * UTF-16 chars string buffer
\r
336 * UTF-16 offset to the start of the character.
\r
337 * @return UTF-32 value of the code point that contains the char at offset16. The boundaries
\r
338 * of that codepoint are the same as in <code>bounds32()</code>.
\r
339 * @exception IndexOutOfBoundsException
\r
340 * thrown if offset16 is out of bounds.
\r
343 public static int charAt(StringBuffer source, int offset16) {
\r
344 if (offset16 < 0 || offset16 >= source.length()) {
\r
345 throw new StringIndexOutOfBoundsException(offset16);
\r
348 char single = source.charAt(offset16);
\r
349 if (!isSurrogate(single)) {
\r
353 // Convert the UTF-16 surrogate pair if necessary.
\r
354 // For simplicity in usage, and because the frequency of pairs is
\r
355 // low, look both directions.
\r
357 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
359 if (source.length() != offset16) {
\r
360 char trail = source.charAt(offset16);
\r
361 if (isTrailSurrogate(trail))
\r
362 return UCharacterProperty.getRawSupplementary(single, trail);
\r
366 if (offset16 >= 0) {
\r
367 // single is a trail surrogate so
\r
368 char lead = source.charAt(offset16);
\r
369 if (isLeadSurrogate(lead)) {
\r
370 return UCharacterProperty.getRawSupplementary(lead, single);
\r
374 return single; // return unmatched surrogate
\r
378 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
\r
379 * (with <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
\r
380 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()</a></code>
\r
382 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
383 * character will be returned. If a complete supplementary character is not found the incomplete
\r
384 * character will be returned
\r
387 * array of UTF-16 chars
\r
389 * offset to substring in the source array for analyzing
\r
391 * offset to substring in the source array for analyzing
\r
393 * UTF-16 offset relative to start
\r
394 * @return UTF-32 value of the code point that contains the char at offset16. The boundaries
\r
395 * of that codepoint are the same as in <code>bounds32()</code>.
\r
396 * @exception IndexOutOfBoundsException
\r
397 * thrown if offset16 is not within the range of start and limit.
\r
400 public static int charAt(char source[], int start, int limit, int offset16) {
\r
402 if (offset16 < start || offset16 >= limit) {
\r
403 throw new ArrayIndexOutOfBoundsException(offset16);
\r
406 char single = source[offset16];
\r
407 if (!isSurrogate(single)) {
\r
411 // Convert the UTF-16 surrogate pair if necessary.
\r
412 // For simplicity in usage, and because the frequency of pairs is
\r
413 // low, look both directions.
\r
414 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
416 if (offset16 >= limit) {
\r
419 char trail = source[offset16];
\r
420 if (isTrailSurrogate(trail)) {
\r
421 return UCharacterProperty.getRawSupplementary(single, trail);
\r
423 } else { // isTrailSurrogate(single), so
\r
424 if (offset16 == start) {
\r
428 char lead = source[offset16];
\r
429 if (isLeadSurrogate(lead))
\r
430 return UCharacterProperty.getRawSupplementary(lead, single);
\r
432 return single; // return unmatched surrogate
\r
436 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
437 * <code>UTF16.getCharCount()</code>), as well as random access. If a validity check is
\r
438 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()</a></code>
\r
440 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
441 * character will be returned. If a complete supplementary character is not found the incomplete
\r
442 * character will be returned
\r
445 * UTF-16 chars string buffer
\r
447 * UTF-16 offset to the start of the character.
\r
448 * @return UTF-32 value of the code point that contains the char at offset16. The boundaries
\r
449 * of that codepoint are the same as in <code>bounds32()</code>.
\r
450 * @exception IndexOutOfBoundsException
\r
451 * thrown if offset16 is out of bounds.
\r
454 public static int charAt(Replaceable source, int offset16) {
\r
455 if (offset16 < 0 || offset16 >= source.length()) {
\r
456 throw new StringIndexOutOfBoundsException(offset16);
\r
459 char single = source.charAt(offset16);
\r
460 if (!isSurrogate(single)) {
\r
464 // Convert the UTF-16 surrogate pair if necessary.
\r
465 // For simplicity in usage, and because the frequency of pairs is
\r
466 // low, look both directions.
\r
468 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
470 if (source.length() != offset16) {
\r
471 char trail = source.charAt(offset16);
\r
472 if (isTrailSurrogate(trail))
\r
473 return UCharacterProperty.getRawSupplementary(single, trail);
\r
477 if (offset16 >= 0) {
\r
478 // single is a trail surrogate so
\r
479 char lead = source.charAt(offset16);
\r
480 if (isLeadSurrogate(lead)) {
\r
481 return UCharacterProperty.getRawSupplementary(lead, single);
\r
485 return single; // return unmatched surrogate
\r
489 * Determines how many chars this char32 requires. If a validity check is required, use <code>
\r
490 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
\r
491 * on char32 before calling.
\r
494 * the input codepoint.
\r
495 * @return 2 if is in supplementary space, otherwise 1.
\r
498 public static int getCharCount(int char32) {
\r
499 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
506 * Returns the type of the boundaries around the char at offset16. Used for random access.
\r
514 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
\r
515 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
\r
516 * are [offset16, offset16 + 2]
\r
517 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
\r
518 * bounds are [offset16 - 1, offset16 + 1]
\r
520 * For bit-twiddlers, the return values for these are chosen so that the boundaries
\r
521 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
\r
522 * @exception IndexOutOfBoundsException
\r
523 * if offset16 is out of bounds.
\r
526 public static int bounds(String source, int offset16) {
\r
527 char ch = source.charAt(offset16);
\r
528 if (isSurrogate(ch)) {
\r
529 if (isLeadSurrogate(ch)) {
\r
530 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
\r
531 return LEAD_SURROGATE_BOUNDARY;
\r
534 // isTrailSurrogate(ch), so
\r
536 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
\r
537 return TRAIL_SURROGATE_BOUNDARY;
\r
541 return SINGLE_CHAR_BOUNDARY;
\r
545 * Returns the type of the boundaries around the char at offset16. Used for random access.
\r
548 * string buffer to analyse
\r
553 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
\r
554 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
\r
555 * are [offset16, offset16 + 2]
\r
556 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
\r
557 * bounds are [offset16 - 1, offset16 + 1]
\r
559 * For bit-twiddlers, the return values for these are chosen so that the boundaries
\r
560 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
\r
561 * @exception IndexOutOfBoundsException
\r
562 * if offset16 is out of bounds.
\r
565 public static int bounds(StringBuffer source, int offset16) {
\r
566 char ch = source.charAt(offset16);
\r
567 if (isSurrogate(ch)) {
\r
568 if (isLeadSurrogate(ch)) {
\r
569 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
\r
570 return LEAD_SURROGATE_BOUNDARY;
\r
573 // isTrailSurrogate(ch), so
\r
575 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
\r
576 return TRAIL_SURROGATE_BOUNDARY;
\r
580 return SINGLE_CHAR_BOUNDARY;
\r
584 * Returns the type of the boundaries around the char at offset16. Used for random access. Note
\r
585 * that the boundaries are determined with respect to the subarray, hence the char array
\r
586 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
\r
589 * char array to analyse
\r
591 * offset to substring in the source array for analyzing
\r
593 * offset to substring in the source array for analyzing
\r
595 * UTF16 offset relative to start
\r
598 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
\r
599 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
\r
600 * are [offset16, offset16 + 2]
\r
601 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
\r
602 * bounds are [offset16 - 1, offset16 + 1]
\r
604 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries
\r
605 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
\r
606 * @exception IndexOutOfBoundsException
\r
607 * if offset16 is not within the range of start and limit.
\r
610 public static int bounds(char source[], int start, int limit, int offset16) {
\r
612 if (offset16 < start || offset16 >= limit) {
\r
613 throw new ArrayIndexOutOfBoundsException(offset16);
\r
615 char ch = source[offset16];
\r
616 if (isSurrogate(ch)) {
\r
617 if (isLeadSurrogate(ch)) {
\r
619 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
\r
620 return LEAD_SURROGATE_BOUNDARY;
\r
622 } else { // isTrailSurrogate(ch), so
\r
624 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
\r
625 return TRAIL_SURROGATE_BOUNDARY;
\r
629 return SINGLE_CHAR_BOUNDARY;
\r
633 * Determines whether the code value is a surrogate.
\r
636 * the input character.
\r
637 * @return true iff the input character is a surrogate.
\r
640 public static boolean isSurrogate(char char16) {
\r
641 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
\r
645 * Determines whether the character is a trail surrogate.
\r
648 * the input character.
\r
649 * @return true iff the input character is a trail surrogate.
\r
652 public static boolean isTrailSurrogate(char char16) {
\r
653 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
\r
657 * Determines whether the character is a lead surrogate.
\r
660 * the input character.
\r
661 * @return true iff the input character is a lead surrogate
\r
664 public static boolean isLeadSurrogate(char char16) {
\r
665 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
\r
669 * Returns the lead surrogate. If a validity check is required, use
\r
670 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
\r
674 * the input character.
\r
675 * @return lead surrogate if the getCharCount(ch) is 2; <br>
\r
676 * and 0 otherwise (note: 0 is not a valid lead surrogate).
\r
679 public static char getLeadSurrogate(int char32) {
\r
680 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
681 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
\r
687 * Returns the trail surrogate. If a validity check is required, use
\r
688 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
\r
692 * the input character.
\r
693 * @return the trail surrogate if the getCharCount(ch) is 2; <br>
\r
694 * otherwise the character itself
\r
697 public static char getTrailSurrogate(int char32) {
\r
698 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
699 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
\r
701 return (char) char32;
\r
705 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
\r
706 * containing the UTF-32 value in UTF16 format. If a validity check is required, use <code><a
\r
707 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling.
\r
710 * the input character.
\r
711 * @return string value of char32 in UTF16 format
\r
712 * @exception IllegalArgumentException
\r
713 * thrown if char32 is an invalid codepoint.
\r
716 public static String valueOf(int char32) {
\r
717 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
718 throw new IllegalArgumentException("Illegal codepoint");
\r
720 return toString(char32);
\r
724 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
\r
725 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
\r
726 * character, the whole supplementary codepoint will be returned. If a validity check is
\r
727 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
\r
728 * codepoint at offset16 before calling. The result returned will be a newly created String
\r
729 * obtained by calling source.substring(..) with the appropriate indexes.
\r
732 * the input string.
\r
734 * the UTF16 index to the codepoint in source
\r
735 * @return string value of char32 in UTF16 format
\r
738 public static String valueOf(String source, int offset16) {
\r
739 switch (bounds(source, offset16)) {
\r
740 case LEAD_SURROGATE_BOUNDARY:
\r
741 return source.substring(offset16, offset16 + 2);
\r
742 case TRAIL_SURROGATE_BOUNDARY:
\r
743 return source.substring(offset16 - 1, offset16 + 1);
\r
745 return source.substring(offset16, offset16 + 1);
\r
750 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
\r
751 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
\r
752 * surrogate character, the whole supplementary codepoint will be returned. If a validity check
\r
753 * is required, use <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
\r
754 * the codepoint at offset16 before calling. The result returned will be a newly created String
\r
755 * obtained by calling source.substring(..) with the appropriate indexes.
\r
758 * the input string buffer.
\r
760 * the UTF16 index to the codepoint in source
\r
761 * @return string value of char32 in UTF16 format
\r
764 public static String valueOf(StringBuffer source, int offset16) {
\r
765 switch (bounds(source, offset16)) {
\r
766 case LEAD_SURROGATE_BOUNDARY:
\r
767 return source.substring(offset16, offset16 + 2);
\r
768 case TRAIL_SURROGATE_BOUNDARY:
\r
769 return source.substring(offset16 - 1, offset16 + 1);
\r
771 return source.substring(offset16, offset16 + 1);
\r
776 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
\r
777 * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
\r
778 * returned, except when either the leading or trailing surrogate character lies out of the
\r
779 * specified subarray. In the latter case, only the surrogate character within bounds will be
\r
780 * returned. If a validity check is required, use <code><a
\r
781 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the codepoint at
\r
782 * offset16 before calling. The result returned will be a newly created String containing the
\r
783 * relevant characters.
\r
786 * the input char array.
\r
788 * start index of the subarray
\r
790 * end index of the subarray
\r
792 * the UTF16 index to the codepoint in source relative to start
\r
793 * @return string value of char32 in UTF16 format
\r
796 public static String valueOf(char source[], int start, int limit, int offset16) {
\r
797 switch (bounds(source, start, limit, offset16)) {
\r
798 case LEAD_SURROGATE_BOUNDARY:
\r
799 return new String(source, start + offset16, 2);
\r
800 case TRAIL_SURROGATE_BOUNDARY:
\r
801 return new String(source, start + offset16 - 1, 2);
\r
803 return new String(source, start + offset16, 1);
\r
807 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
\r
808 * the <a name="_top_">class description</a> for notes on roundtripping.
\r
811 * the UTF-16 string
\r
814 * @return UTF-16 offset
\r
815 * @exception IndexOutOfBoundsException
\r
816 * if offset32 is out of bounds.
\r
819 public static int findOffsetFromCodePoint(String source, int offset32) {
\r
821 int size = source.length(), result = 0, count = offset32;
\r
822 if (offset32 < 0 || offset32 > size) {
\r
823 throw new StringIndexOutOfBoundsException(offset32);
\r
825 while (result < size && count > 0) {
\r
826 ch = source.charAt(result);
\r
827 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
828 && isTrailSurrogate(source.charAt(result + 1))) {
\r
836 throw new StringIndexOutOfBoundsException(offset32);
\r
842 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
\r
843 * the <a name="_top_">class description</a> for notes on roundtripping.
\r
846 * the UTF-16 string buffer
\r
849 * @return UTF-16 offset
\r
850 * @exception IndexOutOfBoundsException
\r
851 * if offset32 is out of bounds.
\r
854 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
\r
856 int size = source.length(), result = 0, count = offset32;
\r
857 if (offset32 < 0 || offset32 > size) {
\r
858 throw new StringIndexOutOfBoundsException(offset32);
\r
860 while (result < size && count > 0) {
\r
861 ch = source.charAt(result);
\r
862 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
863 && isTrailSurrogate(source.charAt(result + 1))) {
\r
871 throw new StringIndexOutOfBoundsException(offset32);
\r
877 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
\r
878 * the <a name="_top_">class description</a> for notes on roundtripping.
\r
881 * the UTF-16 char array whose substring is to be analysed
\r
883 * offset of the substring to be analysed
\r
885 * offset of the substring to be analysed
\r
887 * UTF-32 offset relative to start
\r
888 * @return UTF-16 offset relative to start
\r
889 * @exception IndexOutOfBoundsException
\r
890 * if offset32 is out of bounds.
\r
893 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
\r
895 int result = start, count = offset32;
\r
896 if (offset32 > limit - start) {
\r
897 throw new ArrayIndexOutOfBoundsException(offset32);
\r
899 while (result < limit && count > 0) {
\r
900 ch = source[result];
\r
901 if (isLeadSurrogate(ch) && ((result + 1) < limit)
\r
902 && isTrailSurrogate(source[result + 1])) {
\r
910 throw new ArrayIndexOutOfBoundsException(offset32);
\r
912 return result - start;
\r
916 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
\r
917 * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for
\r
918 * notes on roundtripping.<br>
\r
919 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
\r
920 * of the <strong>lead</strong> of the pair is returned. </i>
\r
922 * To find the UTF-32 length of a string, use:
\r
925 * len32 = countCodePoint(source, source.length());
\r
934 * UTF-16 offset &lt;= source text length.
\r
935 * @return UTF-32 offset
\r
936 * @exception IndexOutOfBoundsException
\r
937 * if offset16 is out of bounds.
\r
940 public static int findCodePointOffset(String source, int offset16) {
\r
941 if (offset16 < 0 || offset16 > source.length()) {
\r
942 throw new StringIndexOutOfBoundsException(offset16);
\r
947 boolean hadLeadSurrogate = false;
\r
949 for (int i = 0; i < offset16; ++i) {
\r
950 ch = source.charAt(i);
\r
951 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
\r
952 hadLeadSurrogate = false; // count valid trail as zero
\r
954 hadLeadSurrogate = isLeadSurrogate(ch);
\r
955 ++result; // count others as 1
\r
959 if (offset16 == source.length()) {
\r
963 // end of source being the less significant surrogate character
\r
964 // shift result back to the start of the supplementary character
\r
965 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
\r
973 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
\r
974 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
\r
975 * roundtripping.<br>
\r
976 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
\r
977 * of the <strong>lead</strong> of the pair is returned. </i>
\r
979 * To find the UTF-32 length of a string, use:
\r
982 * len32 = countCodePoint(source);
\r
991 * UTF-16 offset &lt;= source text length.
\r
992 * @return UTF-32 offset
\r
993 * @exception IndexOutOfBoundsException
\r
994 * if offset16 is out of bounds.
\r
997 public static int findCodePointOffset(StringBuffer source, int offset16) {
\r
998 if (offset16 < 0 || offset16 > source.length()) {
\r
999 throw new StringIndexOutOfBoundsException(offset16);
\r
1004 boolean hadLeadSurrogate = false;
\r
1006 for (int i = 0; i < offset16; ++i) {
\r
1007 ch = source.charAt(i);
\r
1008 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
\r
1009 hadLeadSurrogate = false; // count valid trail as zero
\r
1011 hadLeadSurrogate = isLeadSurrogate(ch);
\r
1012 ++result; // count others as 1
\r
1016 if (offset16 == source.length()) {
\r
1020 // end of source being the less significant surrogate character
\r
1021 // shift result back to the start of the supplementary character
\r
1022 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
\r
1030 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
\r
1031 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
\r
1032 * roundtripping.<br>
\r
1033 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
\r
1034 * of the <strong>lead</strong> of the pair is returned. </i>
\r
1036 * To find the UTF-32 length of a substring, use:
\r
1039 * len32 = countCodePoint(source, start, limit);
\r
1048 * offset of the substring
\r
1050 * offset of the substring
\r
1052 * UTF-16 relative to start
\r
1053 * @return UTF-32 offset relative to start
\r
1054 * @exception IndexOutOfBoundsException
\r
1055 * if offset16 is not within the range of start and limit.
\r
1058 public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
\r
1059 offset16 += start;
\r
1060 if (offset16 > limit) {
\r
1061 throw new StringIndexOutOfBoundsException(offset16);
\r
1066 boolean hadLeadSurrogate = false;
\r
1068 for (int i = start; i < offset16; ++i) {
\r
1070 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
\r
1071 hadLeadSurrogate = false; // count valid trail as zero
\r
1073 hadLeadSurrogate = isLeadSurrogate(ch);
\r
1074 ++result; // count others as 1
\r
1078 if (offset16 == limit) {
\r
1082 // end of source being the less significant surrogate character
\r
1083 // shift result back to the start of the supplementary character
\r
1084 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
\r
1092 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
\r
1093 * use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on char32 before
\r
1097 * the buffer to append to
\r
1099 * value to append.
\r
1100 * @return the updated StringBuffer
\r
1101 * @exception IllegalArgumentException
\r
1102 * thrown when char32 does not lie within the range of the Unicode codepoints
\r
1105 public static StringBuffer append(StringBuffer target, int char32) {
\r
1106 // Check for irregular values
\r
1107 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1108 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
\r
1111 // Write the UTF-16 values
\r
1112 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
1113 target.append(getLeadSurrogate(char32));
\r
1114 target.append(getTrailSurrogate(char32));
\r
1116 target.append((char) char32);
\r
1122 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
\r
1126 * the buffer to append to
\r
1128 * the code point to append
\r
1129 * @return the updated StringBuffer
\r
1130 * @throws IllegalArgumentException
\r
1131 * if cp is not a valid code point
\r
1134 public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
\r
1135 return append(target, cp);
\r
1139 * Adds a codepoint to offset16 position of the argument char array.
\r
1142 * char array to be appended with the new code point
\r
1144 * UTF16 offset at which the codepoint will be appended.
\r
1146 * code point to be appended
\r
1147 * @return offset after char32 in the array.
\r
1148 * @exception IllegalArgumentException
\r
1149 * thrown if there is not enough space for the append, or when char32 does not
\r
1150 * lie within the range of the Unicode codepoints.
\r
1153 public static int append(char[] target, int limit, int char32) {
\r
1154 // Check for irregular values
\r
1155 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1156 throw new IllegalArgumentException("Illegal codepoint");
\r
1158 // Write the UTF-16 values
\r
1159 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
1160 target[limit++] = getLeadSurrogate(char32);
\r
1161 target[limit++] = getTrailSurrogate(char32);
\r
1163 target[limit++] = (char) char32;
\r
1169 * Number of codepoints in a UTF16 String
\r
1173 * @return number of codepoints in string
\r
1176 public static int countCodePoint(String source) {
\r
1177 if (source == null || source.length() == 0) {
\r
1180 return findCodePointOffset(source, source.length());
\r
1184 * Number of codepoints in a UTF16 String buffer
\r
1187 * UTF16 string buffer
\r
1188 * @return number of codepoints in string
\r
1191 public static int countCodePoint(StringBuffer source) {
\r
1192 if (source == null || source.length() == 0) {
\r
1195 return findCodePointOffset(source, source.length());
\r
1199 * Number of codepoints in a UTF16 char array substring
\r
1202 * UTF16 char array
\r
1204 * offset of the substring
\r
1206 * offset of the substring
\r
1207 * @return number of codepoints in the substring
\r
1208 * @exception IndexOutOfBoundsException
\r
1209 * if start and limit are not valid.
\r
1212 public static int countCodePoint(char source[], int start, int limit) {
\r
1213 if (source == null || source.length == 0) {
\r
1216 return findCodePointOffset(source, start, limit, limit - start);
\r
1220 * Set a code point into a UTF16 position. Adjusts target accordingly if we are replacing a
\r
1221 * non-supplementary codepoint with a supplementary and vice versa.
\r
1226 * UTF16 position to insert into
\r
1231 public static void setCharAt(StringBuffer target, int offset16, int char32) {
\r
1233 char single = target.charAt(offset16);
\r
1235 if (isSurrogate(single)) {
\r
1236 // pairs of the surrogate with offset16 at the lead char found
\r
1237 if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
\r
1238 && isTrailSurrogate(target.charAt(offset16 + 1))) {
\r
1241 // pairs of the surrogate with offset16 at the trail char
\r
1243 if (isTrailSurrogate(single) && (offset16 > 0)
\r
1244 && isLeadSurrogate(target.charAt(offset16 - 1))) {
\r
1250 target.replace(offset16, offset16 + count, valueOf(char32));
\r
1254 * Set a code point into a UTF16 position in a char array. Adjusts target accordingly if we are
\r
1255 * replacing a non-supplementary codepoint with a supplementary and vice versa.
\r
1260 * numbers of valid chars in target, different from target.length. limit counts the
\r
1261 * number of chars in target that represents a string, not the size of array target.
\r
1263 * UTF16 position to insert into
\r
1266 * @return new number of chars in target that represents a string
\r
1267 * @exception IndexOutOfBoundsException
\r
1268 * if offset16 is out of range
\r
1271 public static int setCharAt(char target[], int limit, int offset16, int char32) {
\r
1272 if (offset16 >= limit) {
\r
1273 throw new ArrayIndexOutOfBoundsException(offset16);
\r
1276 char single = target[offset16];
\r
1278 if (isSurrogate(single)) {
\r
1279 // pairs of the surrogate with offset16 at the lead char found
\r
1280 if (isLeadSurrogate(single) && (target.length > offset16 + 1)
\r
1281 && isTrailSurrogate(target[offset16 + 1])) {
\r
1284 // pairs of the surrogate with offset16 at the trail char
\r
1286 if (isTrailSurrogate(single) && (offset16 > 0)
\r
1287 && isLeadSurrogate(target[offset16 - 1])) {
\r
1294 String str = valueOf(char32);
\r
1295 int result = limit;
\r
1296 int strlength = str.length();
\r
1297 target[offset16] = str.charAt(0);
\r
1298 if (count == strlength) {
\r
1300 target[offset16 + 1] = str.charAt(1);
\r
1303 // this is not exact match in space, we'll have to do some
\r
1305 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
\r
1306 - (offset16 + count));
\r
1307 if (count < strlength) {
\r
1308 // char32 is a supplementary character trying to squeeze into
\r
1309 // a non-supplementary space
\r
1310 target[offset16 + 1] = str.charAt(1);
\r
1312 if (result < target.length) {
\r
1313 target[result] = 0;
\r
1316 // char32 is a non-supplementary character trying to fill
\r
1317 // into a supplementary space
\r
1319 target[result] = 0;
\r
1326 * Shifts offset16 by the argument number of codepoints
\r
1331 * UTF16 position to shift
\r
1333 * number of codepoints to shift
\r
1334 * @return new shifted offset16
\r
1335 * @exception IndexOutOfBoundsException
\r
1336 * if the new offset16 is out of bounds.
\r
1339 public static int moveCodePointOffset(String source, int offset16, int shift32) {
\r
1340 int result = offset16;
\r
1341 int size = source.length();
\r
1344 if (offset16 < 0 || offset16 > size) {
\r
1345 throw new StringIndexOutOfBoundsException(offset16);
\r
1347 if (shift32 > 0) {
\r
1348 if (shift32 + offset16 > size) {
\r
1349 throw new StringIndexOutOfBoundsException(offset16);
\r
1352 while (result < size && count > 0) {
\r
1353 ch = source.charAt(result);
\r
1354 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
1355 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1362 if (offset16 + shift32 < 0) {
\r
1363 throw new StringIndexOutOfBoundsException(offset16);
\r
1365 for (count = -shift32; count > 0; count--) {
\r
1370 ch = source.charAt(result);
\r
1371 if (isTrailSurrogate(ch) && result > 0
\r
1372 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1378 throw new StringIndexOutOfBoundsException(shift32);
\r
1384 * Shifts offset16 by the argument number of codepoints
\r
1389 * UTF16 position to shift
\r
1391 * number of codepoints to shift
\r
1392 * @return new shifted offset16
\r
1393 * @exception IndexOutOfBoundsException
\r
1394 * if the new offset16 is out of bounds.
\r
1397 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
\r
1398 int result = offset16;
\r
1399 int size = source.length();
\r
1402 if (offset16 < 0 || offset16 > size) {
\r
1403 throw new StringIndexOutOfBoundsException(offset16);
\r
1405 if (shift32 > 0) {
\r
1406 if (shift32 + offset16 > size) {
\r
1407 throw new StringIndexOutOfBoundsException(offset16);
\r
1410 while (result < size && count > 0) {
\r
1411 ch = source.charAt(result);
\r
1412 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
1413 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1420 if (offset16 + shift32 < 0) {
\r
1421 throw new StringIndexOutOfBoundsException(offset16);
\r
1423 for (count = -shift32; count > 0; count--) {
\r
1428 ch = source.charAt(result);
\r
1429 if (isTrailSurrogate(ch) && result > 0
\r
1430 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1436 throw new StringIndexOutOfBoundsException(shift32);
\r
1442 * Shifts offset16 by the argument number of codepoints within a subarray.
\r
1447 * position of the subarray to be performed on
\r
1449 * position of the subarray to be performed on
\r
1451 * UTF16 position to shift relative to start
\r
1453 * number of codepoints to shift
\r
1454 * @return new shifted offset16 relative to start
\r
1455 * @exception IndexOutOfBoundsException
\r
1456 * if the new offset16 is out of bounds with respect to the subarray or the
\r
1457 * subarray bounds are out of range.
\r
1460 public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
\r
1462 int size = source.length;
\r
1465 int result = offset16 + start;
\r
1466 if (start < 0 || limit < start) {
\r
1467 throw new StringIndexOutOfBoundsException(start);
\r
1469 if (limit > size) {
\r
1470 throw new StringIndexOutOfBoundsException(limit);
\r
1472 if (offset16 < 0 || result > limit) {
\r
1473 throw new StringIndexOutOfBoundsException(offset16);
\r
1475 if (shift32 > 0) {
\r
1476 if (shift32 + result > size) {
\r
1477 throw new StringIndexOutOfBoundsException(result);
\r
1480 while (result < limit && count > 0) {
\r
1481 ch = source[result];
\r
1482 if (isLeadSurrogate(ch) && (result + 1 < limit)
\r
1483 && isTrailSurrogate(source[result + 1])) {
\r
1490 if (result + shift32 < start) {
\r
1491 throw new StringIndexOutOfBoundsException(result);
\r
1493 for (count = -shift32; count > 0; count--) {
\r
1495 if (result < start) {
\r
1498 ch = source[result];
\r
1499 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
\r
1505 throw new StringIndexOutOfBoundsException(shift32);
\r
1512 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
\r
1513 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
\r
1514 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
\r
1517 * The overall effect is exactly as if the argument were converted to a string by the method
\r
1518 * valueOf(char) and the characters in that string were then inserted into target at the
\r
1519 * position indicated by offset16.
\r
1522 * The offset argument must be greater than or equal to 0, and less than or equal to the length
\r
1526 * string buffer to insert to
\r
1528 * offset which char32 will be inserted in
\r
1530 * codepoint to be inserted
\r
1531 * @return a reference to target
\r
1532 * @exception IndexOutOfBoundsException
\r
1533 * thrown if offset16 is invalid.
\r
1536 public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
\r
1537 String str = valueOf(char32);
\r
1538 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
\r
1541 target.insert(offset16, str);
\r
1546 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
\r
1547 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
\r
1548 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
\r
1550 * The overall effect is exactly as if the argument were converted to a string by the method
\r
1551 * valueOf(char) and the characters in that string were then inserted into target at the
\r
1552 * position indicated by offset16.
\r
1555 * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
\r
1558 * char array to insert to
\r
1560 * end index of the char array, limit <= target.length
\r
1562 * offset which char32 will be inserted in
\r
1564 * codepoint to be inserted
\r
1565 * @return new limit size
\r
1566 * @exception IndexOutOfBoundsException
\r
1567 * thrown if offset16 is invalid.
\r
1570 public static int insert(char target[], int limit, int offset16, int char32) {
\r
1571 String str = valueOf(char32);
\r
1572 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
\r
1575 int size = str.length();
\r
1576 if (limit + size > target.length) {
\r
1577 throw new ArrayIndexOutOfBoundsException(offset16 + size);
\r
1579 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
\r
1580 target[offset16] = str.charAt(0);
\r
1582 target[offset16 + 1] = str.charAt(1);
\r
1584 return limit + size;
\r
1588 * Removes the codepoint at the specified position in this target (shortening target by 1
\r
1589 * character if the codepoint is a non-supplementary, 2 otherwise).
\r
1592 * string buffer to remove codepoint from
\r
1594 * offset which the codepoint will be removed
\r
1595 * @return a reference to target
\r
1596 * @exception IndexOutOfBoundsException
\r
1597 * thrown if offset16 is invalid.
\r
1600 public static StringBuffer delete(StringBuffer target, int offset16) {
\r
1602 switch (bounds(target, offset16)) {
\r
1603 case LEAD_SURROGATE_BOUNDARY:
\r
1606 case TRAIL_SURROGATE_BOUNDARY:
\r
1611 target.delete(offset16, offset16 + count);
\r
1616 * Removes the codepoint at the specified position in this target (shortening target by 1
\r
1617 * character if the codepoint is a non-supplementary, 2 otherwise).
\r
1620 * char array to remove codepoint from
\r
1622 * end index of the char array, limit <= target.length
\r
1624 * offset which the codepoint will be removed
\r
1625 * @return a new limit size
\r
1626 * @exception IndexOutOfBoundsException
\r
1627 * thrown if offset16 is invalid.
\r
1630 public static int delete(char target[], int limit, int offset16) {
\r
1632 switch (bounds(target, 0, limit, offset16)) {
\r
1633 case LEAD_SURROGATE_BOUNDARY:
\r
1636 case TRAIL_SURROGATE_BOUNDARY:
\r
1641 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
\r
1642 target[limit - count] = 0;
\r
1643 return limit - count;
\r
1647 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1648 * the argument codepoint. I.e., the smallest index <code>i</code> such that
\r
1649 * <code>UTF16.charAt(source, i) ==
\r
1650 * char32</code> is true.
\r
1652 * If no such character occurs in this string, then -1 is returned.
\r
1656 * UTF16.indexOf("abc", 'a') returns 0<br>
\r
1657 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
\r
1658 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
\r
1660 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1661 * characters to its fullest.
\r
1664 * UTF16 format Unicode string that will be searched
\r
1666 * codepoint to search for
\r
1667 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
\r
1668 * -1 if the codepoint does not occur.
\r
1671 public static int indexOf(String source, int char32) {
\r
1672 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1673 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1675 // non-surrogate bmp
\r
1676 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1677 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1678 return source.indexOf((char) char32);
\r
1681 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1682 int result = source.indexOf((char) char32);
\r
1683 if (result >= 0) {
\r
1684 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1685 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1686 return indexOf(source, char32, result + 1);
\r
1688 // trail surrogate
\r
1689 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1690 return indexOf(source, char32, result + 1);
\r
1696 String char32str = toString(char32);
\r
1697 return source.indexOf(char32str);
\r
1701 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1702 * the argument string str. This method is implemented based on codepoints, hence a "lead
\r
1703 * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
\r
1704 * starts with trail surrogate character at index 0, a source with a lead surrogate
\r
1705 * character before str found in source will not have a valid match. Vice versa for lead
\r
1706 * surrogates that end str. See example below.
\r
1708 * If no such string str occurs in this source, then -1 is returned.
\r
1712 * UTF16.indexOf("abc", "ab") returns 0<br>
\r
1713 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
\r
1714 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
\r
1716 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1717 * characters to its fullest.
\r
1720 * UTF16 format Unicode string that will be searched
\r
1722 * UTF16 format Unicode string to search for
\r
1723 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
\r
1724 * -1 if the codepoint does not occur.
\r
1727 public static int indexOf(String source, String str) {
\r
1728 int strLength = str.length();
\r
1729 // non-surrogate ends
\r
1730 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1731 return source.indexOf(str);
\r
1734 int result = source.indexOf(str);
\r
1735 int resultEnd = result + strLength;
\r
1736 if (result >= 0) {
\r
1737 // check last character
\r
1738 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1739 && isTrailSurrogate(source.charAt(resultEnd + 1))) {
\r
1740 return indexOf(source, str, resultEnd + 1);
\r
1742 // check first character which is a trail surrogate
\r
1743 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1744 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1745 return indexOf(source, str, resultEnd + 1);
\r
1752 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1753 * the argument codepoint. I.e., the smallest index i such that: <br>
\r
1754 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
\r
1756 * If no such character occurs in this string, then -1 is returned.
\r
1760 * UTF16.indexOf("abc", 'a', 1) returns -1<br>
\r
1761 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
\r
1762 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
\r
1764 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1765 * characters to its fullest.
\r
1768 * UTF16 format Unicode string that will be searched
\r
1770 * codepoint to search for
\r
1771 * @param fromIndex
\r
1772 * the index to start the search from.
\r
1773 * @return the index of the first occurrence of the codepoint in the argument Unicode string at
\r
1774 * or after fromIndex, or -1 if the codepoint does not occur.
\r
1777 public static int indexOf(String source, int char32, int fromIndex) {
\r
1778 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1779 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1781 // non-surrogate bmp
\r
1782 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1783 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1784 return source.indexOf((char) char32, fromIndex);
\r
1787 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1788 int result = source.indexOf((char) char32, fromIndex);
\r
1789 if (result >= 0) {
\r
1790 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1791 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1792 return indexOf(source, char32, result + 1);
\r
1794 // trail surrogate
\r
1795 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1796 return indexOf(source, char32, result + 1);
\r
1802 String char32str = toString(char32);
\r
1803 return source.indexOf(char32str, fromIndex);
\r
1807 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1808 * the argument string str. This method is implemented based on codepoints, hence a "lead
\r
1809 * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
\r
1810 * starts with trail surrogate character at index 0, a source with a lead surrogate
\r
1811 * character before str found in source will not have a valid match. Vice versa for lead
\r
1812 * surrogates that end str. See example below.
\r
1814 * If no such string str occurs in this source, then -1 is returned.
\r
1818 * UTF16.indexOf("abc", "ab", 0) returns 0<br>
\r
1819 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
\r
1820 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
\r
1821 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
\r
1823 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1824 * characters to its fullest.
\r
1827 * UTF16 format Unicode string that will be searched
\r
1829 * UTF16 format Unicode string to search for
\r
1830 * @param fromIndex
\r
1831 * the index to start the search from.
\r
1832 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
\r
1833 * -1 if the codepoint does not occur.
\r
1836 public static int indexOf(String source, String str, int fromIndex) {
\r
1837 int strLength = str.length();
\r
1838 // non-surrogate ends
\r
1839 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1840 return source.indexOf(str, fromIndex);
\r
1843 int result = source.indexOf(str, fromIndex);
\r
1844 int resultEnd = result + strLength;
\r
1845 if (result >= 0) {
\r
1846 // check last character
\r
1847 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1848 && isTrailSurrogate(source.charAt(resultEnd))) {
\r
1849 return indexOf(source, str, resultEnd + 1);
\r
1851 // check first character which is a trail surrogate
\r
1852 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1853 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1854 return indexOf(source, str, resultEnd + 1);
\r
1861 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1862 * the argument codepoint. I.e., the index returned is the largest value i such that:
\r
1863 * UTF16.charAt(source, i) == char32 is true.
\r
1866 * UTF16.lastIndexOf("abc", 'a') returns 0<br>
\r
1867 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
\r
1868 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
\r
1871 * source is searched backwards starting at the last character.
\r
1873 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1874 * characters to its fullest.
\r
1877 * UTF16 format Unicode string that will be searched
\r
1879 * codepoint to search for
\r
1880 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
1884 public static int lastIndexOf(String source, int char32) {
\r
1885 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1886 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1888 // non-surrogate bmp
\r
1889 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1890 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1891 return source.lastIndexOf((char) char32);
\r
1894 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1895 int result = source.lastIndexOf((char) char32);
\r
1896 if (result >= 0) {
\r
1897 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1898 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1899 return lastIndexOf(source, char32, result - 1);
\r
1901 // trail surrogate
\r
1902 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1903 return lastIndexOf(source, char32, result - 1);
\r
1909 String char32str = toString(char32);
\r
1910 return source.lastIndexOf(char32str);
\r
1914 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1915 * the argument string str. This method is implemented based on codepoints, hence a "lead
\r
1916 * surrogate character + trail surrogate character" is treated as one entity. Hence if the str
\r
1917 * starts with trail surrogate character at index 0, a source with a lead surrogate
\r
1918 * character before str found in source will not have a valid match. Vice versa for lead
\r
1919 * surrogates that end str. See example below.
\r
1922 * UTF16.lastIndexOf("abc", "a") returns 0<br>
\r
1923 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
\r
1924 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
\r
1927 * source is searched backwards starting at the last character.
\r
1929 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1930 * characters to its fullest.
\r
1933 * UTF16 format Unicode string that will be searched
\r
1935 * UTF16 format Unicode string to search for
\r
1936 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
1940 public static int lastIndexOf(String source, String str) {
\r
1941 int strLength = str.length();
\r
1942 // non-surrogate ends
\r
1943 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1944 return source.lastIndexOf(str);
\r
1947 int result = source.lastIndexOf(str);
\r
1948 if (result >= 0) {
\r
1949 // check last character
\r
1950 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1951 && isTrailSurrogate(source.charAt(result + strLength + 1))) {
\r
1952 return lastIndexOf(source, str, result - 1);
\r
1954 // check first character which is a trail surrogate
\r
1955 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1956 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1957 return lastIndexOf(source, str, result - 1);
\r
1965 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1966 * the argument codepoint, where the result is less than or equals to fromIndex.
\r
1969 * This method is implemented based on codepoints, hence a single surrogate character will not
\r
1970 * match a supplementary character.
\r
1973 * source is searched backwards starting at the specified index.
\r
1977 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
\r
1978 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
\r
1979 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
\r
1980 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
\r
1981 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800, 4) returns -1<br>
\r
1983 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1984 * characters to its fullest.
\r
1987 * UTF16 format Unicode string that will be searched
\r
1989 * codepoint to search for
\r
1990 * @param fromIndex
\r
1991 * the index to start the search from. There is no restriction on the value of
\r
1992 * fromIndex. If it is greater than or equal to the length of this string, it has the
\r
1993 * same effect as if it were equal to one less than the length of this string: this
\r
1994 * entire string may be searched. If it is negative, it has the same effect as if it
\r
1995 * were -1: -1 is returned.
\r
1996 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
2000 public static int lastIndexOf(String source, int char32, int fromIndex) {
\r
2001 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
2002 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
2004 // non-surrogate bmp
\r
2005 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
2006 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
2007 return source.lastIndexOf((char) char32, fromIndex);
\r
2010 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
2011 int result = source.lastIndexOf((char) char32, fromIndex);
\r
2012 if (result >= 0) {
\r
2013 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
2014 && isTrailSurrogate(source.charAt(result + 1))) {
\r
2015 return lastIndexOf(source, char32, result - 1);
\r
2017 // trail surrogate
\r
2018 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
2019 return lastIndexOf(source, char32, result - 1);
\r
2025 String char32str = toString(char32);
\r
2026 return source.lastIndexOf(char32str, fromIndex);
\r
2031 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
2032 * the argument string str, where the result is less than or equals to fromIndex.
\r
2035 * This method is implemented based on codepoints, hence a "lead surrogate character + trail
\r
2036 * surrogate character" is treated as one entity. Hence if the str starts with trail surrogate
\r
2037 * character at index 0, a source with a lead surrogate character before str found in
\r
2038 * source will not have a valid match. Vice versa for lead surrogates that end str.
\r
2040 * See example below.
\r
2043 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
\r
2044 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
\r
2045 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
\r
2046 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
\r
2047 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
\r
2050 * source is searched backwards starting at the last character.
\r
2052 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
2053 * characters to its fullest.
\r
2056 * UTF16 format Unicode string that will be searched
\r
2058 * UTF16 format Unicode string to search for
\r
2059 * @param fromIndex
\r
2060 * the index to start the search from. There is no restriction on the value of
\r
2061 * fromIndex. If it is greater than or equal to the length of this string, it has the
\r
2062 * same effect as if it were equal to one less than the length of this string: this
\r
2063 * entire string may be searched. If it is negative, it has the same effect as if it
\r
2064 * were -1: -1 is returned.
\r
2065 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
2069 public static int lastIndexOf(String source, String str, int fromIndex) {
\r
2070 int strLength = str.length();
\r
2071 // non-surrogate ends
\r
2072 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
2073 return source.lastIndexOf(str, fromIndex);
\r
2076 int result = source.lastIndexOf(str, fromIndex);
\r
2077 if (result >= 0) {
\r
2078 // check last character
\r
2079 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
2080 && isTrailSurrogate(source.charAt(result + strLength))) {
\r
2081 return lastIndexOf(source, str, result - 1);
\r
2083 // check first character which is a trail surrogate
\r
2084 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
2085 && isLeadSurrogate(source.charAt(result - 1))) {
\r
2086 return lastIndexOf(source, str, result - 1);
\r
2093 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
\r
2094 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
\r
2095 * format Unicode string source, then source will be returned. Otherwise, a new String object is
\r
2096 * created that represents a codepoint sequence identical to the codepoint sequence represented
\r
2097 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
\r
2101 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
\r
2102 * returns "mosquito in your collar"<br>
\r
2103 * UTF16.replace("JonL", 'q', 'x');<br>
\r
2104 * returns "JonL" (no change)<br>
\r
2105 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
\r
2106 * returns "Supplementary character !"<br>
\r
2107 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
\r
2108 * returns "Supplementary character \ud800\udc00"<br>
\r
2110 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
2111 * characters to its fullest.
\r
2114 * UTF16 format Unicode string which the codepoint replacements will be based on.
\r
2115 * @param oldChar32
\r
2116 * non-zero old codepoint to be replaced.
\r
2117 * @param newChar32
\r
2118 * the new codepoint to replace oldChar32
\r
2119 * @return new String derived from source by replacing every occurrence of oldChar32 with
\r
2120 * newChar32, unless when no oldChar32 is found in source then source will be returned.
\r
2123 public static String replace(String source, int oldChar32, int newChar32) {
\r
2124 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
\r
2125 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
\r
2127 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
\r
2128 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
\r
2131 int index = indexOf(source, oldChar32);
\r
2132 if (index == -1) {
\r
2135 String newChar32Str = toString(newChar32);
\r
2136 int oldChar32Size = 1;
\r
2137 int newChar32Size = newChar32Str.length();
\r
2138 StringBuffer result = new StringBuffer(source);
\r
2139 int resultIndex = index;
\r
2141 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
2142 oldChar32Size = 2;
\r
2145 while (index != -1) {
\r
2146 int endResultIndex = resultIndex + oldChar32Size;
\r
2147 result.replace(resultIndex, endResultIndex, newChar32Str);
\r
2148 int lastEndIndex = index + oldChar32Size;
\r
2149 index = indexOf(source, oldChar32, lastEndIndex);
\r
2150 resultIndex += newChar32Size + index - lastEndIndex;
\r
2152 return result.toString();
\r
2156 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
\r
2157 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
\r
2158 * source, then source will be returned. Otherwise, a new String object is created that
\r
2159 * represents a codepoint sequence identical to the codepoint sequence represented by source,
\r
2160 * except that every occurrence of oldStr is replaced by an occurrence of newStr.
\r
2163 * UTF16.replace("mesquite in your cellar", "e", "o");<br>
\r
2164 * returns "mosquito in your collar"<br>
\r
2165 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
\r
2166 * returns "cat in your cellar"<br>
\r
2167 * UTF16.replace("JonL", "q", "x");<br>
\r
2168 * returns "JonL" (no change)<br>
\r
2169 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", '!'); <br>
\r
2170 * returns "Supplementary character !"<br>
\r
2171 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", '!'); <br>
\r
2172 * returns "Supplementary character \ud800\udc00"<br>
\r
2174 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
2175 * characters to its fullest.
\r
2178 * UTF16 format Unicode string which the replacements will be based on.
\r
2180 * non-zero-length string to be replaced.
\r
2182 * the new string to replace oldStr
\r
2183 * @return new String derived from source by replacing every occurrence of oldStr with newStr.
\r
2184 * When no oldStr is found in source, then source will be returned.
\r
2187 public static String replace(String source, String oldStr, String newStr) {
\r
2188 int index = indexOf(source, oldStr);
\r
2189 if (index == -1) {
\r
2192 int oldStrSize = oldStr.length();
\r
2193 int newStrSize = newStr.length();
\r
2194 StringBuffer result = new StringBuffer(source);
\r
2195 int resultIndex = index;
\r
2197 while (index != -1) {
\r
2198 int endResultIndex = resultIndex + oldStrSize;
\r
2199 result.replace(resultIndex, endResultIndex, newStr);
\r
2200 int lastEndIndex = index + oldStrSize;
\r
2201 index = indexOf(source, oldStr, lastEndIndex);
\r
2202 resultIndex += newStrSize + index - lastEndIndex;
\r
2204 return result.toString();
\r
2208 * Reverses a UTF16 format Unicode string and replaces source's content with it. This method
\r
2209 * will reverse surrogate characters correctly, instead of blindly reversing every character.
\r
2212 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
\r
2213 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
\r
2216 * the source StringBuffer that contains UTF16 format Unicode string to be reversed
\r
2217 * @return a modified source with reversed UTF16 format Unicode string.
\r
2220 public static StringBuffer reverse(StringBuffer source) {
\r
2221 int length = source.length();
\r
2222 StringBuffer result = new StringBuffer(length);
\r
2223 for (int i = length; i-- > 0;) {
\r
2224 char ch = source.charAt(i);
\r
2225 if (isTrailSurrogate(ch) && i > 0) {
\r
2226 char ch2 = source.charAt(i - 1);
\r
2227 if (isLeadSurrogate(ch2)) {
\r
2228 result.append(ch2);
\r
2229 result.append(ch);
\r
2234 result.append(ch);
\r
2240 * Check if the string contains more Unicode code points than a certain number. This is more
\r
2241 * efficient than counting all code points in the entire string and comparing that number with a
\r
2242 * threshold. This function may not need to scan the string at all if the length is within a
\r
2243 * certain range, and never needs to count more than 'number + 1' code points. Logically
\r
2244 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
\r
2248 * The input string.
\r
2250 * The number of code points in the string is compared against the 'number'
\r
2252 * @return boolean value for whether the string contains more Unicode code points than 'number'.
\r
2255 public static boolean hasMoreCodePointsThan(String source, int number) {
\r
2259 if (source == null) {
\r
2262 int length = source.length();
\r
2264 // length >= 0 known
\r
2265 // source contains at least (length + 1) / 2 code points: <= 2
\r
2267 if (((length + 1) >> 1) > number) {
\r
2271 // check if source does not even contain enough chars
\r
2272 int maxsupplementary = length - number;
\r
2273 if (maxsupplementary <= 0) {
\r
2277 // there are maxsupplementary = length - number more chars than
\r
2278 // asked-for code points
\r
2280 // count code points until they exceed and also check that there are
\r
2281 // no more than maxsupplementary supplementary code points (char pairs)
\r
2284 if (length == 0) {
\r
2287 if (number == 0) {
\r
2290 if (isLeadSurrogate(source.charAt(start++)) && start != length
\r
2291 && isTrailSurrogate(source.charAt(start))) {
\r
2293 if (--maxsupplementary <= 0) {
\r
2294 // too many pairs - too few code points
\r
2303 * Check if the sub-range of char array, from argument start to limit, contains more Unicode
\r
2304 * code points than a certain number. This is more efficient than counting all code points in
\r
2305 * the entire char array range and comparing that number with a threshold. This function may not
\r
2306 * need to scan the char array at all if start and limit is within a certain range, and never
\r
2307 * needs to count more than 'number + 1' code points. Logically equivalent to
\r
2308 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
\r
2309 * or two code units.
\r
2312 * array of UTF-16 chars
\r
2314 * offset to substring in the source array for analyzing
\r
2316 * offset to substring in the source array for analyzing
\r
2318 * The number of code points in the string is compared against the 'number'
\r
2320 * @return boolean value for whether the string contains more Unicode code points than 'number'.
\r
2321 * @exception IndexOutOfBoundsException
\r
2322 * thrown when limit < start
\r
2325 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
\r
2326 int length = limit - start;
\r
2327 if (length < 0 || start < 0 || limit < 0) {
\r
2328 throw new IndexOutOfBoundsException(
\r
2329 "Start and limit indexes should be non-negative and start <= limit");
\r
2334 if (source == null) {
\r
2338 // length >= 0 known
\r
2339 // source contains at least (length + 1) / 2 code points: <= 2
\r
2341 if (((length + 1) >> 1) > number) {
\r
2345 // check if source does not even contain enough chars
\r
2346 int maxsupplementary = length - number;
\r
2347 if (maxsupplementary <= 0) {
\r
2351 // there are maxsupplementary = length - number more chars than
\r
2352 // asked-for code points
\r
2354 // count code points until they exceed and also check that there are
\r
2355 // no more than maxsupplementary supplementary code points (char pairs)
\r
2357 if (length == 0) {
\r
2360 if (number == 0) {
\r
2363 if (isLeadSurrogate(source[start++]) && start != limit
\r
2364 && isTrailSurrogate(source[start])) {
\r
2366 if (--maxsupplementary <= 0) {
\r
2367 // too many pairs - too few code points
\r
2376 * Check if the string buffer contains more Unicode code points than a certain number. This is
\r
2377 * more efficient than counting all code points in the entire string buffer and comparing that
\r
2378 * number with a threshold. This function may not need to scan the string buffer at all if the
\r
2379 * length is within a certain range, and never needs to count more than 'number + 1' code
\r
2380 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
\r
2381 * either one or two code units.
\r
2384 * The input string buffer.
\r
2386 * The number of code points in the string buffer is compared against the 'number'
\r
2388 * @return boolean value for whether the string buffer contains more Unicode code points than
\r
2392 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
\r
2396 if (source == null) {
\r
2399 int length = source.length();
\r
2401 // length >= 0 known
\r
2402 // source contains at least (length + 1) / 2 code points: <= 2
\r
2404 if (((length + 1) >> 1) > number) {
\r
2408 // check if source does not even contain enough chars
\r
2409 int maxsupplementary = length - number;
\r
2410 if (maxsupplementary <= 0) {
\r
2414 // there are maxsupplementary = length - number more chars than
\r
2415 // asked-for code points
\r
2417 // count code points until they exceed and also check that there are
\r
2418 // no more than maxsupplementary supplementary code points (char pairs)
\r
2421 if (length == 0) {
\r
2424 if (number == 0) {
\r
2427 if (isLeadSurrogate(source.charAt(start++)) && start != length
\r
2428 && isTrailSurrogate(source.charAt(start))) {
\r
2430 if (--maxsupplementary <= 0) {
\r
2431 // too many pairs - too few code points
\r
2440 * Cover JDK 1.5 API. Create a String from an array of codePoints.
\r
2442 * @param codePoints
\r
2445 * the start of the text in the code point array
\r
2447 * the number of code points
\r
2448 * @return a String representing the code points between offset and count
\r
2449 * @throws IllegalArgumentException
\r
2450 * if an invalid code point is encountered
\r
2451 * @throws IndexOutOfBoundsException
\r
2452 * if the offset or count are out of bounds.
\r
2455 public static String newString(int[] codePoints, int offset, int count) {
\r
2457 throw new IllegalArgumentException();
\r
2459 char[] chars = new char[count];
\r
2461 for (int r = offset, e = offset + count; r < e; ++r) {
\r
2462 int cp = codePoints[r];
\r
2463 if (cp < 0 || cp > 0x10ffff) {
\r
2464 throw new IllegalArgumentException();
\r
2468 if (cp < 0x010000) {
\r
2469 chars[w] = (char) cp;
\r
2472 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
\r
2473 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
\r
2477 } catch (IndexOutOfBoundsException ex) {
\r
2478 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
\r
2479 / (r - offset + 1)));
\r
2480 char[] temp = new char[newlen];
\r
2481 System.arraycopy(chars, 0, temp, 0, w);
\r
2486 return new String(chars, 0, w);
\r
2491 * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
\r
2495 * <li> Code point comparison or code unit comparison
\r
2496 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
\r
2497 * with special handling for character 'i'.
\r
2500 * The code unit or code point comparison differ only when comparing supplementary code points
\r
2501 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e.,
\r
2502 * \ue000..\uffff). In code unit comparison, high BMP code points sort after
\r
2503 * supplementary code points because they are stored as pairs of surrogates which are at
\r
2504 * \ud800..\udfff.
\r
2507 * @see #FOLD_CASE_DEFAULT
\r
2508 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2511 public static final class StringComparator implements java.util.Comparator {
\r
2512 // public constructor ------------------------------------------------
\r
2515 * Default constructor that does code unit comparison and case sensitive comparison.
\r
2519 public StringComparator() {
\r
2520 this(false, false, FOLD_CASE_DEFAULT);
\r
2524 * Constructor that does comparison based on the argument options.
\r
2526 * @param codepointcompare
\r
2527 * flag to indicate true for code point comparison or false for code unit
\r
2529 * @param ignorecase
\r
2530 * false for case sensitive comparison, true for case-insensitive comparison
\r
2531 * @param foldcaseoption
\r
2532 * FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
\r
2533 * when ignorecase is set to true. If ignorecase is false, this option is
\r
2535 * @see #FOLD_CASE_DEFAULT
\r
2536 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2537 * @throws IllegalArgumentException
\r
2538 * if foldcaseoption is out of range
\r
2541 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
\r
2542 setCodePointCompare(codepointcompare);
\r
2543 m_ignoreCase_ = ignorecase;
\r
2544 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
\r
2545 throw new IllegalArgumentException("Invalid fold case option");
\r
2547 m_foldCase_ = foldcaseoption;
\r
2550 // public data member ------------------------------------------------
\r
2554 * Option value for case folding comparison:
\r
2557 * Comparison is case insensitive, strings are folded using default mappings defined in
\r
2558 * Unicode data file CaseFolding.txt, before comparison.
\r
2563 public static final int FOLD_CASE_DEFAULT = 0;
\r
2567 * Option value for case folding comparison:
\r
2570 * Comparison is case insensitive, strings are folded using modified mappings defined in
\r
2571 * Unicode data file CaseFolding.txt, before comparison.
\r
2574 * The modified set of mappings is provided in a Unicode data file CaseFolding.txt to handle
\r
2575 * dotted I and dotless i appropriately for Turkic languages (tr, az).
\r
2578 * Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that are to be
\r
2579 * included for default mappings and excluded for the Turkic-specific mappings.
\r
2582 * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that are to be
\r
2583 * excluded for default mappings and included for the Turkic-specific mappings.
\r
2588 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
\r
2590 // public methods ----------------------------------------------------
\r
2592 // public setters ----------------------------------------------------
\r
2595 * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
\r
2596 * is set to code unit compare
\r
2599 * true for code point compare, false for code unit compare
\r
2602 public void setCodePointCompare(boolean flag) {
\r
2604 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
\r
2606 m_codePointCompare_ = 0;
\r
2611 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
\r
2612 * case sensitive comparison mode if set to false.
\r
2614 * @param ignorecase
\r
2615 * true for case-insitive comparison, false for case sensitive comparison
\r
2616 * @param foldcaseoption
\r
2617 * FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
\r
2618 * when ignorecase is set to true. If ignorecase is false, this option is
\r
2620 * @see #FOLD_CASE_DEFAULT
\r
2621 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2624 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
\r
2625 m_ignoreCase_ = ignorecase;
\r
2626 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
\r
2627 throw new IllegalArgumentException("Invalid fold case option");
\r
2629 m_foldCase_ = foldcaseoption;
\r
2632 // public getters ----------------------------------------------------
\r
2635 * Checks if the comparison mode is code point compare.
\r
2637 * @return true for code point compare, false for code unit compare
\r
2640 public boolean getCodePointCompare() {
\r
2641 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
\r
2645 * Checks if Comparator is in the case insensitive mode.
\r
2647 * @return true if Comparator performs case insensitive comparison, false otherwise
\r
2650 public boolean getIgnoreCase() {
\r
2651 return m_ignoreCase_;
\r
2655 * Gets the fold case options set in Comparator to be used with case insensitive comparison.
\r
2657 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2658 * @see #FOLD_CASE_DEFAULT
\r
2659 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2662 public int getIgnoreCaseOption() {
\r
2663 return m_foldCase_;
\r
2666 // public other methods ----------------------------------------------
\r
2669 * Compare two strings depending on the options selected during construction.
\r
2672 * first source string.
\r
2674 * second source string.
\r
2675 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b,
\r
2676 * a positive value is returned.
\r
2677 * @exception ClassCastException
\r
2678 * thrown when either a or b is not a String object
\r
2681 public int compare(Object a, Object b) {
\r
2682 String str1 = (String) a;
\r
2683 String str2 = (String) b;
\r
2685 if (str1 == str2) {
\r
2688 if (str1 == null) {
\r
2691 if (str2 == null) {
\r
2695 if (m_ignoreCase_) {
\r
2696 return compareCaseInsensitive(str1, str2);
\r
2698 return compareCaseSensitive(str1, str2);
\r
2701 // private data member ----------------------------------------------
\r
2704 * Code unit comparison flag. True if code unit comparison is required. False if code point
\r
2705 * comparison is required.
\r
2707 private int m_codePointCompare_;
\r
2710 * Fold case comparison option.
\r
2712 private int m_foldCase_;
\r
2715 * Flag indicator if ignore case is to be used during comparison
\r
2717 private boolean m_ignoreCase_;
\r
2720 * Code point order offset for surrogate characters
\r
2722 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
\r
2724 // private method ---------------------------------------------------
\r
2727 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
\r
2731 * first string to compare
\r
2733 * second string to compare
\r
2734 * @return -1 is s1 < s2, 0 if equals,
\r
2736 private int compareCaseInsensitive(String s1, String s2) {
\r
2737 return NormalizerImpl.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
\r
2738 | Normalizer.COMPARE_IGNORE_CASE);
\r
2742 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
\r
2746 * first string to compare
\r
2748 * second string to compare
\r
2749 * @return -1 is s1 < s2, 0 if equals,
\r
2751 private int compareCaseSensitive(String s1, String s2) {
\r
2752 // compare identical prefixes - they do not need to be fixed up
\r
2753 // limit1 = start1 + min(lenght1, length2)
\r
2754 int length1 = s1.length();
\r
2755 int length2 = s2.length();
\r
2756 int minlength = length1;
\r
2758 if (length1 < length2) {
\r
2760 } else if (length1 > length2) {
\r
2762 minlength = length2;
\r
2768 for (; index < minlength; index++) {
\r
2769 c1 = s1.charAt(index);
\r
2770 c2 = s2.charAt(index);
\r
2771 // check pseudo-limit
\r
2777 if (index == minlength) {
\r
2781 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
\r
2782 // if both values are in or above the surrogate range, fix them up
\r
2783 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
\r
2784 && codepointcompare) {
\r
2785 // subtract 0x2800 from BMP code points to make them smaller
\r
2786 // than supplementary ones
\r
2787 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
\r
2788 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
\r
2789 // part of a surrogate pair, leave >=d800
\r
2791 // BMP code point - may be surrogate code point - make
\r
2793 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
\r
2796 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
\r
2797 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
\r
2798 // part of a surrogate pair, leave >=d800
\r
2800 // BMP code point - may be surrogate code point - make <d800
\r
2801 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
\r
2805 // now c1 and c2 are in UTF-32-compatible order
\r
2810 // private data members -------------------------------------------------
\r
2813 * Shift value for lead surrogate to form a supplementary character.
\r
2815 private static final int LEAD_SURROGATE_SHIFT_ = 10;
\r
2818 * Mask to retrieve the significant value from a trail surrogate.
\r
2820 private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
\r
2823 * Value that all lead surrogate starts with
\r
2825 private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
\r
2826 - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
\r
2828 // private methods ------------------------------------------------------
\r
2832 * Converts argument code point and returns a String object representing the code point's value
\r
2833 * in UTF16 format.
\r
2836 * This method does not check for the validity of the codepoint, the results are not guaranteed
\r
2837 * if a invalid codepoint is passed as argument.
\r
2840 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
\r
2845 * @return string representation of the code point
\r
2847 private static String toString(int ch) {
\r
2848 if (ch < SUPPLEMENTARY_MIN_VALUE) {
\r
2849 return String.valueOf((char) ch);
\r
2852 StringBuffer result = new StringBuffer();
\r
2853 result.append(getLeadSurrogate(ch));
\r
2854 result.append(getTrailSurrogate(ch));
\r
2855 return result.toString();
\r