2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import com.ibm.icu.impl.UCharacterProperty;
\r
14 * Standalone utility class providing UTF16 character conversions and indexing conversions.
\r
17 * Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap,
\r
18 * so searching for strings is a safe operation. Similarly, concatenation is always safe.
\r
19 * Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the
\r
20 * values for start and end are on those boundaries, since they arose from operations like
\r
21 * searching. If not, the nearest UTF-32 boundaries can be determined using <code>bounds()</code>.
\r
23 * <strong>Examples:</strong>
\r
25 * The following examples illustrate use of some of these methods.
\r
28 * // iteration forwards: Original
\r
29 * for (int i = 0; i < s.length(); ++i) {
\r
30 * char ch = s.charAt(i);
\r
31 * doSomethingWith(ch);
\r
34 * // iteration forwards: Changes for UTF-32
\r
36 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
\r
37 * ch = UTF16.charAt(s, i);
\r
38 * doSomethingWith(ch);
\r
41 * // iteration backwards: Original
\r
42 * for (int i = s.length() - 1; i >= 0; --i) {
\r
43 * char ch = s.charAt(i);
\r
44 * doSomethingWith(ch);
\r
47 * // iteration backwards: Changes for UTF-32
\r
49 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
\r
50 * ch = UTF16.charAt(s, i);
\r
51 * doSomethingWith(ch);
\r
55 * <strong>Notes:</strong>
\r
57 * <li> <strong>Naming:</strong> For clarity, High and Low surrogates are called <code>Lead</code>
\r
58 * and <code>Trail</code> in the API, which gives a better sense of their ordering in a string.
\r
59 * <code>offset16</code> and <code>offset32</code> are used to distinguish offsets to UTF-16
\r
60 * boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is used to contain UTF-32
\r
61 * characters, as opposed to <code>char16</code>, which is a UTF-16 code unit. </li>
\r
62 * <li> <strong>Roundtripping Offsets:</strong> You can always roundtrip from a UTF-32 offset to a
\r
63 * UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16
\r
64 * offset to a UTF-32 offset and back if and only if <code>bounds(string, offset16) != TRAIL_SURROGATE_BOUNDARY</code>.
\r
66 * <li> <strong>Exceptions:</strong> The error checking will throw an exception if indices are out
\r
67 * of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates
\r
68 * or out-of-bounds UTF-32 values are present. <code>UCharacter.isLegal()</code> can be used to
\r
69 * check for validity if desired. </li>
\r
70 * <li> <strong>Unmatched Surrogates:</strong> If the string contains unmatched surrogates, then
\r
71 * these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It
\r
72 * also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4,
\r
74 * <li> <strong>Optimization:</strong> The method implementations may need optimization if the
\r
75 * compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small
\r
76 * percentage of all the text in the world, the singleton case should always be optimized for. </li>
\r
79 * @author Mark Davis, with help from Markus Scherer
\r
83 public final class UTF16 {
\r
84 // public variables ---------------------------------------------------
\r
87 * Value returned in <code><a href="#bounds(java.lang.String, int)">
\r
88 * bounds()</a></code>.
\r
89 * These values are chosen specifically so that it actually represents the position of the
\r
90 * character [offset16 - (value >> 2), offset16 + (value & 3)]
\r
94 public static final int SINGLE_CHAR_BOUNDARY = 1, LEAD_SURROGATE_BOUNDARY = 2,
\r
95 TRAIL_SURROGATE_BOUNDARY = 5;
\r
98 * The lowest Unicode code point value.
\r
102 public static final int CODEPOINT_MIN_VALUE = 0;
\r
105 * The highest Unicode code point value (scalar value) according to the Unicode Standard.
\r
109 public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
\r
112 * The minimum value for Supplementary code points
\r
116 public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
\r
119 * Lead surrogate minimum value
\r
123 public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
\r
126 * Trail surrogate minimum value
\r
130 public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
\r
133 * Lead surrogate maximum value
\r
137 public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
\r
140 * Trail surrogate maximum value
\r
144 public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
\r
147 * Surrogate minimum value
\r
151 public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
\r
154 * Maximum surrogate value
\r
158 public static final int SURROGATE_MAX_VALUE = TRAIL_SURROGATE_MAX_VALUE;
\r
161 * Lead surrogate bitmask
\r
163 private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
\r
166 * Trail surrogate bitmask
\r
168 private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
\r
171 * Surrogate bitmask
\r
173 private static final int SURROGATE_BITMASK = 0xFFFFF800;
\r
176 * Lead surrogate bits
\r
178 private static final int LEAD_SURROGATE_BITS = 0xD800;
\r
181 * Trail surrogate bits
\r
183 private static final int TRAIL_SURROGATE_BITS = 0xDC00;
\r
188 private static final int SURROGATE_BITS = 0xD800;
\r
190 // constructor --------------------------------------------------------
\r
/**
 * Prevent instance from being created.
 */
private UTF16() {
    // static utility class: never instantiated
}
\r
200 // public method ------------------------------------------------------
\r
203 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
204 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
\r
205 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
\r
206 * UCharacter.isLegal()</a></code>
\r
207 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
208 * character will be returned. If a complete supplementary character is not found the incomplete
\r
209 * character will be returned
\r
211 * @param source Array of UTF-16 chars
\r
212 * @param offset16 UTF-16 offset to the start of the character.
\r
213 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
\r
214 * of that codepoint are the same as in <code>bounds32()</code>.
\r
215 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
\r
218 public static int charAt(String source, int offset16) {
\r
219 char single = source.charAt(offset16);
\r
220 if (single < LEAD_SURROGATE_MIN_VALUE) {
\r
223 return _charAt(source, offset16, single);
\r
226 private static int _charAt(String source, int offset16, char single) {
\r
227 if (single > TRAIL_SURROGATE_MAX_VALUE) {
\r
231 // Convert the UTF-16 surrogate pair if necessary.
\r
232 // For simplicity in usage, and because the frequency of pairs is
\r
233 // low, look both directions.
\r
235 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
237 if (source.length() != offset16) {
\r
238 char trail = source.charAt(offset16);
\r
239 if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
\r
240 return UCharacterProperty.getRawSupplementary(single, trail);
\r
245 if (offset16 >= 0) {
\r
246 // single is a trail surrogate so
\r
247 char lead = source.charAt(offset16);
\r
248 if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
\r
249 return UCharacterProperty.getRawSupplementary(lead, single);
\r
253 return single; // return unmatched surrogate
\r
257 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
258 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
\r
259 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">
\r
260 * UCharacter.isLegal()</a></code>
\r
261 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
262 * character will be returned. If a complete supplementary character is not found the incomplete
\r
263 * character will be returned
\r
265 * @param source Array of UTF-16 chars
\r
266 * @param offset16 UTF-16 offset to the start of the character.
\r
267 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
\r
268 * of that codepoint are the same as in <code>bounds32()</code>.
\r
269 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
\r
272 public static int charAt(CharSequence source, int offset16) {
\r
273 char single = source.charAt(offset16);
\r
274 if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
277 return _charAt(source, offset16, single);
\r
280 private static int _charAt(CharSequence source, int offset16, char single) {
\r
281 if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
285 // Convert the UTF-16 surrogate pair if necessary.
\r
286 // For simplicity in usage, and because the frequency of pairs is
\r
287 // low, look both directions.
\r
289 if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
291 if (source.length() != offset16) {
\r
292 char trail = source.charAt(offset16);
\r
293 if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
\r
294 && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
295 return UCharacterProperty.getRawSupplementary(single, trail);
\r
300 if (offset16 >= 0) {
\r
301 // single is a trail surrogate so
\r
302 char lead = source.charAt(offset16);
\r
303 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
\r
304 && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
305 return UCharacterProperty.getRawSupplementary(lead, single);
\r
309 return single; // return unmatched surrogate
\r
313 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
314 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
\r
315 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
\r
317 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
318 * character will be returned. If a complete supplementary character is not found the incomplete
\r
319 * character will be returned
\r
321 * @param source UTF-16 chars string buffer
\r
322 * @param offset16 UTF-16 offset to the start of the character.
\r
323 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
\r
324 * of that codepoint are the same as in <code>bounds32()</code>.
\r
325 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
\r
328 public static int charAt(StringBuffer source, int offset16) {
\r
329 if (offset16 < 0 || offset16 >= source.length()) {
\r
330 throw new StringIndexOutOfBoundsException(offset16);
\r
333 char single = source.charAt(offset16);
\r
334 if (!isSurrogate(single)) {
\r
338 // Convert the UTF-16 surrogate pair if necessary.
\r
339 // For simplicity in usage, and because the frequency of pairs is
\r
340 // low, look both directions.
\r
342 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
344 if (source.length() != offset16) {
\r
345 char trail = source.charAt(offset16);
\r
346 if (isTrailSurrogate(trail))
\r
347 return UCharacterProperty.getRawSupplementary(single, trail);
\r
351 if (offset16 >= 0) {
\r
352 // single is a trail surrogate so
\r
353 char lead = source.charAt(offset16);
\r
354 if (isLeadSurrogate(lead)) {
\r
355 return UCharacterProperty.getRawSupplementary(lead, single);
\r
359 return single; // return unmatched surrogate
\r
363 * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
\r
364 * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
\r
365 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
\r
367 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
368 * character will be returned. If a complete supplementary character is not found the incomplete
\r
369 * character will be returned
\r
371 * @param source Array of UTF-16 chars
\r
372 * @param start Offset to substring in the source array for analyzing
\r
373 * @param limit Offset to substring in the source array for analyzing
\r
374 * @param offset16 UTF-16 offset relative to start
\r
375 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
\r
376 * of that codepoint are the same as in <code>bounds32()</code>.
\r
377 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
\r
380 public static int charAt(char source[], int start, int limit, int offset16) {
\r
382 if (offset16 < start || offset16 >= limit) {
\r
383 throw new ArrayIndexOutOfBoundsException(offset16);
\r
386 char single = source[offset16];
\r
387 if (!isSurrogate(single)) {
\r
391 // Convert the UTF-16 surrogate pair if necessary.
\r
392 // For simplicity in usage, and because the frequency of pairs is
\r
393 // low, look both directions.
\r
394 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
396 if (offset16 >= limit) {
\r
399 char trail = source[offset16];
\r
400 if (isTrailSurrogate(trail)) {
\r
401 return UCharacterProperty.getRawSupplementary(single, trail);
\r
403 } else { // isTrailSurrogate(single), so
\r
404 if (offset16 == start) {
\r
408 char lead = source[offset16];
\r
409 if (isLeadSurrogate(lead))
\r
410 return UCharacterProperty.getRawSupplementary(lead, single);
\r
412 return single; // return unmatched surrogate
\r
416 * Extract a single UTF-32 value from a string. Used when iterating forwards or backwards (with
\r
417 * <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
\r
418 * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
\r
420 * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
\r
421 * character will be returned. If a complete supplementary character is not found the incomplete
\r
422 * character will be returned
\r
424 * @param source UTF-16 chars string buffer
\r
425 * @param offset16 UTF-16 offset to the start of the character.
\r
426 * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
\r
427 * of that codepoint are the same as in <code>bounds32()</code>.
\r
428 * @exception IndexOutOfBoundsException Thrown if offset16 is out of bounds.
\r
431 public static int charAt(Replaceable source, int offset16) {
\r
432 if (offset16 < 0 || offset16 >= source.length()) {
\r
433 throw new StringIndexOutOfBoundsException(offset16);
\r
436 char single = source.charAt(offset16);
\r
437 if (!isSurrogate(single)) {
\r
441 // Convert the UTF-16 surrogate pair if necessary.
\r
442 // For simplicity in usage, and because the frequency of pairs is
\r
443 // low, look both directions.
\r
445 if (single <= LEAD_SURROGATE_MAX_VALUE) {
\r
447 if (source.length() != offset16) {
\r
448 char trail = source.charAt(offset16);
\r
449 if (isTrailSurrogate(trail))
\r
450 return UCharacterProperty.getRawSupplementary(single, trail);
\r
454 if (offset16 >= 0) {
\r
455 // single is a trail surrogate so
\r
456 char lead = source.charAt(offset16);
\r
457 if (isLeadSurrogate(lead)) {
\r
458 return UCharacterProperty.getRawSupplementary(lead, single);
\r
462 return single; // return unmatched surrogate
\r
466 * Determines how many chars this char32 requires. If a validity check is required, use <code>
\r
467 * <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
\r
468 * on char32 before calling.
\r
470 * @param char32 The input codepoint.
\r
471 * @return 2 if is in supplementary space, otherwise 1.
\r
474 public static int getCharCount(int char32) {
\r
475 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
482 * Returns the type of the boundaries around the char at offset16. Used for random access.
\r
484 * @param source Text to analyse
\r
485 * @param offset16 UTF-16 offset
\r
488 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16+1]
\r
489 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
\r
490 * are [offset16, offset16 + 2]
\r
491 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
\r
492 * bounds are [offset16 - 1, offset16 + 1]
\r
494 * For bit-twiddlers, the return values for these are chosen so that the boundaries
\r
495 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
\r
496 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
\r
499 public static int bounds(String source, int offset16) {
\r
500 char ch = source.charAt(offset16);
\r
501 if (isSurrogate(ch)) {
\r
502 if (isLeadSurrogate(ch)) {
\r
503 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
\r
504 return LEAD_SURROGATE_BOUNDARY;
\r
507 // isTrailSurrogate(ch), so
\r
509 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
\r
510 return TRAIL_SURROGATE_BOUNDARY;
\r
514 return SINGLE_CHAR_BOUNDARY;
\r
518 * Returns the type of the boundaries around the char at offset16. Used for random access.
\r
520 * @param source String buffer to analyse
\r
521 * @param offset16 UTF16 offset
\r
524 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are [offset16, offset16 + 1]
\r
525 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
\r
526 * are [offset16, offset16 + 2]
\r
527 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
\r
528 * bounds are [offset16 - 1, offset16 + 1]
\r
530 * For bit-twiddlers, the return values for these are chosen so that the boundaries
\r
531 * can be gotten by: [offset16 - (value >> 2), offset16 + (value & 3)].
\r
532 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
\r
535 public static int bounds(StringBuffer source, int offset16) {
\r
536 char ch = source.charAt(offset16);
\r
537 if (isSurrogate(ch)) {
\r
538 if (isLeadSurrogate(ch)) {
\r
539 if (++offset16 < source.length() && isTrailSurrogate(source.charAt(offset16))) {
\r
540 return LEAD_SURROGATE_BOUNDARY;
\r
543 // isTrailSurrogate(ch), so
\r
545 if (offset16 >= 0 && isLeadSurrogate(source.charAt(offset16))) {
\r
546 return TRAIL_SURROGATE_BOUNDARY;
\r
550 return SINGLE_CHAR_BOUNDARY;
\r
554 * Returns the type of the boundaries around the char at offset16. Used for random access. Note
\r
555 * that the boundaries are determined with respect to the subarray, hence the char array
\r
556 * {0xD800, 0xDC00} has the result SINGLE_CHAR_BOUNDARY for start = offset16 = 0 and limit = 1.
\r
558 * @param source Char array to analyse
\r
559 * @param start Offset to substring in the source array for analyzing
\r
560 * @param limit Offset to substring in the source array for analyzing
\r
561 * @param offset16 UTF16 offset relative to start
\r
564 * <li> SINGLE_CHAR_BOUNDARY : a single char; the bounds are
\r
565 * <li> LEAD_SURROGATE_BOUNDARY : a surrogate pair starting at offset16; the bounds
\r
566 * are [offset16, offset16 + 2]
\r
567 * <li> TRAIL_SURROGATE_BOUNDARY : a surrogate pair starting at offset16 - 1; the
\r
568 * bounds are [offset16 - 1, offset16 + 1]
\r
570 * For bit-twiddlers, the boundary values for these are chosen so that the boundaries
\r
571 * can be gotten by: [offset16 - (boundvalue >> 2), offset16 + (boundvalue & 3)].
\r
572 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
\r
575 public static int bounds(char source[], int start, int limit, int offset16) {
\r
577 if (offset16 < start || offset16 >= limit) {
\r
578 throw new ArrayIndexOutOfBoundsException(offset16);
\r
580 char ch = source[offset16];
\r
581 if (isSurrogate(ch)) {
\r
582 if (isLeadSurrogate(ch)) {
\r
584 if (offset16 < limit && isTrailSurrogate(source[offset16])) {
\r
585 return LEAD_SURROGATE_BOUNDARY;
\r
587 } else { // isTrailSurrogate(ch), so
\r
589 if (offset16 >= start && isLeadSurrogate(source[offset16])) {
\r
590 return TRAIL_SURROGATE_BOUNDARY;
\r
594 return SINGLE_CHAR_BOUNDARY;
\r
598 * Determines whether the code value is a surrogate.
\r
600 * @param char16 The input character.
\r
601 * @return true If the input character is a surrogate.
\r
604 public static boolean isSurrogate(char char16) {
\r
605 return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
\r
609 * Determines whether the character is a trail surrogate.
\r
611 * @param char16 The input character.
\r
612 * @return true If the input character is a trail surrogate.
\r
615 public static boolean isTrailSurrogate(char char16) {
\r
616 return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
\r
620 * Determines whether the character is a lead surrogate.
\r
622 * @param char16 The input character.
\r
623 * @return true If the input character is a lead surrogate
\r
626 public static boolean isLeadSurrogate(char char16) {
\r
627 return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
\r
631 * Returns the lead surrogate. If a validity check is required, use
\r
632 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
\r
635 * @param char32 The input character.
\r
636 * @return lead surrogate if the getCharCount(ch) is 2; <br>
\r
637 * and 0 otherwise (note: 0 is not a valid lead surrogate).
\r
640 public static char getLeadSurrogate(int char32) {
\r
641 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
642 return (char) (LEAD_SURROGATE_OFFSET_ + (char32 >> LEAD_SURROGATE_SHIFT_));
\r
648 * Returns the trail surrogate. If a validity check is required, use
\r
649 * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32
\r
652 * @param char32 The input character.
\r
653 * @return the trail surrogate if the getCharCount(ch) is 2; <br>
\r
654 * otherwise the character itself
\r
657 public static char getTrailSurrogate(int char32) {
\r
658 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
659 return (char) (TRAIL_SURROGATE_MIN_VALUE + (char32 & TRAIL_SURROGATE_MASK_));
\r
661 return (char) char32;
\r
665 * Convenience method corresponding to String.valueOf(char). Returns a one or two char string
\r
666 * containing the UTF-32 value in UTF16 format. If a validity check is required, use <a
\r
667 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on char32 before calling.
\r
669 * @param char32 The input character.
\r
670 * @return string value of char32 in UTF16 format
\r
671 * @exception IllegalArgumentException Thrown if char32 is a invalid codepoint.
\r
674 public static String valueOf(int char32) {
\r
675 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
676 throw new IllegalArgumentException("Illegal codepoint");
\r
678 return toString(char32);
\r
682 * Convenience method corresponding to String.valueOf(codepoint at offset16). Returns a one or
\r
683 * two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a surrogate
\r
684 * character, the whole supplementary codepoint will be returned. If a validity check is
\r
685 * required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the
\r
686 * codepoint at offset16 before calling. The result returned will be a newly created String
\r
687 * obtained by calling source.substring(..) with the appropriate indexes.
\r
689 * @param source The input string.
\r
690 * @param offset16 The UTF16 index to the codepoint in source
\r
691 * @return string value of char32 in UTF16 format
\r
694 public static String valueOf(String source, int offset16) {
\r
695 switch (bounds(source, offset16)) {
\r
696 case LEAD_SURROGATE_BOUNDARY:
\r
697 return source.substring(offset16, offset16 + 2);
\r
698 case TRAIL_SURROGATE_BOUNDARY:
\r
699 return source.substring(offset16 - 1, offset16 + 1);
\r
701 return source.substring(offset16, offset16 + 1);
\r
706 * Convenience method corresponding to StringBuffer.valueOf(codepoint at offset16). Returns a
\r
707 * one or two char string containing the UTF-32 value in UTF16 format. If offset16 indexes a
\r
708 * surrogate character, the whole supplementary codepoint will be returned. If a validity check
\r
709 * is required, use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
\r
710 * the codepoint at offset16 before calling. The result returned will be a newly created String
\r
711 * obtained by calling source.substring(..) with the appropriate indexes.
\r
713 * @param source The input string buffer.
\r
714 * @param offset16 The UTF16 index to the codepoint in source
\r
715 * @return string value of char32 in UTF16 format
\r
718 public static String valueOf(StringBuffer source, int offset16) {
\r
719 switch (bounds(source, offset16)) {
\r
720 case LEAD_SURROGATE_BOUNDARY:
\r
721 return source.substring(offset16, offset16 + 2);
\r
722 case TRAIL_SURROGATE_BOUNDARY:
\r
723 return source.substring(offset16 - 1, offset16 + 1);
\r
725 return source.substring(offset16, offset16 + 1);
\r
730 * Convenience method. Returns a one or two char string containing the UTF-32 value in UTF16
\r
731 * format. If offset16 indexes a surrogate character, the whole supplementary codepoint will be
\r
732 * returned, except when either the leading or trailing surrogate character lies out of the
\r
733 * specified subarray. In the latter case, only the surrogate character within bounds will be
\r
734 * returned. If a validity check is required, use <a
\r
735 * href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on the codepoint at
\r
736 * offset16 before calling. The result returned will be a newly created String containing the
\r
737 * relevant characters.
\r
739 * @param source The input char array.
\r
740 * @param start Start index of the subarray
\r
741 * @param limit End index of the subarray
\r
742 * @param offset16 The UTF16 index to the codepoint in source relative to start
\r
743 * @return string value of char32 in UTF16 format
\r
746 public static String valueOf(char source[], int start, int limit, int offset16) {
\r
747 switch (bounds(source, start, limit, offset16)) {
\r
748 case LEAD_SURROGATE_BOUNDARY:
\r
749 return new String(source, start + offset16, 2);
\r
750 case TRAIL_SURROGATE_BOUNDARY:
\r
751 return new String(source, start + offset16 - 1, 2);
\r
753 return new String(source, start + offset16, 1);
\r
757 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
\r
758 * the <a name="_top_">class description</a> for notes on roundtripping.
\r
760 * @param source The UTF-16 string
\r
761 * @param offset32 UTF-32 offset
\r
762 * @return UTF-16 offset
\r
763 * @exception IndexOutOfBoundsException If offset32 is out of bounds.
\r
766 public static int findOffsetFromCodePoint(String source, int offset32) {
\r
768 int size = source.length(), result = 0, count = offset32;
\r
769 if (offset32 < 0 || offset32 > size) {
\r
770 throw new StringIndexOutOfBoundsException(offset32);
\r
772 while (result < size && count > 0) {
\r
773 ch = source.charAt(result);
\r
774 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
775 && isTrailSurrogate(source.charAt(result + 1))) {
\r
783 throw new StringIndexOutOfBoundsException(offset32);
\r
789 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
\r
790 * the <a name="_top_">class description</a> for notes on roundtripping.
\r
792 * @param source The UTF-16 string buffer
\r
793 * @param offset32 UTF-32 offset
\r
794 * @return UTF-16 offset
\r
795 * @exception IndexOutOfBoundsException If offset32 is out of bounds.
\r
798 public static int findOffsetFromCodePoint(StringBuffer source, int offset32) {
\r
800 int size = source.length(), result = 0, count = offset32;
\r
801 if (offset32 < 0 || offset32 > size) {
\r
802 throw new StringIndexOutOfBoundsException(offset32);
\r
804 while (result < size && count > 0) {
\r
805 ch = source.charAt(result);
\r
806 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
807 && isTrailSurrogate(source.charAt(result + 1))) {
\r
815 throw new StringIndexOutOfBoundsException(offset32);
\r
821 * Returns the UTF-16 offset that corresponds to a UTF-32 offset. Used for random access. See
\r
822 * the <a name="_top_">class description</a> for notes on roundtripping.
\r
824 * @param source The UTF-16 char array whose substring is to be analysed
\r
825 * @param start Offset of the substring to be analysed
\r
826 * @param limit Offset of the substring to be analysed
\r
827 * @param offset32 UTF-32 offset relative to start
\r
828 * @return UTF-16 offset relative to start
\r
829 * @exception IndexOutOfBoundsException If offset32 is out of bounds.
\r
832 public static int findOffsetFromCodePoint(char source[], int start, int limit, int offset32) {
\r
834 int result = start, count = offset32;
\r
835 if (offset32 > limit - start) {
\r
836 throw new ArrayIndexOutOfBoundsException(offset32);
\r
838 while (result < limit && count > 0) {
\r
839 ch = source[result];
\r
840 if (isLeadSurrogate(ch) && ((result + 1) < limit)
\r
841 && isTrailSurrogate(source[result + 1])) {
\r
849 throw new ArrayIndexOutOfBoundsException(offset32);
\r
851 return result - start;
\r
855 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given
\r
856 * UTF-16 offset. Used for random access. See the <a name="_top_">class description</a> for
\r
857 * notes on roundtripping.<br>
\r
858 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
\r
859 * of the <strong>lead</strong> of the pair is returned. </i>
\r
861 * To find the UTF-32 length of a string, use:
\r
864 * len32 = countCodePoint(source, source.length());
\r
870 * @param source Text to analyse
\r
871 * @param offset16 UTF-16 offset < source text length.
\r
872 * @return UTF-32 offset
\r
873 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
\r
876 public static int findCodePointOffset(String source, int offset16) {
\r
877 if (offset16 < 0 || offset16 > source.length()) {
\r
878 throw new StringIndexOutOfBoundsException(offset16);
\r
883 boolean hadLeadSurrogate = false;
\r
885 for (int i = 0; i < offset16; ++i) {
\r
886 ch = source.charAt(i);
\r
887 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
\r
888 hadLeadSurrogate = false; // count valid trail as zero
\r
890 hadLeadSurrogate = isLeadSurrogate(ch);
\r
891 ++result; // count others as 1
\r
895 if (offset16 == source.length()) {
\r
899 // end of source being the less significant surrogate character
\r
900 // shift result back to the start of the supplementary character
\r
901 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
\r
909 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
\r
910 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
\r
911 * roundtripping.<br>
\r
912 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
\r
913 * of the <strong>lead</strong> of the pair is returned. </i>
\r
915 * To find the UTF-32 length of a string, use:
\r
918 * len32 = countCodePoint(source);
\r
924 * @param source Text to analyse
\r
925 * @param offset16 UTF-16 offset < source text length.
\r
926 * @return UTF-32 offset
\r
927 * @exception IndexOutOfBoundsException If offset16 is out of bounds.
\r
930 public static int findCodePointOffset(StringBuffer source, int offset16) {
\r
931 if (offset16 < 0 || offset16 > source.length()) {
\r
932 throw new StringIndexOutOfBoundsException(offset16);
\r
937 boolean hadLeadSurrogate = false;
\r
939 for (int i = 0; i < offset16; ++i) {
\r
940 ch = source.charAt(i);
\r
941 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
\r
942 hadLeadSurrogate = false; // count valid trail as zero
\r
944 hadLeadSurrogate = isLeadSurrogate(ch);
\r
945 ++result; // count others as 1
\r
949 if (offset16 == source.length()) {
\r
953 // end of source being the less significant surrogate character
\r
954 // shift result back to the start of the supplementary character
\r
955 if (hadLeadSurrogate && (isTrailSurrogate(source.charAt(offset16)))) {
\r
963 * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at the given UTF-16
\r
964 * offset. Used for random access. See the <a name="_top_">class description</a> for notes on
\r
965 * roundtripping.<br>
\r
966 * <i>Note: If the UTF-16 offset is into the middle of a surrogate pair, then the UTF-32 offset
\r
967 * of the <strong>lead</strong> of the pair is returned. </i>
\r
969 * To find the UTF-32 length of a substring, use:
\r
972 * len32 = countCodePoint(source, start, limit);
\r
978 * @param source Text to analyse
\r
979 * @param start Offset of the substring
\r
980 * @param limit Offset of the substring
\r
981 * @param offset16 UTF-16 relative to start
\r
982 * @return UTF-32 offset relative to start
\r
983 * @exception IndexOutOfBoundsException If offset16 is not within the range of start and limit.
\r
986 public static int findCodePointOffset(char source[], int start, int limit, int offset16) {
\r
988 if (offset16 > limit) {
\r
989 throw new StringIndexOutOfBoundsException(offset16);
\r
994 boolean hadLeadSurrogate = false;
\r
996 for (int i = start; i < offset16; ++i) {
\r
998 if (hadLeadSurrogate && isTrailSurrogate(ch)) {
\r
999 hadLeadSurrogate = false; // count valid trail as zero
\r
1001 hadLeadSurrogate = isLeadSurrogate(ch);
\r
1002 ++result; // count others as 1
\r
1006 if (offset16 == limit) {
\r
1010 // end of source being the less significant surrogate character
\r
1011 // shift result back to the start of the supplementary character
\r
1012 if (hadLeadSurrogate && (isTrailSurrogate(source[offset16]))) {
\r
1020 * Append a single UTF-32 value to the end of a StringBuffer. If a validity check is required,
\r
1021 * use <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a> on char32 before
\r
1024 * @param target The buffer to append to
\r
1025 * @param char32 Value to append.
\r
1026 * @return the updated StringBuffer
\r
1027 * @exception IllegalArgumentException Thrown when char32 does not lie within the range of the Unicode codepoints
\r
1030 public static StringBuffer append(StringBuffer target, int char32) {
\r
1031 // Check for irregular values
\r
1032 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1033 throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
\r
1036 // Write the UTF-16 values
\r
1037 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
1038 target.append(getLeadSurrogate(char32));
\r
1039 target.append(getTrailSurrogate(char32));
\r
1041 target.append((char) char32);
\r
1047 * Cover JDK 1.5 APIs. Append the code point to the buffer and return the buffer as a
\r
1050 * @param target The buffer to append to
\r
1051 * @param cp The code point to append
\r
1052 * @return the updated StringBuffer
\r
1053 * @throws IllegalArgumentException If cp is not a valid code point
\r
1056 public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
\r
1057 return append(target, cp);
\r
1061 * Adds a codepoint to offset16 position of the argument char array.
\r
1063 * @param target Char array to be appended with the new code point
\r
1064 * @param limit UTF16 offset which the codepoint will be appended.
\r
1065 * @param char32 Code point to be appended
\r
1066 * @return offset after char32 in the array.
\r
1067 * @exception IllegalArgumentException Thrown if there is not enough space for the append, or when char32 does not
\r
1068 * lie within the range of the Unicode codepoints.
\r
1071 public static int append(char[] target, int limit, int char32) {
\r
1072 // Check for irregular values
\r
1073 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1074 throw new IllegalArgumentException("Illegal codepoint");
\r
1076 // Write the UTF-16 values
\r
1077 if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
1078 target[limit++] = getLeadSurrogate(char32);
\r
1079 target[limit++] = getTrailSurrogate(char32);
\r
1081 target[limit++] = (char) char32;
\r
1087 * Number of codepoints in a UTF16 String
\r
1089 * @param source UTF16 string
\r
1090 * @return number of codepoint in string
\r
1093 public static int countCodePoint(String source) {
\r
1094 if (source == null || source.length() == 0) {
\r
1097 return findCodePointOffset(source, source.length());
\r
1101 * Number of codepoints in a UTF16 String buffer
\r
1103 * @param source UTF16 string buffer
\r
1104 * @return number of codepoint in string
\r
1107 public static int countCodePoint(StringBuffer source) {
\r
1108 if (source == null || source.length() == 0) {
\r
1111 return findCodePointOffset(source, source.length());
\r
1115 * Number of codepoints in a UTF16 char array substring
\r
1117 * @param source UTF16 char array
\r
1118 * @param start Offset of the substring
\r
1119 * @param limit Offset of the substring
\r
1120 * @return number of codepoint in the substring
\r
1121 * @exception IndexOutOfBoundsException If start and limit are not valid.
\r
1124 public static int countCodePoint(char source[], int start, int limit) {
\r
1125 if (source == null || source.length == 0) {
\r
1128 return findCodePointOffset(source, start, limit, limit - start);
\r
1132 * Set a code point into a UTF16 position. Adjusts target accordingly if we are replacing a
\r
1133 * non-supplementary codepoint with a supplementary and vice versa.
\r
1135 * @param target Stringbuffer
\r
1136 * @param offset16 UTF16 position to insert into
\r
1137 * @param char32 Code point
\r
1140 public static void setCharAt(StringBuffer target, int offset16, int char32) {
\r
1142 char single = target.charAt(offset16);
\r
1144 if (isSurrogate(single)) {
\r
1145 // pairs of the surrogate with offset16 at the lead char found
\r
1146 if (isLeadSurrogate(single) && (target.length() > offset16 + 1)
\r
1147 && isTrailSurrogate(target.charAt(offset16 + 1))) {
\r
1150 // pairs of the surrogate with offset16 at the trail char
\r
1152 if (isTrailSurrogate(single) && (offset16 > 0)
\r
1153 && isLeadSurrogate(target.charAt(offset16 - 1))) {
\r
1159 target.replace(offset16, offset16 + count, valueOf(char32));
\r
1163 * Set a code point into a UTF16 position in a char array. Adjusts target accordingly if we are
\r
1164 * replacing a non-supplementary codepoint with a supplementary and vice versa.
\r
1166 * @param target char array
\r
1167 * @param limit numbers of valid chars in target, different from target.length. limit counts the
\r
1168 * number of chars in target that represents a string, not the size of array target.
\r
1169 * @param offset16 UTF16 position to insert into
\r
1170 * @param char32 code point
\r
1171 * @return new number of chars in target that represents a string
\r
1172 * @exception IndexOutOfBoundsException if offset16 is out of range
\r
1175 public static int setCharAt(char target[], int limit, int offset16, int char32) {
\r
1176 if (offset16 >= limit) {
\r
1177 throw new ArrayIndexOutOfBoundsException(offset16);
\r
1180 char single = target[offset16];
\r
1182 if (isSurrogate(single)) {
\r
1183 // pairs of the surrogate with offset16 at the lead char found
\r
1184 if (isLeadSurrogate(single) && (target.length > offset16 + 1)
\r
1185 && isTrailSurrogate(target[offset16 + 1])) {
\r
1188 // pairs of the surrogate with offset16 at the trail char
\r
1190 if (isTrailSurrogate(single) && (offset16 > 0)
\r
1191 && isLeadSurrogate(target[offset16 - 1])) {
\r
1198 String str = valueOf(char32);
\r
1199 int result = limit;
\r
1200 int strlength = str.length();
\r
1201 target[offset16] = str.charAt(0);
\r
1202 if (count == strlength) {
\r
1204 target[offset16 + 1] = str.charAt(1);
\r
1207 // this is not exact match in space, we'll have to do some
\r
1209 System.arraycopy(target, offset16 + count, target, offset16 + strlength, limit
\r
1210 - (offset16 + count));
\r
1211 if (count < strlength) {
\r
1212 // char32 is a supplementary character trying to squeeze into
\r
1213 // a non-supplementary space
\r
1214 target[offset16 + 1] = str.charAt(1);
\r
1216 if (result < target.length) {
\r
1217 target[result] = 0;
\r
1220 // char32 is a non-supplementary character trying to fill
\r
1221 // into a supplementary space
\r
1223 target[result] = 0;
\r
1230 * Shifts offset16 by the argument number of codepoints
\r
1232 * @param source string
\r
1233 * @param offset16 UTF16 position to shift
\r
1234 * @param shift32 number of codepoints to shift
\r
1235 * @return new shifted offset16
\r
1236 * @exception IndexOutOfBoundsException if the new offset16 is out of bounds.
\r
1239 public static int moveCodePointOffset(String source, int offset16, int shift32) {
\r
1240 int result = offset16;
\r
1241 int size = source.length();
\r
1244 if (offset16 < 0 || offset16 > size) {
\r
1245 throw new StringIndexOutOfBoundsException(offset16);
\r
1247 if (shift32 > 0) {
\r
1248 if (shift32 + offset16 > size) {
\r
1249 throw new StringIndexOutOfBoundsException(offset16);
\r
1252 while (result < size && count > 0) {
\r
1253 ch = source.charAt(result);
\r
1254 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
1255 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1262 if (offset16 + shift32 < 0) {
\r
1263 throw new StringIndexOutOfBoundsException(offset16);
\r
1265 for (count = -shift32; count > 0; count--) {
\r
1270 ch = source.charAt(result);
\r
1271 if (isTrailSurrogate(ch) && result > 0
\r
1272 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1278 throw new StringIndexOutOfBoundsException(shift32);
\r
1284 * Shifts offset16 by the argument number of codepoints
\r
1286 * @param source String buffer
\r
1287 * @param offset16 UTF16 position to shift
\r
1288 * @param shift32 Number of codepoints to shift
\r
1289 * @return new shifted offset16
\r
1290 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds.
\r
1293 public static int moveCodePointOffset(StringBuffer source, int offset16, int shift32) {
\r
1294 int result = offset16;
\r
1295 int size = source.length();
\r
1298 if (offset16 < 0 || offset16 > size) {
\r
1299 throw new StringIndexOutOfBoundsException(offset16);
\r
1301 if (shift32 > 0) {
\r
1302 if (shift32 + offset16 > size) {
\r
1303 throw new StringIndexOutOfBoundsException(offset16);
\r
1306 while (result < size && count > 0) {
\r
1307 ch = source.charAt(result);
\r
1308 if (isLeadSurrogate(ch) && ((result + 1) < size)
\r
1309 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1316 if (offset16 + shift32 < 0) {
\r
1317 throw new StringIndexOutOfBoundsException(offset16);
\r
1319 for (count = -shift32; count > 0; count--) {
\r
1324 ch = source.charAt(result);
\r
1325 if (isTrailSurrogate(ch) && result > 0
\r
1326 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1332 throw new StringIndexOutOfBoundsException(shift32);
\r
1338 * Shifts offset16 by the argument number of codepoints within a subarray.
\r
1340 * @param source Char array
\r
1341 * @param start Position of the subarray to be performed on
\r
1342 * @param limit Position of the subarray to be performed on
\r
1343 * @param offset16 UTF16 position to shift relative to start
\r
1344 * @param shift32 Number of codepoints to shift
\r
1345 * @return new shifted offset16 relative to start
\r
1346 * @exception IndexOutOfBoundsException If the new offset16 is out of bounds with respect to the subarray or the
\r
1347 * subarray bounds are out of range.
\r
1350 public static int moveCodePointOffset(char source[], int start, int limit, int offset16,
\r
1352 int size = source.length;
\r
1355 int result = offset16 + start;
\r
1356 if (start < 0 || limit < start) {
\r
1357 throw new StringIndexOutOfBoundsException(start);
\r
1359 if (limit > size) {
\r
1360 throw new StringIndexOutOfBoundsException(limit);
\r
1362 if (offset16 < 0 || result > limit) {
\r
1363 throw new StringIndexOutOfBoundsException(offset16);
\r
1365 if (shift32 > 0) {
\r
1366 if (shift32 + result > size) {
\r
1367 throw new StringIndexOutOfBoundsException(result);
\r
1370 while (result < limit && count > 0) {
\r
1371 ch = source[result];
\r
1372 if (isLeadSurrogate(ch) && (result + 1 < limit)
\r
1373 && isTrailSurrogate(source[result + 1])) {
\r
1380 if (result + shift32 < start) {
\r
1381 throw new StringIndexOutOfBoundsException(result);
\r
1383 for (count = -shift32; count > 0; count--) {
\r
1385 if (result < start) {
\r
1388 ch = source[result];
\r
1389 if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
\r
1395 throw new StringIndexOutOfBoundsException(shift32);
\r
1402 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
\r
1403 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
\r
1404 * codepoint. The length of target increases by one if codepoint is non-supplementary, 2
\r
1407 * The overall effect is exactly as if the argument were converted to a string by the method
\r
1408 * valueOf(char) and the characters in that string were then inserted into target at the
\r
1409 * position indicated by offset16.
\r
1412 * The offset argument must be greater than or equal to 0, and less than or equal to the length
\r
1415 * @param target String buffer to insert to
\r
1416 * @param offset16 Offset which char32 will be inserted in
\r
1417 * @param char32 Codepoint to be inserted
\r
1418 * @return a reference to target
\r
1419 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
\r
1422 public static StringBuffer insert(StringBuffer target, int offset16, int char32) {
\r
1423 String str = valueOf(char32);
\r
1424 if (offset16 != target.length() && bounds(target, offset16) == TRAIL_SURROGATE_BOUNDARY) {
\r
1427 target.insert(offset16, str);
\r
1432 * Inserts char32 codepoint into target at the argument offset16. If the offset16 is in the
\r
1433 * middle of a supplementary codepoint, char32 will be inserted after the supplementary
\r
1434 * codepoint. Limit increases by one if codepoint is non-supplementary, 2 otherwise.
\r
1436 * The overall effect is exactly as if the argument were converted to a string by the method
\r
1437 * valueOf(char) and the characters in that string were then inserted into target at the
\r
1438 * position indicated by offset16.
\r
1441 * The offset argument must be greater than or equal to 0, and less than or equal to the limit.
\r
1443 * @param target Char array to insert to
\r
1444 * @param limit End index of the char array, limit <= target.length
\r
1445 * @param offset16 Offset which char32 will be inserted in
\r
1446 * @param char32 Codepoint to be inserted
\r
1447 * @return new limit size
\r
1448 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
\r
1451 public static int insert(char target[], int limit, int offset16, int char32) {
\r
1452 String str = valueOf(char32);
\r
1453 if (offset16 != limit && bounds(target, 0, limit, offset16) == TRAIL_SURROGATE_BOUNDARY) {
\r
1456 int size = str.length();
\r
1457 if (limit + size > target.length) {
\r
1458 throw new ArrayIndexOutOfBoundsException(offset16 + size);
\r
1460 System.arraycopy(target, offset16, target, offset16 + size, limit - offset16);
\r
1461 target[offset16] = str.charAt(0);
\r
1463 target[offset16 + 1] = str.charAt(1);
\r
1465 return limit + size;
\r
1469 * Removes the codepoint at the specified position in this target (shortening target by 1
\r
1470 * character if the codepoint is a non-supplementary, 2 otherwise).
\r
1472 * @param target String buffer to remove codepoint from
\r
1473 * @param offset16 Offset which the codepoint will be removed
\r
1474 * @return a reference to target
\r
1475 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
\r
1478 public static StringBuffer delete(StringBuffer target, int offset16) {
\r
1480 switch (bounds(target, offset16)) {
\r
1481 case LEAD_SURROGATE_BOUNDARY:
\r
1484 case TRAIL_SURROGATE_BOUNDARY:
\r
1489 target.delete(offset16, offset16 + count);
\r
1494 * Removes the codepoint at the specified position in this target (shortening target by 1
\r
1495 * character if the codepoint is a non-supplementary, 2 otherwise).
\r
1497 * @param target Char array to remove codepoint from
\r
1498 * @param limit End index of the char array, limit <= target.length
\r
1499 * @param offset16 Offset which the codepoint will be removed
\r
1500 * @return a new limit size
\r
1501 * @exception IndexOutOfBoundsException Thrown if offset16 is invalid.
\r
1504 public static int delete(char target[], int limit, int offset16) {
\r
1506 switch (bounds(target, 0, limit, offset16)) {
\r
1507 case LEAD_SURROGATE_BOUNDARY:
\r
1510 case TRAIL_SURROGATE_BOUNDARY:
\r
1515 System.arraycopy(target, offset16 + count, target, offset16, limit - (offset16 + count));
\r
1516 target[limit - count] = 0;
\r
1517 return limit - count;
\r
1521 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1522 * the argument codepoint. I.e., the smallest index <code>i</code> such that
\r
1523 * <code>UTF16.charAt(source, i) ==
\r
1524 * char32</code> is true.
\r
1526 * If no such character occurs in this string, then -1 is returned.
\r
1530 * UTF16.indexOf("abc", 'a') returns 0<br>
\r
1531 * UTF16.indexOf("abc\ud800\udc00", 0x10000) returns 3<br>
\r
1532 * UTF16.indexOf("abc\ud800\udc00", 0xd800) returns -1<br>
\r
1534 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1535 * characters to its fullest.
\r
1537 * @param source UTF16 format Unicode string that will be searched
\r
1538 * @param char32 Codepoint to search for
\r
1539 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
\r
1540 * -1 if the codepoint does not occur.
\r
1543 public static int indexOf(String source, int char32) {
\r
1544 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1545 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1547 // non-surrogate bmp
\r
1548 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1549 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1550 return source.indexOf((char) char32);
\r
1553 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1554 int result = source.indexOf((char) char32);
\r
1555 if (result >= 0) {
\r
1556 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1557 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1558 return indexOf(source, char32, result + 1);
\r
1560 // trail surrogate
\r
1561 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1562 return indexOf(source, char32, result + 1);
\r
1568 String char32str = toString(char32);
\r
1569 return source.indexOf(char32str);
\r
1573 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1574 * the argument string str. This method is implemented based on codepoints, hence a "lead
\r
1575 * surrogate character" is treated as one entity. Hence if the str
\r
1576 * starts with trail surrogate character at index 0, a source with a lead surrogate
\r
1577 * character before str found in source will not have a valid match. Vice versa for lead
\r
1578 * surrogates that ends str. See example below.
\r
1580 * If no such string str occurs in this source, then -1 is returned.
\r
1584 * UTF16.indexOf("abc", "ab") returns 0<br>
\r
1585 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
\r
1586 * UTF16.indexOf("abc\ud800\udc00", "\ud800") returns -1<br>
\r
1588 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1589 * characters to its fullest.
\r
1591 * @param source UTF16 format Unicode string that will be searched
\r
1592 * @param str UTF16 format Unicode string to search for
\r
1593 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
\r
1594 * -1 if the codepoint does not occur.
\r
1597 public static int indexOf(String source, String str) {
\r
1598 int strLength = str.length();
\r
1599 // non-surrogate ends
\r
1600 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1601 return source.indexOf(str);
\r
1604 int result = source.indexOf(str);
\r
1605 int resultEnd = result + strLength;
\r
1606 if (result >= 0) {
\r
1607 // check last character
\r
1608 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1609 && isTrailSurrogate(source.charAt(resultEnd + 1))) {
\r
1610 return indexOf(source, str, resultEnd + 1);
\r
1612 // check first character which is a trail surrogate
\r
1613 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1614 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1615 return indexOf(source, str, resultEnd + 1);
\r
1622 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1623 * the argument codepoint. I.e., the smallest index i such that: <br>
\r
1624 * (UTF16.charAt(source, i) == char32 && i >= fromIndex) is true.
\r
1626 * If no such character occurs in this string, then -1 is returned.
\r
1630 * UTF16.indexOf("abc", 'a', 1) returns -1<br>
\r
1631 * UTF16.indexOf("abc\ud800\udc00", 0x10000, 1) returns 3<br>
\r
1632 * UTF16.indexOf("abc\ud800\udc00", 0xd800, 1) returns -1<br>
\r
1634 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1635 * characters to its fullest.
\r
1637 * @param source UTF16 format Unicode string that will be searched
\r
1638 * @param char32 Codepoint to search for
\r
1639 * @param fromIndex The index to start the search from.
\r
1640 * @return the index of the first occurrence of the codepoint in the argument Unicode string at
\r
1641 * or after fromIndex, or -1 if the codepoint does not occur.
\r
1644 public static int indexOf(String source, int char32, int fromIndex) {
\r
1645 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1646 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1648 // non-surrogate bmp
\r
1649 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1650 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1651 return source.indexOf((char) char32, fromIndex);
\r
1654 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1655 int result = source.indexOf((char) char32, fromIndex);
\r
1656 if (result >= 0) {
\r
1657 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1658 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1659 return indexOf(source, char32, result + 1);
\r
1661 // trail surrogate
\r
1662 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1663 return indexOf(source, char32, result + 1);
\r
1669 String char32str = toString(char32);
\r
1670 return source.indexOf(char32str, fromIndex);
\r
1674 * Returns the index within the argument UTF16 format Unicode string of the first occurrence of
\r
1675 * the argument string str. This method is implemented based on codepoints, hence a "lead
\r
1676 * surrogate character" is treated as one entity. Hence if the str
\r
1677 * starts with trail surrogate character at index 0, a source with a lead surrogate
\r
1678 * character before str found in source will not have a valid match. Vice versa for lead
\r
1679 * surrogates that ends str. See example below.
\r
1681 * If no such string str occurs in this source, then -1 is returned.
\r
1685 * UTF16.indexOf("abc", "ab", 0) returns 0<br>
\r
1686 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 0) returns 3<br>
\r
1687 * UTF16.indexOf("abc\ud800\udc00", "\ud800\udc00", 2) returns 3<br>
\r
1688 * UTF16.indexOf("abc\ud800\udc00", "\ud800", 0) returns -1<br>
\r
1690 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1691 * characters to its fullest.
\r
1693 * @param source UTF16 format Unicode string that will be searched
\r
1694 * @param str UTF16 format Unicode string to search for
\r
1695 * @param fromIndex The index to start the search from.
\r
1696 * @return the index of the first occurrence of the codepoint in the argument Unicode string, or
\r
1697 * -1 if the codepoint does not occur.
\r
1700 public static int indexOf(String source, String str, int fromIndex) {
\r
1701 int strLength = str.length();
\r
1702 // non-surrogate ends
\r
1703 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1704 return source.indexOf(str, fromIndex);
\r
1707 int result = source.indexOf(str, fromIndex);
\r
1708 int resultEnd = result + strLength;
\r
1709 if (result >= 0) {
\r
1710 // check last character
\r
1711 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1712 && isTrailSurrogate(source.charAt(resultEnd))) {
\r
1713 return indexOf(source, str, resultEnd + 1);
\r
1715 // check first character which is a trail surrogate
\r
1716 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1717 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1718 return indexOf(source, str, resultEnd + 1);
\r
1725 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1726 * the argument codepoint. I.e., the index returned is the largest value i such that:
\r
1727 * UTF16.charAt(source, i) == char32 is true.
\r
1730 * UTF16.lastIndexOf("abc", 'a') returns 0<br>
\r
1731 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000) returns 3<br>
\r
1732 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
\r
1735 * source is searched backwards starting at the last character.
\r
1737 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1738 * characters to its fullest.
\r
1740 * @param source UTF16 format Unicode string that will be searched
\r
1741 * @param char32 Codepoint to search for
\r
1742 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
1746 public static int lastIndexOf(String source, int char32) {
\r
1747 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1748 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1750 // non-surrogate bmp
\r
1751 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1752 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1753 return source.lastIndexOf((char) char32);
\r
1756 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1757 int result = source.lastIndexOf((char) char32);
\r
1758 if (result >= 0) {
\r
1759 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1760 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1761 return lastIndexOf(source, char32, result - 1);
\r
1763 // trail surrogate
\r
1764 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1765 return lastIndexOf(source, char32, result - 1);
\r
1771 String char32str = toString(char32);
\r
1772 return source.lastIndexOf(char32str);
\r
1776 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1777 * the argument string str. This method is implemented based on codepoints, hence a "lead
\r
1778 * surrogate character" is treated as one entity. Hence if the str
\r
1779 * starts with trail surrogate character at index 0, a source with a lead surrogate
\r
1780 * character before str found in source will not have a valid match. Vice versa for lead
\r
1781 * surrogates that ends str. See example below.
\r
1784 * UTF16.lastIndexOf("abc", "a") returns 0<br>
\r
1785 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00") returns 3<br>
\r
1786 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800") returns -1<br>
\r
1789 * source is searched backwards starting at the last character.
\r
1791 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1792 * characters to its fullest.
\r
1794 * @param source UTF16 format Unicode string that will be searched
\r
1795 * @param str UTF16 format Unicode string to search for
\r
1796 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
1800 public static int lastIndexOf(String source, String str) {
\r
1801 int strLength = str.length();
\r
1802 // non-surrogate ends
\r
1803 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1804 return source.lastIndexOf(str);
\r
1807 int result = source.lastIndexOf(str);
\r
1808 if (result >= 0) {
\r
1809 // check last character
\r
1810 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1811 && isTrailSurrogate(source.charAt(result + strLength + 1))) {
\r
1812 return lastIndexOf(source, str, result - 1);
\r
1814 // check first character which is a trail surrogate
\r
1815 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1816 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1817 return lastIndexOf(source, str, result - 1);
\r
1825 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1826 * the argument codepoint, where the result is less than or equals to fromIndex.
\r
1829 * This method is implemented based on codepoints, hence a single surrogate character will not
\r
1830 * match a supplementary character.
\r
1833 * source is searched backwards starting at the last character starting at the specified index.
\r
1837 * UTF16.lastIndexOf("abc", 'c', 2) returns 2<br>
\r
1838 * UTF16.lastIndexOf("abc", 'c', 1) returns -1<br>
\r
1839 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 5) returns 3<br>
\r
1840 * UTF16.lastIndexOf("abc\ud800\udc00", 0x10000, 3) returns 3<br>
\r
1841 * UTF16.lastIndexOf("abc\ud800\udc00", 0xd800) returns -1<br>
\r
1843 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1844 * characters to its fullest.
\r
1846 * @param source UTF16 format Unicode string that will be searched
\r
1847 * @param char32 Codepoint to search for
\r
1848 * @param fromIndex the index to start the search from. There is no restriction on the value of
\r
1849 * fromIndex. If it is greater than or equal to the length of this string, it has the
\r
1850 * same effect as if it were equal to one less than the length of this string: this
\r
1851 * entire string may be searched. If it is negative, it has the same effect as if it
\r
1852 * were -1: -1 is returned.
\r
1853 * @return the index of the last occurrence of the codepoint in source, or -1 if the codepoint
\r
1857 public static int lastIndexOf(String source, int char32, int fromIndex) {
\r
1858 if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
\r
1859 throw new IllegalArgumentException("Argument char32 is not a valid codepoint");
\r
1861 // non-surrogate bmp
\r
1862 if (char32 < LEAD_SURROGATE_MIN_VALUE
\r
1863 || (char32 > TRAIL_SURROGATE_MAX_VALUE && char32 < SUPPLEMENTARY_MIN_VALUE)) {
\r
1864 return source.lastIndexOf((char) char32, fromIndex);
\r
1867 if (char32 < SUPPLEMENTARY_MIN_VALUE) {
\r
1868 int result = source.lastIndexOf((char) char32, fromIndex);
\r
1869 if (result >= 0) {
\r
1870 if (isLeadSurrogate((char) char32) && (result < source.length() - 1)
\r
1871 && isTrailSurrogate(source.charAt(result + 1))) {
\r
1872 return lastIndexOf(source, char32, result - 1);
\r
1874 // trail surrogate
\r
1875 if (result > 0 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1876 return lastIndexOf(source, char32, result - 1);
\r
1882 String char32str = toString(char32);
\r
1883 return source.lastIndexOf(char32str, fromIndex);
\r
1888 * Returns the index within the argument UTF16 format Unicode string of the last occurrence of
\r
1889 * the argument string str, where the result is less than or equals to fromIndex.
\r
 * This method is implemented based on codepoints, hence a "lead surrogate character + trail
 * surrogate character" is treated as one entity. Hence if str starts with a trail surrogate
 * character at index 0, an occurrence of str in source that is immediately preceded by a lead
 * surrogate character is not a valid match. Vice versa for a lead surrogate that ends str.
\r
1897 * See example below.
\r
1900 * UTF16.lastIndexOf("abc", "c", 2) returns 2<br>
\r
1901 * UTF16.lastIndexOf("abc", "c", 1) returns -1<br>
\r
1902 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 5) returns 3<br>
\r
1903 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800\udc00", 3) returns 3<br>
\r
1904 * UTF16.lastIndexOf("abc\ud800\udc00", "\ud800", 4) returns -1<br>
\r
1907 * source is searched backwards starting at the last character.
\r
1909 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1910 * characters to its fullest.
\r
1912 * @param source UTF16 format Unicode string that will be searched
\r
1913 * @param str UTF16 format Unicode string to search for
\r
1914 * @param fromIndex the index to start the search from. There is no restriction on the value of
\r
1915 * fromIndex. If it is greater than or equal to the length of this string, it has the
\r
1916 * same effect as if it were equal to one less than the length of this string: this
\r
1917 * entire string may be searched. If it is negative, it has the same effect as if it
\r
1918 * were -1: -1 is returned.
\r
 * @return the index of the last occurrence of str in source, or -1 if str does not occur.
\r
1923 public static int lastIndexOf(String source, String str, int fromIndex) {
\r
1924 int strLength = str.length();
\r
1925 // non-surrogate ends
\r
1926 if (!isTrailSurrogate(str.charAt(0)) && !isLeadSurrogate(str.charAt(strLength - 1))) {
\r
1927 return source.lastIndexOf(str, fromIndex);
\r
1930 int result = source.lastIndexOf(str, fromIndex);
\r
1931 if (result >= 0) {
\r
1932 // check last character
\r
1933 if (isLeadSurrogate(str.charAt(strLength - 1)) && (result < source.length() - 1)
\r
1934 && isTrailSurrogate(source.charAt(result + strLength))) {
\r
1935 return lastIndexOf(source, str, result - 1);
\r
1937 // check first character which is a trail surrogate
\r
1938 if (isTrailSurrogate(str.charAt(0)) && result > 0
\r
1939 && isLeadSurrogate(source.charAt(result - 1))) {
\r
1940 return lastIndexOf(source, str, result - 1);
\r
1947 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of
\r
1948 * oldChar32 in source with newChar32. If the character oldChar32 does not occur in the UTF16
\r
1949 * format Unicode string source, then source will be returned. Otherwise, a new String object is
\r
1950 * created that represents a codepoint sequence identical to the codepoint sequence represented
\r
1951 * by source, except that every occurrence of oldChar32 is replaced by an occurrence of
\r
1955 * UTF16.replace("mesquite in your cellar", 'e', 'o');<br>
\r
1956 * returns "mosquito in your collar"<br>
\r
1957 * UTF16.replace("JonL", 'q', 'x');<br>
\r
1958 * returns "JonL" (no change)<br>
\r
1959 * UTF16.replace("Supplementary character \ud800\udc00", 0x10000, '!'); <br>
\r
1960 * returns "Supplementary character !"<br>
\r
1961 * UTF16.replace("Supplementary character \ud800\udc00", 0xd800, '!'); <br>
\r
1962 * returns "Supplementary character \ud800\udc00"<br>
\r
1964 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
1965 * characters to its fullest.
\r
1967 * @param source UTF16 format Unicode string which the codepoint replacements will be based on.
\r
1968 * @param oldChar32 Non-zero old codepoint to be replaced.
\r
1969 * @param newChar32 The new codepoint to replace oldChar32
\r
1970 * @return new String derived from source by replacing every occurrence of oldChar32 with
\r
1971 * newChar32, unless when no oldChar32 is found in source then source will be returned.
\r
1974 public static String replace(String source, int oldChar32, int newChar32) {
\r
1975 if (oldChar32 <= 0 || oldChar32 > CODEPOINT_MAX_VALUE) {
\r
1976 throw new IllegalArgumentException("Argument oldChar32 is not a valid codepoint");
\r
1978 if (newChar32 <= 0 || newChar32 > CODEPOINT_MAX_VALUE) {
\r
1979 throw new IllegalArgumentException("Argument newChar32 is not a valid codepoint");
\r
1982 int index = indexOf(source, oldChar32);
\r
1983 if (index == -1) {
\r
1986 String newChar32Str = toString(newChar32);
\r
1987 int oldChar32Size = 1;
\r
1988 int newChar32Size = newChar32Str.length();
\r
1989 StringBuffer result = new StringBuffer(source);
\r
1990 int resultIndex = index;
\r
1992 if (oldChar32 >= SUPPLEMENTARY_MIN_VALUE) {
\r
1993 oldChar32Size = 2;
\r
1996 while (index != -1) {
\r
1997 int endResultIndex = resultIndex + oldChar32Size;
\r
1998 result.replace(resultIndex, endResultIndex, newChar32Str);
\r
1999 int lastEndIndex = index + oldChar32Size;
\r
2000 index = indexOf(source, oldChar32, lastEndIndex);
\r
2001 resultIndex += newChar32Size + index - lastEndIndex;
\r
2003 return result.toString();
\r
2007 * Returns a new UTF16 format Unicode string resulting from replacing all occurrences of oldStr
\r
2008 * in source with newStr. If the string oldStr does not occur in the UTF16 format Unicode string
\r
2009 * source, then source will be returned. Otherwise, a new String object is created that
\r
2010 * represents a codepoint sequence identical to the codepoint sequence represented by source,
\r
2011 * except that every occurrence of oldStr is replaced by an occurrence of newStr.
\r
2014 * UTF16.replace("mesquite in your cellar", "e", "o");<br>
\r
2015 * returns "mosquito in your collar"<br>
\r
2016 * UTF16.replace("mesquite in your cellar", "mesquite", "cat");<br>
\r
2017 * returns "cat in your cellar"<br>
\r
2018 * UTF16.replace("JonL", "q", "x");<br>
\r
2019 * returns "JonL" (no change)<br>
\r
 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800\udc00", "!"); <br>
\r
2021 * returns "Supplementary character !"<br>
\r
 * UTF16.replace("Supplementary character \ud800\udc00", "\ud800", "!"); <br>
\r
2023 * returns "Supplementary character \ud800\udc00"<br>
\r
2025 * Note this method is provided as support to jdk 1.3, which does not support supplementary
\r
2026 * characters to its fullest.
\r
2028 * @param source UTF16 format Unicode string which the replacements will be based on.
\r
2029 * @param oldStr Non-zero-length string to be replaced.
\r
2030 * @param newStr The new string to replace oldStr
\r
2031 * @return new String derived from source by replacing every occurrence of oldStr with newStr.
\r
2032 * When no oldStr is found in source, then source will be returned.
\r
2035 public static String replace(String source, String oldStr, String newStr) {
\r
2036 int index = indexOf(source, oldStr);
\r
2037 if (index == -1) {
\r
2040 int oldStrSize = oldStr.length();
\r
2041 int newStrSize = newStr.length();
\r
2042 StringBuffer result = new StringBuffer(source);
\r
2043 int resultIndex = index;
\r
2045 while (index != -1) {
\r
2046 int endResultIndex = resultIndex + oldStrSize;
\r
2047 result.replace(resultIndex, endResultIndex, newStr);
\r
2048 int lastEndIndex = index + oldStrSize;
\r
2049 index = indexOf(source, oldStr, lastEndIndex);
\r
2050 resultIndex += newStrSize + index - lastEndIndex;
\r
2052 return result.toString();
\r
 * Returns a new StringBuffer holding the reversed UTF16 format Unicode string; source itself is
 * not modified. This method will reverse surrogate characters correctly, instead of blindly
 * reversing every character.
\r
2060 * UTF16.reverse(new StringBuffer( "Supplementary characters \ud800\udc00\ud801\udc01"))<br>
\r
2061 * returns "\ud801\udc01\ud800\udc00 sretcarahc yratnemelppuS".
\r
2063 * @param source The source StringBuffer that contains UTF16 format Unicode string to be reversed
\r
 * @return a new StringBuffer containing the reversed UTF16 format Unicode string.
\r
2067 public static StringBuffer reverse(StringBuffer source) {
\r
2068 int length = source.length();
\r
2069 StringBuffer result = new StringBuffer(length);
\r
2070 for (int i = length; i-- > 0;) {
\r
2071 char ch = source.charAt(i);
\r
2072 if (isTrailSurrogate(ch) && i > 0) {
\r
2073 char ch2 = source.charAt(i - 1);
\r
2074 if (isLeadSurrogate(ch2)) {
\r
2075 result.append(ch2);
\r
2076 result.append(ch);
\r
2081 result.append(ch);
\r
2087 * Check if the string contains more Unicode code points than a certain number. This is more
\r
2088 * efficient than counting all code points in the entire string and comparing that number with a
\r
2089 * threshold. This function may not need to scan the string at all if the length is within a
\r
2090 * certain range, and never needs to count more than 'number + 1' code points. Logically
\r
2091 * equivalent to (countCodePoint(s) > number). A Unicode code point may occupy either one or two
\r
2094 * @param source The input string.
\r
2095 * @param number The number of code points in the string is compared against the 'number'
\r
2097 * @return boolean value for whether the string contains more Unicode code points than 'number'.
\r
2100 public static boolean hasMoreCodePointsThan(String source, int number) {
\r
2104 if (source == null) {
\r
2107 int length = source.length();
\r
2109 // length >= 0 known
\r
2110 // source contains at least (length + 1) / 2 code points: <= 2
\r
2112 if (((length + 1) >> 1) > number) {
\r
2116 // check if source does not even contain enough chars
\r
2117 int maxsupplementary = length - number;
\r
2118 if (maxsupplementary <= 0) {
\r
2122 // there are maxsupplementary = length - number more chars than
\r
2123 // asked-for code points
\r
2125 // count code points until they exceed and also check that there are
\r
2126 // no more than maxsupplementary supplementary code points (char pairs)
\r
2129 if (length == 0) {
\r
2132 if (number == 0) {
\r
2135 if (isLeadSurrogate(source.charAt(start++)) && start != length
\r
2136 && isTrailSurrogate(source.charAt(start))) {
\r
2138 if (--maxsupplementary <= 0) {
\r
2139 // too many pairs - too few code points
\r
2148 * Check if the sub-range of char array, from argument start to limit, contains more Unicode
\r
2149 * code points than a certain number. This is more efficient than counting all code points in
\r
2150 * the entire char array range and comparing that number with a threshold. This function may not
\r
2151 * need to scan the char array at all if start and limit is within a certain range, and never
\r
2152 * needs to count more than 'number + 1' code points. Logically equivalent to
\r
2153 * (countCodePoint(source, start, limit) > number). A Unicode code point may occupy either one
\r
2154 * or two code units.
\r
2156 * @param source Array of UTF-16 chars
\r
2157 * @param start Offset to substring in the source array for analyzing
\r
2158 * @param limit Offset to substring in the source array for analyzing
\r
2159 * @param number The number of code points in the string is compared against the 'number'
\r
2161 * @return boolean value for whether the string contains more Unicode code points than 'number'.
\r
 * @exception IndexOutOfBoundsException Thrown when start or limit is negative, or when limit < start
\r
2165 public static boolean hasMoreCodePointsThan(char source[], int start, int limit, int number) {
\r
2166 int length = limit - start;
\r
2167 if (length < 0 || start < 0 || limit < 0) {
\r
2168 throw new IndexOutOfBoundsException(
\r
2169 "Start and limit indexes should be non-negative and start <= limit");
\r
2174 if (source == null) {
\r
2178 // length >= 0 known
\r
2179 // source contains at least (length + 1) / 2 code points: <= 2
\r
2181 if (((length + 1) >> 1) > number) {
\r
2185 // check if source does not even contain enough chars
\r
2186 int maxsupplementary = length - number;
\r
2187 if (maxsupplementary <= 0) {
\r
2191 // there are maxsupplementary = length - number more chars than
\r
2192 // asked-for code points
\r
2194 // count code points until they exceed and also check that there are
\r
2195 // no more than maxsupplementary supplementary code points (char pairs)
\r
2197 if (length == 0) {
\r
2200 if (number == 0) {
\r
2203 if (isLeadSurrogate(source[start++]) && start != limit
\r
2204 && isTrailSurrogate(source[start])) {
\r
2206 if (--maxsupplementary <= 0) {
\r
2207 // too many pairs - too few code points
\r
2216 * Check if the string buffer contains more Unicode code points than a certain number. This is
\r
2217 * more efficient than counting all code points in the entire string buffer and comparing that
\r
2218 * number with a threshold. This function may not need to scan the string buffer at all if the
\r
2219 * length is within a certain range, and never needs to count more than 'number + 1' code
\r
2220 * points. Logically equivalent to (countCodePoint(s) > number). A Unicode code point may occupy
\r
2221 * either one or two code units.
\r
2223 * @param source The input string buffer.
\r
2224 * @param number The number of code points in the string buffer is compared against the 'number'
\r
2226 * @return boolean value for whether the string buffer contains more Unicode code points than
\r
2230 public static boolean hasMoreCodePointsThan(StringBuffer source, int number) {
\r
2234 if (source == null) {
\r
2237 int length = source.length();
\r
2239 // length >= 0 known
\r
2240 // source contains at least (length + 1) / 2 code points: <= 2
\r
2242 if (((length + 1) >> 1) > number) {
\r
2246 // check if source does not even contain enough chars
\r
2247 int maxsupplementary = length - number;
\r
2248 if (maxsupplementary <= 0) {
\r
2252 // there are maxsupplementary = length - number more chars than
\r
2253 // asked-for code points
\r
2255 // count code points until they exceed and also check that there are
\r
2256 // no more than maxsupplementary supplementary code points (char pairs)
\r
2259 if (length == 0) {
\r
2262 if (number == 0) {
\r
2265 if (isLeadSurrogate(source.charAt(start++)) && start != length
\r
2266 && isTrailSurrogate(source.charAt(start))) {
\r
2268 if (--maxsupplementary <= 0) {
\r
2269 // too many pairs - too few code points
\r
2278 * Cover JDK 1.5 API. Create a String from an array of codePoints.
\r
2280 * @param codePoints The code array
\r
2281 * @param offset The start of the text in the code point array
\r
2282 * @param count The number of code points
\r
 * @return a String representing the count code points that start at offset
\r
2284 * @throws IllegalArgumentException If an invalid code point is encountered
\r
2285 * @throws IndexOutOfBoundsException If the offset or count are out of bounds.
\r
2288 public static String newString(int[] codePoints, int offset, int count) {
\r
2290 throw new IllegalArgumentException();
\r
2292 char[] chars = new char[count];
\r
2294 for (int r = offset, e = offset + count; r < e; ++r) {
\r
2295 int cp = codePoints[r];
\r
2296 if (cp < 0 || cp > 0x10ffff) {
\r
2297 throw new IllegalArgumentException();
\r
2301 if (cp < 0x010000) {
\r
2302 chars[w] = (char) cp;
\r
2305 chars[w] = (char) (LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
\r
2306 chars[w + 1] = (char) (TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
\r
2310 } catch (IndexOutOfBoundsException ex) {
\r
2311 int newlen = (int) (Math.ceil((double) codePoints.length * (w + 2)
\r
2312 / (r - offset + 1)));
\r
2313 char[] temp = new char[newlen];
\r
2314 System.arraycopy(chars, 0, temp, 0, w);
\r
2319 return new String(chars, 0, w);
\r
2324 * UTF16 string comparator class. Allows UTF16 string comparison to be done with the various
\r
2328 * <li> Code point comparison or code unit comparison
\r
2329 * <li> Case sensitive comparison, case insensitive comparison or case insensitive comparison
\r
2330 * with special handling for character 'i'.
\r
2333 * The code unit or code point comparison differ only when comparing supplementary code points
\r
2334 * (\u10000..\u10ffff) to BMP code points near the end of the BMP (i.e.,
\r
2335 * \ue000..\uffff). In code unit comparison, high BMP code points sort after
\r
2336 * supplementary code points because they are stored as pairs of surrogates which are at
\r
2337 * \ud800..\udfff.
\r
2340 * @see #FOLD_CASE_DEFAULT
\r
2341 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2344 public static final class StringComparator implements java.util.Comparator<String> {
\r
2345 // public constructor ------------------------------------------------
\r
2348 * Default constructor that does code unit comparison and case sensitive comparison.
\r
2352 public StringComparator() {
\r
2353 this(false, false, FOLD_CASE_DEFAULT);
\r
2357 * Constructor that does comparison based on the argument options.
\r
2359 * @param codepointcompare Flag to indicate true for code point comparison or false for code unit
\r
2361 * @param ignorecase False for case sensitive comparison, true for case-insensitive comparison
\r
2362 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
\r
2363 * when ignorecase is set to true. If ignorecase is false, this option is
\r
2365 * @see #FOLD_CASE_DEFAULT
\r
2366 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2367 * @throws IllegalArgumentException If foldcaseoption is out of range
\r
2370 public StringComparator(boolean codepointcompare, boolean ignorecase, int foldcaseoption) {
\r
2371 setCodePointCompare(codepointcompare);
\r
2372 m_ignoreCase_ = ignorecase;
\r
2373 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
\r
2374 throw new IllegalArgumentException("Invalid fold case option");
\r
2376 m_foldCase_ = foldcaseoption;
\r
2379 // public data member ------------------------------------------------
\r
2383 * Option value for case folding comparison:
\r
2386 * Comparison is case insensitive, strings are folded using default mappings defined in
\r
2387 * Unicode data file CaseFolding.txt, before comparison.
\r
2392 public static final int FOLD_CASE_DEFAULT = 0;
\r
2396 * Option value for case folding comparison:
\r
2399 * Comparison is case insensitive, strings are folded using modified mappings defined in
\r
2400 * Unicode data file CaseFolding.txt, before comparison.
\r
2403 * The modified set of mappings is provided in a Unicode data file CaseFolding.txt to handle
\r
2404 * dotted I and dotless i appropriately for Turkic languages (tr, az).
\r
2407 * Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that are to be
\r
2408 * included for default mappings and excluded for the Turkic-specific mappings.
\r
2411 * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that are to be
\r
2412 * excluded for default mappings and included for the Turkic-specific mappings.
\r
2417 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 1;
\r
2419 // public methods ----------------------------------------------------
\r
2421 // public setters ----------------------------------------------------
\r
2424 * Sets the comparison mode to code point compare if flag is true. Otherwise comparison mode
\r
2425 * is set to code unit compare
\r
2427 * @param flag True for code point compare, false for code unit compare
\r
2430 public void setCodePointCompare(boolean flag) {
\r
2432 m_codePointCompare_ = Normalizer.COMPARE_CODE_POINT_ORDER;
\r
2434 m_codePointCompare_ = 0;
\r
2439 * Sets the Comparator to case-insensitive comparison mode if argument is true, otherwise
\r
2440 * case sensitive comparison mode if set to false.
\r
2442 * @param ignorecase True for case-insitive comparison, false for case sensitive comparison
\r
2443 * @param foldcaseoption FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I. This option is used only
\r
2444 * when ignorecase is set to true. If ignorecase is false, this option is
\r
2446 * @see #FOLD_CASE_DEFAULT
\r
2447 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2450 public void setIgnoreCase(boolean ignorecase, int foldcaseoption) {
\r
2451 m_ignoreCase_ = ignorecase;
\r
2452 if (foldcaseoption < FOLD_CASE_DEFAULT || foldcaseoption > FOLD_CASE_EXCLUDE_SPECIAL_I) {
\r
2453 throw new IllegalArgumentException("Invalid fold case option");
\r
2455 m_foldCase_ = foldcaseoption;
\r
2458 // public getters ----------------------------------------------------
\r
2461 * Checks if the comparison mode is code point compare.
\r
2463 * @return true for code point compare, false for code unit compare
\r
2466 public boolean getCodePointCompare() {
\r
2467 return m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
\r
2471 * Checks if Comparator is in the case insensitive mode.
\r
2473 * @return true if Comparator performs case insensitive comparison, false otherwise
\r
2476 public boolean getIgnoreCase() {
\r
2477 return m_ignoreCase_;
\r
2481 * Gets the fold case options set in Comparator to be used with case insensitive comparison.
\r
2483 * @return either FOLD_CASE_DEFAULT or FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2484 * @see #FOLD_CASE_DEFAULT
\r
2485 * @see #FOLD_CASE_EXCLUDE_SPECIAL_I
\r
2488 public int getIgnoreCaseOption() {
\r
2489 return m_foldCase_;
\r
2492 // public other methods ----------------------------------------------
\r
2495 * Compare two strings depending on the options selected during construction.
\r
2497 * @param a first source string.
\r
2498 * @param b second source string.
\r
2499 * @return 0 returned if a == b. If a < b, a negative value is returned. Otherwise if a > b,
\r
2500 * a positive value is returned.
\r
2501 * @exception ClassCastException thrown when either a or b is not a String object
\r
2504 public int compare(String a, String b) {
\r
2515 if (m_ignoreCase_) {
\r
2516 return compareCaseInsensitive(a, b);
\r
2518 return compareCaseSensitive(a, b);
\r
2521 // private data member ----------------------------------------------
\r
2524 * Code unit comparison flag. True if code unit comparison is required. False if code point
\r
2525 * comparison is required.
\r
2527 private int m_codePointCompare_;
\r
2530 * Fold case comparison option.
\r
2532 private int m_foldCase_;
\r
2535 * Flag indicator if ignore case is to be used during comparison
\r
2537 private boolean m_ignoreCase_;
\r
2540 * Code point order offset for surrogate characters
\r
2542 private static final int CODE_POINT_COMPARE_SURROGATE_OFFSET_ = 0x2800;
\r
2544 // private method ---------------------------------------------------
\r
2547 * Compares case insensitive. This is a direct port of ICU4C, to make maintainence life
\r
2551 * first string to compare
\r
2553 * second string to compare
\r
2554 * @return -1 is s1 < s2, 0 if equals,
\r
2556 private int compareCaseInsensitive(String s1, String s2) {
\r
2557 return Normalizer.cmpEquivFold(s1, s2, m_foldCase_ | m_codePointCompare_
\r
2558 | Normalizer.COMPARE_IGNORE_CASE);
\r
2562 * Compares case sensitive. This is a direct port of ICU4C, to make maintainence life
\r
2566 * first string to compare
\r
2568 * second string to compare
\r
2569 * @return -1 is s1 < s2, 0 if equals,
\r
2571 private int compareCaseSensitive(String s1, String s2) {
\r
2572 // compare identical prefixes - they do not need to be fixed up
\r
2573 // limit1 = start1 + min(lenght1, length2)
\r
2574 int length1 = s1.length();
\r
2575 int length2 = s2.length();
\r
2576 int minlength = length1;
\r
2578 if (length1 < length2) {
\r
2580 } else if (length1 > length2) {
\r
2582 minlength = length2;
\r
2588 for (; index < minlength; index++) {
\r
2589 c1 = s1.charAt(index);
\r
2590 c2 = s2.charAt(index);
\r
2591 // check pseudo-limit
\r
2597 if (index == minlength) {
\r
2601 boolean codepointcompare = m_codePointCompare_ == Normalizer.COMPARE_CODE_POINT_ORDER;
\r
2602 // if both values are in or above the surrogate range, fix them up
\r
2603 if (c1 >= LEAD_SURROGATE_MIN_VALUE && c2 >= LEAD_SURROGATE_MIN_VALUE
\r
2604 && codepointcompare) {
\r
2605 // subtract 0x2800 from BMP code points to make them smaller
\r
2606 // than supplementary ones
\r
2607 if ((c1 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length1 && isTrailSurrogate(s1.charAt(index + 1)))
\r
2608 || (isTrailSurrogate(c1) && index != 0 && isLeadSurrogate(s1.charAt(index - 1)))) {
\r
2609 // part of a surrogate pair, leave >=d800
\r
2611 // BMP code point - may be surrogate code point - make
\r
2613 c1 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
\r
2616 if ((c2 <= LEAD_SURROGATE_MAX_VALUE && (index + 1) != length2 && isTrailSurrogate(s2.charAt(index + 1)))
\r
2617 || (isTrailSurrogate(c2) && index != 0 && isLeadSurrogate(s2.charAt(index - 1)))) {
\r
2618 // part of a surrogate pair, leave >=d800
\r
2620 // BMP code point - may be surrogate code point - make <d800
\r
2621 c2 -= CODE_POINT_COMPARE_SURROGATE_OFFSET_;
\r
2625 // now c1 and c2 are in UTF-32-compatible order
\r
2630 // private data members -------------------------------------------------
\r
/**
 * Shift value for lead surrogate to form a supplementary character.
 */
private static final int LEAD_SURROGATE_SHIFT_ = 10;

/**
 * Mask to retrieve the significant value from a trail surrogate.
 */
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

/**
 * Value that all lead surrogate starts with. Adding a supplementary code point shifted right
 * by LEAD_SURROGATE_SHIFT_ to this value yields its lead surrogate.
 */
private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
        - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
\r
2648 // private methods ------------------------------------------------------
\r
2652 * Converts argument code point and returns a String object representing the code point's value
\r
2653 * in UTF16 format.
\r
2656 * This method does not check for the validity of the codepoint, the results are not guaranteed
\r
2657 * if a invalid codepoint is passed as argument.
\r
2660 * The result is a string whose length is 1 for non-supplementary code points, 2 otherwise.
\r
2665 * @return string representation of the code point
\r
2667 private static String toString(int ch) {
\r
2668 if (ch < SUPPLEMENTARY_MIN_VALUE) {
\r
2669 return String.valueOf((char) ch);
\r
2672 StringBuilder result = new StringBuilder();
\r
2673 result.append(getLeadSurrogate(ch));
\r
2674 result.append(getTrailSurrogate(ch));
\r
2675 return result.toString();
\r