/*
 *******************************************************************************
 * Copyright (C) 2002-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
\r
package com.ibm.icu.dev.test;
\r
/**
 * Utility class for supplementary code point support. This one is written
 * purely for updating the Normalization sample from the unicode.org site.
 * If you want the real thing, use ICU's UTF16 class.
 *
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {
    /**
     * Value subtracted from <code>(lead &lt;&lt; 10) + trail</code> to turn a
     * surrogate pair into its supplementary code point.
     */
    static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000;

    /**
     * Returns the code point that starts at index i.
     * If the unit at i is a lead surrogate that is immediately followed by a
     * trail surrogate, the combined supplementary code point is returned;
     * otherwise the single code unit is returned as is.
     *
     * @param s String in question
     * @param i index from which we want a code point
     * @return int code point at index i
     */
    public static final int nextCodePoint(String s, int i) {
        return decodeForward(s, i);
    }

    /**
     * Returns the code point that ends just before index i (predecrement).
     * If the unit at i - 1 is a trail surrogate that is immediately preceded
     * by a lead surrogate, the combined supplementary code point is returned;
     * otherwise the single code unit is returned as is.
     *
     * @param s String in question
     * @param i index in string
     * @return int code point at index --i
     */
    public static final int prevCodePoint(String s, int i) {
        return decodeBackward(s, i);
    }

    /**
     * Returns the code point that starts at index i.
     * Same contract as the String overload.
     *
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int code point at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        return decodeForward(s, i);
    }

    /**
     * Returns the code point that ends just before index i (predecrement).
     * Same contract as the String overload.
     *
     * @param s StringBuffer in question
     * @param i index in string
     * @return int code point at index --i
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        return decodeBackward(s, i);
    }

    /**
     * Returns the length in UTF-16 code units of a given code point.
     *
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return c <= 0xffff ? 1 : 2;
    }

    /**
     * Appends a code point to a StringBuffer, as one code unit for values up
     * to 0xffff and as a surrogate pair otherwise.
     *
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch <= 0xffff) {
            buffer.append((char) ch);
        } else {
            // 0xd7c0 == 0xd800 - (0x10000 >> 10): folds the 0x10000 bias
            // into the lead surrogate base.
            buffer.append((char) (0xd7c0 + (ch >> 10)));
            buffer.append((char) (0xdc00 + (ch & 0x3ff)));
        }
    }

    /**
     * Inserts a code point into a StringBuffer, as one code unit for values
     * up to 0xffff and as a surrogate pair otherwise.
     *
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch <= 0xffff) {
            buffer.insert(i, (char) ch);
        } else {
            buffer.insert(i, (char) (0xd7c0 + (ch >> 10)))
                  .insert(i + 1, (char) (0xdc00 + (ch & 0x3ff)));
        }
    }

    /**
     * Changes the code point at a given index. Can change the length of the
     * buffer when a one-unit code point is replaced by a two-unit one or
     * vice versa.
     *
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length (-1, 0 or +1)
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        int cp = nextCodePoint(buffer, i);

        if (ch <= 0xffff && cp <= 0xffff) { // Both BMP
            buffer.setCharAt(i, (char) ch);
            return 0;
        } else if (ch > 0xffff && cp > 0xffff) { // Both supplementary
            buffer.setCharAt(i, (char) (0xd7c0 + (ch >> 10)));
            buffer.setCharAt(i + 1, (char) (0xdc00 + (ch & 0x3ff)));
            return 0;
        } else if (ch <= 0xffff && cp > 0xffff) { // BMP replaces supplementary, buffer shrinks
            buffer.setCharAt(i, (char) ch);
            buffer.deleteCharAt(i + 1);
            return -1;
        } else { // supplementary replaces BMP, buffer grows
            buffer.setCharAt(i, (char) (0xd7c0 + (ch >> 10)));
            buffer.insert(i + 1, (char) (0xdc00 + (ch & 0x3ff)));
            return +1;
        }
    }

    /**
     * Counts the UTF-32 code points in a UTF-16 encoded string. A trail
     * surrogate directly after a lead surrogate counts as zero; every other
     * code unit (including unpaired surrogates) counts as one.
     *
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        return countCodePoints(source);
    }

    /**
     * Counts the UTF-32 code points in a UTF-16 encoded string buffer.
     * Same contract as the String overload.
     *
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        return countCodePoints(source);
    }

    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     *
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return 1;
        }
        return 2;
    }

    /**
     * Lead surrogate maximum value
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Lead surrogate minimum value
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Trail surrogate minimum value
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Determines whether the code value is a surrogate.
     *
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     *
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
                && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     *
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
                && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>getCharCount()</code>), as well as random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code>
     * on the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned.
     *
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the code point that contains the char at
     *         offset16; a surrogate pair is never combined across the
     *         start or limit boundary.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     */
    public static int charAt(char source[], int start, int limit,
                             int offset16) {
        // offset16 is relative to start; convert to an absolute array index.
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            ++offset16;
            if (offset16 >= limit) {
                return single; // lead surrogate at the very end of the range
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(single, trail);
            }
        } else { // isTrailSurrogate(single), so look at the preceding unit
            if (offset16 == start) {
                return single; // trail surrogate at the very start of the range
            }
            --offset16;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Offset to add to combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
            SUPPLEMENTARY_MIN_VALUE
            - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
            - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Forms a supplementary code point from the argument characters.<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done.
     *
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /** Shared forward decoder behind both nextCodePoint overloads. */
    private static int decodeForward(CharSequence text, int index) {
        int ch = text.charAt(index);
        if (0xd800 <= ch && ch <= 0xdbff && index + 1 < text.length()) {
            int trail = text.charAt(index + 1);
            if (0xdc00 <= trail && trail <= 0xdfff) {
                return (ch << 10) + trail - suppOffset;
            }
        }
        return ch;
    }

    /** Shared backward decoder behind both prevCodePoint overloads. */
    private static int decodeBackward(CharSequence text, int index) {
        int ch = text.charAt(index - 1);
        if (0xdc00 <= ch && ch <= 0xdfff && index - 2 >= 0) {
            int lead = text.charAt(index - 2);
            if (0xd800 <= lead && lead <= 0xdbff) {
                return (lead << 10) + ch - suppOffset;
            }
        }
        return ch;
    }

    /** Shared counter behind both countCodePoint overloads. */
    private static int countCodePoints(CharSequence source) {
        int result = 0;
        boolean hadLeadSurrogate = false;

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) {
                hadLeadSurrogate = false; // count valid trail as zero
            } else {
                hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff);
                ++result; // count others as 1
            }
        }
        return result;
    }
}
\r