/*
 *******************************************************************************
 * Copyright (C) 2002-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.dev.test;
/**
 * Utility class for supplementary code point support in UTF-16 text.
 * This one is written purely for updating the Normalization sample from the
 * unicode.org site. If you want the real thing, use the ICU4J UTF16 class.
 *
 * <p>None of the methods validate that a code point lies in the Unicode range;
 * they only distinguish "fits in one UTF-16 code unit" (at most 0xFFFF) from
 * "needs a surrogate pair" (above 0xFFFF).
 *
 * @author Vladimir Weinstein, Markus Scherer
 */
public class UTF16Util {
    /**
     * The minimum value for Supplementary code points
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Lead surrogate minimum value
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Lead surrogate maximum value
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Trail surrogate minimum value
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Trail surrogate maximum value
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low ten bits of a code point, which are carried by
     * the trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3ff;

    /**
     * Value added to (code point >> 10) to obtain the lead surrogate
     * (0xD7C0). Folding the 0x10000 bias into this constant avoids a separate
     * subtraction when splitting a supplementary code point.
     */
    private static final int LEAD_OFFSET_ =
        LEAD_SURROGATE_MIN_VALUE
        - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    /**
     * Offset to add to a combined surrogate pair to avoid masking.
     */
    private static final int SURROGATE_OFFSET_ =
        SUPPLEMENTARY_MIN_VALUE
        - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_)
        - TRAIL_SURROGATE_MIN_VALUE;

    /**
     * Amount to subtract from (lead << 10) + trail to obtain the code point.
     * This is the negation of the private surrogate offset; it is kept as a
     * separate package-visible constant so existing references keep working.
     */
    static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000;

    /**
     * Method nextCodePoint. Returns the next code point in a string.
     * An unpaired surrogate is returned as is.
     *
     * @param s String in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(String s, int i) {
        char first = s.charAt(i);
        if (isLeadSurrogate(first) && i + 1 < s.length()) {
            char second = s.charAt(i + 1);
            if (isTrailSurrogate(second)) {
                return getRawSupplementary(first, second);
            }
        }
        return first;
    }

    /**
     * Method prevCodePoint. Gets the code point preceding index i
     * (predecrement). An unpaired surrogate is returned as is.
     *
     * @param s String in question
     * @param i index in string
     * @return int codepoint ending just before index i
     */
    public static final int prevCodePoint(String s, int i) {
        char last = s.charAt(i - 1);
        if (isTrailSurrogate(last) && i - 2 >= 0) {
            char before = s.charAt(i - 2);
            if (isLeadSurrogate(before)) {
                return getRawSupplementary(before, last);
            }
        }
        return last;
    }

    /**
     * Method nextCodePoint. Returns the next code point in a string buffer.
     * An unpaired surrogate is returned as is.
     *
     * @param s StringBuffer in question
     * @param i index from which we want a code point
     * @return int codepoint at index i
     */
    public static final int nextCodePoint(StringBuffer s, int i) {
        char first = s.charAt(i);
        if (isLeadSurrogate(first) && i + 1 < s.length()) {
            char second = s.charAt(i + 1);
            if (isTrailSurrogate(second)) {
                return getRawSupplementary(first, second);
            }
        }
        return first;
    }

    /**
     * Method prevCodePoint. Gets the code point preceding index i
     * (predecrement). An unpaired surrogate is returned as is.
     *
     * @param s StringBuffer in question
     * @param i index in string
     * @return int codepoint ending just before index i
     */
    public static final int prevCodePoint(StringBuffer s, int i) {
        char last = s.charAt(i - 1);
        if (isTrailSurrogate(last) && i - 2 >= 0) {
            char before = s.charAt(i - 2);
            if (isLeadSurrogate(before)) {
                return getRawSupplementary(before, last);
            }
        }
        return last;
    }

    /**
     * Method codePointLength. Returns the length in UTF-16 code units of a
     * given code point.
     *
     * @param c code point in question
     * @return int length in UTF-16 code units. Can be 1 or 2
     */
    public static final int codePointLength(int c) {
        return getCharCount(c);
    }

    /**
     * Method appendCodePoint. Appends a code point to a StringBuffer.
     *
     * @param buffer StringBuffer in question
     * @param ch code point to append
     */
    public static final void appendCodePoint(StringBuffer buffer, int ch) {
        if (ch <= 0xffff) {
            buffer.append((char) ch);
        } else {
            buffer.append(leadOf(ch));
            buffer.append(trailOf(ch));
        }
    }

    /**
     * Method insertCodePoint. Inserts a code point in a StringBuffer.
     *
     * @param buffer StringBuffer in question
     * @param i index at which we want code point to be inserted
     * @param ch code point to be inserted
     */
    public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {
        if (ch <= 0xffff) {
            buffer.insert(i, (char) ch);
        } else {
            buffer.insert(i, leadOf(ch));
            buffer.insert(i + 1, trailOf(ch));
        }
    }

    /**
     * Method setCodePointAt. Changes a code point at a given index. Can
     * change the length of the string.
     *
     * @param buffer StringBuffer in question
     * @param i index at which we want to change the contents
     * @param ch replacement code point
     * @return int difference in resulting StringBuffer length: 0 when the old
     *         and new code points need the same number of code units, -1 when
     *         a surrogate pair is replaced by a single unit, 1 when a single
     *         unit is replaced by a surrogate pair
     */
    public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {
        boolean oldIsPair = nextCodePoint(buffer, i) > 0xffff;
        boolean newIsPair = ch > 0xffff;

        if (!newIsPair) {
            buffer.setCharAt(i, (char) ch);
            if (oldIsPair) {
                // BMP replaces supplementary: drop the old trail, buffer shrinks.
                buffer.deleteCharAt(i + 1);
                return -1;
            }
            return 0;
        }

        buffer.setCharAt(i, leadOf(ch));
        if (oldIsPair) {
            // Supplementary replaces supplementary: overwrite the old trail.
            buffer.setCharAt(i + 1, trailOf(ch));
            return 0;
        }
        // Supplementary replaces BMP: make room for the trail, buffer grows.
        buffer.insert(i + 1, trailOf(ch));
        return 1;
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points in a UTF-16
     * encoded string. A trail surrogate directly after a lead surrogate is
     * counted together with that lead; every other code unit counts as one.
     *
     * @param source String in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(String source) {
        int result = 0;
        boolean hadLeadSurrogate = false;

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            if (hadLeadSurrogate && isTrailSurrogate(ch)) {
                hadLeadSurrogate = false; // count valid trail as zero
            } else {
                hadLeadSurrogate = isLeadSurrogate(ch);
                ++result; // count others as 1
            }
        }

        return result;
    }

    /**
     * Method countCodePoint. Counts the UTF-32 code points in a UTF-16
     * encoded string buffer. A trail surrogate directly after a lead
     * surrogate is counted together with that lead; every other code unit
     * counts as one.
     *
     * @param source StringBuffer in question.
     * @return int number of code points in this string
     */
    public static final int countCodePoint(StringBuffer source) {
        int result = 0;
        boolean hadLeadSurrogate = false;

        for (int i = 0; i < source.length(); ++i) {
            char ch = source.charAt(i);
            if (hadLeadSurrogate && isTrailSurrogate(ch)) {
                hadLeadSurrogate = false; // count valid trail as zero
            } else {
                hadLeadSurrogate = isLeadSurrogate(ch);
                ++result; // count others as 1
            }
        }

        return result;
    }

    /**
     * Determines how many chars this char32 requires. No validity check is
     * made on char32; callers needing one (for example
     * {@code UCharacter.isLegal()}) must perform it before calling.
     *
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     */
    public static int getCharCount(int char32) {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return 1;
        }
        return 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     *
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     */
    public static boolean isSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     *
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     */
    public static boolean isTrailSurrogate(char char16) {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     *
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     */
    public static boolean isLeadSurrogate(char char16) {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Extract a single UTF-32 value from a substring. Usable when iterating
     * forwards or backwards (with {@code getCharCount()}) as well as for
     * random access. No validity check is made on the result.
     *
     * <p>If the char retrieved is part of a surrogate pair that lies within
     * [start, limit), its supplementary character is returned, whichever half
     * of the pair offset16 points at. If a complete pair is not found, the
     * unmatched surrogate itself is returned.
     *
     * @param source array of UTF-16 chars
     * @param start offset to substring in the source array for analyzing
     * @param limit offset to substring in the source array for analyzing
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the code point that contains the char at
     *         offset16
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     */
    public static int charAt(char source[], int start, int limit,
                             int offset16) {
        // offset16 is relative to start; convert it to an absolute index.
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // single is a lead surrogate: look for a trail right after it.
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return getRawSupplementary(single, trail);
            }
        } else {
            // single is a trail surrogate: look for a lead right before it.
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return getRawSupplementary(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Forms a supplementary code point from the argument characters.<br>
     * Note this is for internal use hence no checks for the validity of the
     * surrogate characters are done.
     *
     * @param lead lead surrogate character
     * @param trail trailing surrogate character
     * @return code point of the supplementary character
     */
    public static int getRawSupplementary(char lead, char trail) {
        return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    }

    /** Returns the lead surrogate for a code point above 0xFFFF. */
    private static char leadOf(int codePoint) {
        return (char) (LEAD_OFFSET_ + (codePoint >> LEAD_SURROGATE_SHIFT_));
    }

    /** Returns the trail surrogate for a code point above 0xFFFF. */
    private static char trailOf(int codePoint) {
        return (char) (TRAIL_SURROGATE_MIN_VALUE
            + (codePoint & TRAIL_SURROGATE_MASK_));
    }
}