jars/icu4j-4_4_2-src/main/tests/framework/src/com/ibm/icu/dev/test/UTF16Util.java

   1 /**\r
   2 *******************************************************************************\r
   3 * Copyright (C) 2002-2004, International Business Machines Corporation and    *\r
   4 * others. All Rights Reserved.                                                *\r
   5 *******************************************************************************\r
   6 */\r
   7 package com.ibm.icu.dev.test;\r
   8 \r
   9 /**\r
  10  * Utility class for supplementary code point \r
  11  * support. This one is written purely for updating\r
  12  * Normalization sample from the unicode.org site.\r
  13  * If you want the real thing, use UTF16 class\r
  14  * from ICU4J\r
  15  * @author Vladimir Weinstein, Markus Scherer\r
  16  */\r
  17 public class UTF16Util {\r
  18     static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000;\r
  19 \r
  20     /**\r
  21      * Method nextCodePoint. Returns the next code point\r
  22      * in a string. \r
  23      * @param s String in question\r
  24      * @param i index from which we want a code point\r
  25      * @return int codepoint at index i\r
  26      */\r
  27     public static final int nextCodePoint(String s, int i) {\r
  28         int ch = s.charAt(i);\r
  29         if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) {\r
  30             int ch2 = s.charAt(i);\r
  31             if (0xdc00 <= ch2 && ch2 <= 0xdfff) {\r
  32                 ch = (ch << 10) + ch2 - suppOffset;\r
  33             }\r
  34         }\r
  35         return ch;\r
  36     }\r
  37 \r
  38     /**\r
  39      * Method prevCodePoint. Gets the code point preceding\r
  40      * index i (predecrement). \r
  41      * @param s String in question\r
  42      * @param i index in string\r
  43      * @return int codepoint at index --i\r
  44      */\r
  45     public static final int prevCodePoint(String s, int i) {\r
  46         int ch = s.charAt(--i);\r
  47         if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) {\r
  48             int ch2 = s.charAt(i);\r
  49             if (0xd800 <= ch2 && ch2 <= 0xdbff) {\r
  50                 ch = (ch2 << 10) + ch - suppOffset;\r
  51             }\r
  52         }\r
  53         return ch;\r
  54     }\r
  55 \r
  56     /**\r
  57      * Method nextCodePoint. Returns the next code point\r
  58      * in a string. \r
  59      * @param s StringBuffer in question\r
  60      * @param i index from which we want a code point\r
  61      * @return int codepoint at index i\r
  62      */\r
  63     public static final int nextCodePoint(StringBuffer s, int i) {\r
  64         int ch = s.charAt(i);\r
  65         if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) {\r
  66             int ch2 = s.charAt(i);\r
  67             if (0xdc00 <= ch2 && ch2 <= 0xdfff) {\r
  68                 ch = (ch << 10) + ch2 - suppOffset;\r
  69             }\r
  70         }\r
  71         return ch;\r
  72     }\r
  73 \r
  74     /**\r
  75      * Method prevCodePoint. Gets the code point preceding\r
  76      * index i (predecrement). \r
  77      * @param s StringBuffer in question\r
  78      * @param i index in string\r
  79      * @return int codepoint at index --i\r
  80      */\r
  81     public static final int prevCodePoint(StringBuffer s, int i) {\r
  82         int ch = s.charAt(--i);\r
  83         if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) {\r
  84             int ch2 = s.charAt(i);\r
  85             if (0xd800 <= ch2 && ch2 <= 0xdbff) {\r
  86                 ch = (ch2 << 10) + ch - suppOffset;\r
  87             }\r
  88         }\r
  89         return ch;\r
  90     }\r
  91 \r
  92     /**\r
  93      * Method codePointLength. Returns the length \r
  94      * in UTF-16 code units of a given code point\r
  95      * @param c code point in question\r
  96      * @return int length in UTF-16 code units. Can be 1 or 2\r
  97      */\r
  98     public static final int codePointLength(int c) {\r
  99         return c <= 0xffff ? 1 : 2;\r
 100     }\r
 101 \r
 102     /**\r
 103      * Method appendCodePoint. Appends a code point\r
 104      * to a StringBuffer\r
 105      * @param buffer StringBuffer in question\r
 106      * @param ch code point to append\r
 107      */\r
 108     public static final void appendCodePoint(StringBuffer buffer, int ch) {\r
 109         if (ch <= 0xffff) {\r
 110             buffer.append((char)ch);\r
 111         } else {\r
 112             buffer.append((char)(0xd7c0 + (ch >> 10)));\r
 113             buffer.append((char)(0xdc00 + (ch & 0x3ff))); \r
 114         }\r
 115     }\r
 116 \r
 117     /**\r
 118      * Method insertCodePoint. Inserts a code point in\r
 119      * a StringBuffer\r
 120      * @param buffer StringBuffer in question\r
 121      * @param i index at which we want code point to be inserted\r
 122      * @param ch code point to be inserted\r
 123      */\r
 124     public static final void insertCodePoint(StringBuffer buffer, int i, int ch) {\r
 125         if (ch <= 0xffff) {\r
 126             buffer.insert(i, (char)ch);\r
 127         } else {\r
 128             buffer.insert(i, (char)(0xd7c0 + (ch >> 10))).insert(i + 1, (char)(0xdc00 + (ch & 0x3ff))); \r
 129         }\r
 130     }\r
 131     \r
 132     /**\r
 133      * Method setCodePointAt. Changes a code point at a\r
 134      * given index. Can change the length of the string.\r
 135      * @param buffer StringBuffer in question\r
 136      * @param i index at which we want to change the contents\r
 137      * @param ch replacement code point\r
 138      * @return int difference in resulting StringBuffer length\r
 139      */\r
 140     public static final int setCodePointAt(StringBuffer buffer, int i, int ch) {\r
 141         int cp = nextCodePoint(buffer, i);\r
 142         \r
 143         if (ch <= 0xffff && cp <= 0xffff) { // Both BMP\r
 144             buffer.setCharAt(i, (char)ch);\r
 145             return 0;\r
 146         } else if (ch > 0xffff && cp > 0xffff) { // Both supplementary\r
 147             buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10)));\r
 148             buffer.setCharAt(i+1, (char)(0xdc00 + (ch & 0x3ff)));\r
 149             return 0;\r
 150         } else if (ch <= 0xffff && cp > 0xffff) { // putting BMP instead of supplementary, buffer shrinks\r
 151             buffer.setCharAt(i, (char)ch);\r
 152             buffer.deleteCharAt(i+1);\r
 153             return -1;\r
 154         } else { //if (ch > 0xffff && cp <= 0xffff) { // putting supplementary instead of BMP, buffer grows\r
 155             buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10)));\r
 156             buffer.insert(i+1, (char)(0xdc00 + (ch & 0x3ff))); \r
 157             return 1;           \r
 158         }\r
 159     }\r
 160 \r
 161     /**\r
 162      * Method countCodePoint. Counts the UTF-32 code points\r
 163      * in a UTF-16 encoded string.\r
 164      * @param source String in question.\r
 165      * @return int number of code points in this string\r
 166      */\r
 167     public static final int countCodePoint(String source) \r
 168     {         \r
 169         int result = 0;\r
 170         char ch;\r
 171         boolean hadLeadSurrogate = false;\r
 172         \r
 173         for (int i = 0; i < source.length(); ++ i) \r
 174         {\r
 175             ch = source.charAt(i);\r
 176             if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) {\r
 177                 hadLeadSurrogate = false;           // count valid trail as zero\r
 178             }\r
 179             else\r
 180             {\r
 181                 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff);\r
 182                 ++ result;                          // count others as 1\r
 183             }\r
 184         }\r
 185         \r
 186         return result;\r
 187     }\r
 188     \r
 189     /**\r
 190      * Method countCodePoint. Counts the UTF-32 code points\r
 191      * in a UTF-16 encoded string.\r
 192      * @param source StringBuffer in question.\r
 193      * @return int number of code points in this string\r
 194      */\r
 195     public static final int countCodePoint(StringBuffer source) \r
 196     {         \r
 197         int result = 0;\r
 198         char ch;\r
 199         boolean hadLeadSurrogate = false;\r
 200         \r
 201         for (int i = 0; i < source.length(); ++ i) \r
 202         {\r
 203             ch = source.charAt(i);\r
 204             if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) {\r
 205                 hadLeadSurrogate = false;           // count valid trail as zero\r
 206             }\r
 207             else\r
 208             {\r
 209                 hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff);\r
 210                 ++ result;                          // count others as 1\r
 211             }\r
 212         }\r
 213         \r
 214         return result;\r
 215     }\r
 216     /**\r
 217      * The minimum value for Supplementary code points\r
 218      */\r
 219     public static final int SUPPLEMENTARY_MIN_VALUE  = 0x10000;  \r
 220     /**\r
 221      * Determines how many chars this char32 requires.\r
 222      * If a validity check is required, use <code>\r
 223      * <a href="../UCharacter.html#isLegal(char)">isLegal()</a></code> on \r
 224      * char32 before calling.\r
 225      * @param char32 the input codepoint.\r
 226      * @return 2 if is in supplementary space, otherwise 1. \r
 227      */\r
 228     public static int getCharCount(int char32) \r
 229     {\r
 230         if (char32 < SUPPLEMENTARY_MIN_VALUE) {\r
 231             return 1;\r
 232         }\r
 233         return 2;\r
 234     }\r
 235     /**\r
 236      * Lead surrogate maximum value\r
 237      * @stable ICU 2.1\r
 238      */\r
 239     public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;\r
 240     /**\r
 241      * Lead surrogate minimum value\r
 242      * @stable ICU 2.1\r
 243      */\r
 244     public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;\r
 245     \r
 246     /**\r
 247      * Trail surrogate minimum value\r
 248      * @stable ICU 2.1\r
 249      */\r
 250     public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; \r
 251     /**\r
 252      * Trail surrogate maximum value\r
 253      * @stable ICU 2.1\r
 254      */\r
 255     public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;\r
 256     /**\r
 257      * Determines whether the code value is a surrogate.\r
 258      * @param char16 the input character.\r
 259      * @return true iff the input character is a surrogate.\r
 260      * @stable ICU 2.1\r
 261      */\r
 262     public static boolean isSurrogate(char char16) \r
 263     {\r
 264         return LEAD_SURROGATE_MIN_VALUE <= char16 && \r
 265             char16 <= TRAIL_SURROGATE_MAX_VALUE;\r
 266     }\r
 267         \r
 268     /**\r
 269      * Determines whether the character is a trail surrogate.\r
 270      * @param char16 the input character.\r
 271      * @return true iff the input character is a trail surrogate.\r
 272      * @stable ICU 2.1\r
 273      */\r
 274     public static boolean isTrailSurrogate(char char16) \r
 275     {\r
 276         return (TRAIL_SURROGATE_MIN_VALUE <= char16 && \r
 277                 char16 <= TRAIL_SURROGATE_MAX_VALUE);\r
 278     }\r
 279         \r
 280     /**\r
 281      * Determines whether the character is a lead surrogate.\r
 282      * @param char16 the input character.\r
 283      * @return true iff the input character is a lead surrogate\r
 284      * @stable ICU 2.1\r
 285      */\r
 286     public static boolean isLeadSurrogate(char char16) \r
 287     {\r
 288         return LEAD_SURROGATE_MIN_VALUE <= char16 && \r
 289             char16 <= LEAD_SURROGATE_MAX_VALUE;\r
 290     }\r
 291     /**\r
 292      * Extract a single UTF-32 value from a substring.\r
 293      * Used when iterating forwards or backwards (with\r
 294      * <code>UTF16.getCharCount()</code>, as well as random access. If a\r
 295      * validity check is required, use \r
 296      * <code><a href="../UCharacter.html#isLegal(char)">UCharacter.isLegal()\r
 297      * </a></code> on the return value.\r
 298      * If the char retrieved is part of a surrogate pair, its supplementary\r
 299      * character will be returned. If a complete supplementary character is \r
 300      * not found the incomplete character will be returned\r
 301      * @param source array of UTF-16 chars\r
 302      * @param start offset to substring in the source array for analyzing\r
 303      * @param limit offset to substring in the source array for analyzing\r
 304      * @param offset16 UTF-16 offset relative to start\r
 305      * @return UTF-32 value for the UTF-32 value that contains the char at\r
 306      *         offset16. The boundaries of that codepoint are the same as in\r
 307      *         <code>bounds32()</code>.\r
 308      * @exception IndexOutOfBoundsException thrown if offset16 is not within \r
 309      *            the range of start and limit.\r
 310      * @stable ICU 2.1\r
 311      */\r
 312     public static int charAt(char source[], int start, int limit, \r
 313                              int offset16)\r
 314     {\r
 315         offset16 += start;\r
 316         if (offset16 < start || offset16 >= limit) {\r
 317             throw new ArrayIndexOutOfBoundsException(offset16);\r
 318         }\r
 319             \r
 320         char single = source[offset16];\r
 321         if (!isSurrogate(single)) {\r
 322             return single;\r
 323         }\r
 324 \r
 325         // Convert the UTF-16 surrogate pair if necessary.\r
 326         // For simplicity in usage, and because the frequency of pairs is \r
 327         // low, look both directions.      \r
 328         if (single <= LEAD_SURROGATE_MAX_VALUE) {\r
 329             offset16 ++;\r
 330             if (offset16 >= limit) {\r
 331                 return single;\r
 332             }\r
 333             char trail = source[offset16];\r
 334             if (isTrailSurrogate(trail)) {\r
 335                 return getRawSupplementary(single, trail);\r
 336             }\r
 337         } \r
 338         else { // isTrailSurrogate(single), so\r
 339             if (offset16 == start) {\r
 340                 return single;\r
 341             }\r
 342             offset16 --;\r
 343             char lead = source[offset16];\r
 344             if (isLeadSurrogate(lead))\r
 345                 return getRawSupplementary(lead, single);\r
 346         }\r
 347         return single; // return unmatched surrogate\r
 348     }\r
 349     /**\r
 350      * Shift value for lead surrogate to form a supplementary character.\r
 351      */\r
 352     private static final int LEAD_SURROGATE_SHIFT_ = 10;\r
 353     \r
 354     /** \r
 355      * Offset to add to combined surrogate pair to avoid msking.\r
 356      */\r
 357     private static final int SURROGATE_OFFSET_ = \r
 358                            SUPPLEMENTARY_MIN_VALUE - \r
 359                            (LEAD_SURROGATE_MIN_VALUE << \r
 360                            LEAD_SURROGATE_SHIFT_) - \r
 361                            TRAIL_SURROGATE_MIN_VALUE;   \r
 362 \r
 363     \r
 364    /**\r
 365     * Forms a supplementary code point from the argument character<br>\r
 366     * Note this is for internal use hence no checks for the validity of the\r
 367     * surrogate characters are done\r
 368     * @param lead lead surrogate character\r
 369     * @param trail trailing surrogate character\r
 370     * @return code point of the supplementary character\r
 371     */\r
 372     public static int getRawSupplementary(char lead, char trail)\r
 373     {\r
 374         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;\r
 375     }\r
 376     \r
 377 }\r