jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF8.java

   1 /**\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2006-2008, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  *\r
   7  *******************************************************************************\r
   8  */\r
   9 \r
  10 package com.ibm.icu.charset;\r
  11 \r
  12 import java.nio.ByteBuffer;\r
  13 import java.nio.CharBuffer;\r
  14 import java.nio.IntBuffer;\r
  15 import java.nio.charset.CharsetDecoder;\r
  16 import java.nio.charset.CharsetEncoder;\r
  17 import java.nio.charset.CoderResult;\r
  18 \r
  19 import com.ibm.icu.text.UTF16;\r
  20 import com.ibm.icu.text.UnicodeSet;\r
  21 \r
  22 /**\r
  23  * @author Niti Hantaweepant\r
  24  */\r
  25 class CharsetUTF8 extends CharsetICU {\r
  26 \r
  27     private static final byte[] fromUSubstitution = new byte[] { (byte) 0xef, (byte) 0xbf, (byte) 0xbd };\r
  28 \r
  29     public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {\r
  30         super(icuCanonicalName, javaCanonicalName, aliases);\r
  31         /* max 3 bytes per code unit from UTF-8 (4 bytes from surrogate _pair_) */\r
  32         maxBytesPerChar = 3;\r
  33         minBytesPerChar = 1;\r
  34         maxCharsPerByte = 1;\r
  35     }\r
  36 \r
  37     private static final int BITMASK_FROM_UTF8[] = { -1, 0x7f, 0x1f, 0xf, 0x7, 0x3, 0x1 };\r
  38 \r
  39     private static final byte BYTES_FROM_UTF8[] = {\r
  40         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
  41         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
  42         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
  43         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
  44         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\r
  45         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\r
  46         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
  47         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0\r
  48     };\r
  49 \r
  50     /*\r
  51      * Starting with Unicode 3.0.1: UTF-8 byte sequences of length N _must_ encode code points of or\r
  52      * above utf8_minChar32[N]; byte sequences with more than 4 bytes are illegal in UTF-8, which is\r
  53      * tested with impossible values for them\r
  54      */\r
  55     private static final int UTF8_MIN_CHAR32[] = { 0, 0, 0x80, 0x800, 0x10000,\r
  56             Integer.MAX_VALUE, Integer.MAX_VALUE };\r
  57 \r
  58     private final boolean isCESU8 = this instanceof CharsetCESU8;\r
  59 \r
  60     class CharsetDecoderUTF8 extends CharsetDecoderICU {\r
  61 \r
  62         public CharsetDecoderUTF8(CharsetICU cs) {\r
  63             super(cs);\r
  64         }\r
  65 \r
  66         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,\r
  67                 boolean flush) {\r
  68             if (!source.hasRemaining()) {\r
  69                 /* no input, nothing to do */\r
  70                 return CoderResult.UNDERFLOW;\r
  71             }\r
  72             if (!target.hasRemaining()) {\r
  73                 /* no output available, can't do anything */\r
  74                 return CoderResult.OVERFLOW;\r
  75             }\r
  76 \r
  77             if (source.hasArray() && target.hasArray()) {\r
  78                 /* source and target are backed by arrays, so use the arrays for optimal performance */\r
  79                 byte[] sourceArray = source.array();\r
  80                 int sourceIndex = source.arrayOffset() + source.position();\r
  81                 int sourceLimit = source.arrayOffset() + source.limit();\r
  82                 char[] targetArray = target.array();\r
  83                 int targetIndex = target.arrayOffset() + target.position();\r
  84                 int targetLimit = target.arrayOffset() + target.limit();\r
  85 \r
  86                 byte ch;\r
  87                 int char32, bytesExpected, bytesSoFar;\r
  88                 CoderResult cr;\r
  89 \r
  90                 if (mode == 0) {\r
  91                     /* nothing is stored in toUnicodeStatus, read a byte as input */\r
  92                     char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff;\r
  93                     bytesExpected = BYTES_FROM_UTF8[char32];\r
  94                     char32 &= BITMASK_FROM_UTF8[bytesExpected];\r
  95                     bytesSoFar = 1;\r
  96                 } else {\r
  97                     /* a partially or fully built code point is stored in toUnicodeStatus */\r
  98                     char32 = toUnicodeStatus;\r
  99                     bytesExpected = mode;\r
 100                     bytesSoFar = toULength;\r
 101 \r
 102                     toUnicodeStatus = 0;\r
 103                     mode = 0;\r
 104                     toULength = 0;\r
 105                 }\r
 106 \r
 107                 outer: while (true) {\r
 108                     if (bytesSoFar < bytesExpected) {\r
 109                         /* read a trail byte and insert its relevant bits into char32 */\r
 110                         if (sourceIndex >= sourceLimit) {\r
 111                             /* no source left, save the state for later and break out of the loop */\r
 112                             toUnicodeStatus = char32;\r
 113                             mode = bytesExpected;\r
 114                             toULength = bytesSoFar;\r
 115                             cr = CoderResult.UNDERFLOW;\r
 116                             break;\r
 117                         }\r
 118                         if (((ch = toUBytesArray[bytesSoFar] = sourceArray[sourceIndex++]) & 0xc0) != 0x80) {\r
 119                             /* not a trail byte (is not of the form 10xxxxxx) */\r
 120                             sourceIndex--;\r
 121                             toULength = bytesSoFar;\r
 122                             cr = CoderResult.malformedForLength(bytesSoFar);\r
 123                             break;\r
 124                         }\r
 125                         char32 = (char32 << 6) | (ch & 0x3f);\r
 126                         bytesSoFar++;\r
 127                     } else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff\r
 128                             && (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) {\r
 129                         /*\r
 130                          * char32 is a valid code point and is composed of the correct number of\r
 131                          * bytes ... we now need to output it in UTF-16\r
 132                          */\r
 133 \r
 134                         if (char32 <= UConverterConstants.MAXIMUM_UCS2) {\r
 135                             /* fits in 16 bits */\r
 136                             targetArray[targetIndex++] = (char) char32;\r
 137                         } else {\r
 138                             /* fit char32 into 20 bits */\r
 139                             char32 -= UConverterConstants.HALF_BASE;\r
 140 \r
 141                             /* write out the surrogates */\r
 142                             targetArray[targetIndex++] = (char) ((char32 >>> UConverterConstants.HALF_SHIFT) + UConverterConstants.SURROGATE_HIGH_START);\r
 143 \r
 144                             if (targetIndex >= targetLimit) {\r
 145                                 /* put in overflow buffer (not handled here) */\r
 146                                 charErrorBufferArray[charErrorBufferBegin++] = (char) char32;\r
 147                                 cr = CoderResult.OVERFLOW;\r
 148                                 break;\r
 149                             }\r
 150                             targetArray[targetIndex++] = (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START);\r
 151                         }\r
 152 \r
 153                         /*\r
 154                          * we're finished outputing, so now we need to read in the first byte of the\r
 155                          * next byte sequence that could form a code point\r
 156                          */\r
 157 \r
 158                         if (sourceIndex >= sourceLimit) {\r
 159                             cr = CoderResult.UNDERFLOW;\r
 160                             break;\r
 161                         }\r
 162                         if (targetIndex >= targetLimit) {\r
 163                             cr = CoderResult.OVERFLOW;\r
 164                             break;\r
 165                         }\r
 166 \r
 167                         /* keep reading the next input (and writing it) while bytes == 1 */\r
 168                         while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = sourceArray[sourceIndex++]) & 0xff]) == 1) {\r
 169                             targetArray[targetIndex++] = (char) char32;\r
 170                             if (sourceIndex >= sourceLimit) {\r
 171                                 cr = CoderResult.UNDERFLOW;\r
 172                                 break outer;\r
 173                             }\r
 174                             if (targetIndex >= targetLimit) {\r
 175                                 cr = CoderResult.OVERFLOW;\r
 176                                 break outer;\r
 177                             }\r
 178                         }\r
 179 \r
 180                         /* remove the bits that indicate the number of bytes */\r
 181                         char32 &= BITMASK_FROM_UTF8[bytesExpected];\r
 182                         bytesSoFar = 1;\r
 183                     } else {\r
 184                         /*\r
 185                          * either the lead byte in the code sequence is invalid (bytes == 0) or the\r
 186                          * lead byte combined with all the trail chars does not form a valid code\r
 187                          * point\r
 188                          */\r
 189                         toULength = bytesSoFar;\r
 190                         cr = CoderResult.malformedForLength(bytesSoFar);\r
 191                         break;\r
 192                     }\r
 193                 }\r
 194 \r
 195                 source.position(sourceIndex - source.arrayOffset());\r
 196                 target.position(targetIndex - target.arrayOffset());\r
 197                 return cr;\r
 198 \r
 199             } else {\r
 200 \r
 201                 int sourceIndex = source.position();\r
 202                 int sourceLimit = source.limit();\r
 203                 int targetIndex = target.position();\r
 204                 int targetLimit = target.limit();\r
 205 \r
 206                 byte ch;\r
 207                 int char32, bytesExpected, bytesSoFar;\r
 208                 CoderResult cr;\r
 209 \r
 210                 if (mode == 0) {\r
 211                     /* nothing is stored in toUnicodeStatus, read a byte as input */\r
 212                     char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff;\r
 213                     bytesExpected = BYTES_FROM_UTF8[char32];\r
 214                     char32 &= BITMASK_FROM_UTF8[bytesExpected];\r
 215                     bytesSoFar = 1;\r
 216                 } else {\r
 217                     /* a partially or fully built code point is stored in toUnicodeStatus */\r
 218                     char32 = toUnicodeStatus;\r
 219                     bytesExpected = mode;\r
 220                     bytesSoFar = toULength;\r
 221 \r
 222                     toUnicodeStatus = 0;\r
 223                     mode = 0;\r
 224                     toULength = 0;\r
 225                 }\r
 226 \r
 227                 outer: while (true) {\r
 228                     if (bytesSoFar < bytesExpected) {\r
 229                         /* read a trail byte and insert its relevant bits into char32 */\r
 230                         if (sourceIndex >= sourceLimit) {\r
 231                             /* no source left, save the state for later and break out of the loop */\r
 232                             toUnicodeStatus = char32;\r
 233                             mode = bytesExpected;\r
 234                             toULength = bytesSoFar;\r
 235                             cr = CoderResult.UNDERFLOW;\r
 236                             break;\r
 237                         }\r
 238                         if (((ch = toUBytesArray[bytesSoFar] = source.get(sourceIndex++)) & 0xc0) != 0x80) {\r
 239                             /* not a trail byte (is not of the form 10xxxxxx) */\r
 240                             sourceIndex--;\r
 241                             toULength = bytesSoFar;\r
 242                             cr = CoderResult.malformedForLength(bytesSoFar);\r
 243                             break;\r
 244                         }\r
 245                         char32 = (char32 << 6) | (ch & 0x3f);\r
 246                         bytesSoFar++;\r
 247                     }\r
 248                     /*\r
 249                      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:\r
 250                      * - use only trail bytes after a lead byte (checked above)\r
 251                      * - use the right number of trail bytes for a given lead byte\r
 252                      * - encode a code point <= U+10ffff\r
 253                      * - use the fewest possible number of bytes for their code points\r
 254                      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])\r
 255                      *\r
 256                      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.\r
 257                      * There are no irregular sequences any more.\r
 258                      * In CESU-8, only surrogates, not supplementary code points, are encoded directly.\r
 259                      */\r
 260                     else if (bytesSoFar == bytesExpected && UTF8_MIN_CHAR32[bytesExpected] <= char32 && char32 <= 0x10ffff\r
 261                             && (isCESU8 ? bytesExpected <= 3 : !UTF16.isSurrogate((char) char32))) {\r
 262                         /*\r
 263                          * char32 is a valid code point and is composed of the correct number of\r
 264                          * bytes ... we now need to output it in UTF-16\r
 265                          */\r
 266 \r
 267                         if (char32 <= UConverterConstants.MAXIMUM_UCS2) {\r
 268                             /* fits in 16 bits */\r
 269                             target.put(targetIndex++, (char) char32);\r
 270                         } else {\r
 271                             /* fit char32 into 20 bits */\r
 272                             char32 -= UConverterConstants.HALF_BASE;\r
 273 \r
 274                             /* write out the surrogates */\r
 275                             target.put(\r
 276                                     targetIndex++,\r
 277                                     (char) ((char32 >>> UConverterConstants.HALF_SHIFT) + UConverterConstants.SURROGATE_HIGH_START));\r
 278 \r
 279                             if (targetIndex >= targetLimit) {\r
 280                                 /* put in overflow buffer (not handled here) */\r
 281                                 charErrorBufferArray[charErrorBufferBegin++] = (char) char32;\r
 282                                 cr = CoderResult.OVERFLOW;\r
 283                                 break;\r
 284                             }\r
 285                             target.put(\r
 286                                     targetIndex++,\r
 287                                     (char) ((char32 & UConverterConstants.HALF_MASK) + UConverterConstants.SURROGATE_LOW_START));\r
 288                         }\r
 289 \r
 290                         /*\r
 291                          * we're finished outputing, so now we need to read in the first byte of the\r
 292                          * next byte sequence that could form a code point\r
 293                          */\r
 294 \r
 295                         if (sourceIndex >= sourceLimit) {\r
 296                             cr = CoderResult.UNDERFLOW;\r
 297                             break;\r
 298                         }\r
 299                         if (targetIndex >= targetLimit) {\r
 300                             cr = CoderResult.OVERFLOW;\r
 301                             break;\r
 302                         }\r
 303 \r
 304                         /* keep reading the next input (and writing it) while bytes == 1 */\r
 305                         while ((bytesExpected = BYTES_FROM_UTF8[char32 = (toUBytesArray[0] = source.get(sourceIndex++)) & 0xff]) == 1) {\r
 306                             target.put(targetIndex++, (char) char32);\r
 307                             if (sourceIndex >= sourceLimit) {\r
 308                                 cr = CoderResult.UNDERFLOW;\r
 309                                 break outer;\r
 310                             }\r
 311                             if (targetIndex >= targetLimit) {\r
 312                                 cr = CoderResult.OVERFLOW;\r
 313                                 break outer;\r
 314                             }\r
 315                         }\r
 316 \r
 317                         /* remove the bits that indicate the number of bytes */\r
 318                         char32 &= BITMASK_FROM_UTF8[bytesExpected];\r
 319                         bytesSoFar = 1;\r
 320                     } else {\r
 321                         /*\r
 322                          * either the lead byte in the code sequence is invalid (bytes == 0) or the\r
 323                          * lead byte combined with all the trail chars does not form a valid code\r
 324                          * point\r
 325                          */\r
 326                         toULength = bytesSoFar;\r
 327                         cr = CoderResult.malformedForLength(bytesSoFar);\r
 328                         break;\r
 329                     }\r
 330                 }\r
 331 \r
 332                 source.position(sourceIndex);\r
 333                 target.position(targetIndex);\r
 334                 return cr;\r
 335             }\r
 336         }\r
 337 \r
 338     }\r
 339 \r
 340     class CharsetEncoderUTF8 extends CharsetEncoderICU {\r
 341 \r
 342         public CharsetEncoderUTF8(CharsetICU cs) {\r
 343             super(cs, fromUSubstitution);\r
 344             implReset();\r
 345         }\r
 346 \r
 347         protected void implReset() {\r
 348             super.implReset();\r
 349         }\r
 350 \r
 351         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets,\r
 352                 boolean flush) {\r
 353             if (!source.hasRemaining()) {\r
 354                 /* no input, nothing to do */\r
 355                 return CoderResult.UNDERFLOW;\r
 356             }\r
 357             if (!target.hasRemaining()) {\r
 358                 /* no output available, can't do anything */\r
 359                 return CoderResult.OVERFLOW;\r
 360             }\r
 361 \r
 362             if (source.hasArray() && target.hasArray()) {\r
 363                 /* source and target are backed by arrays, so use the arrays for optimal performance */\r
 364                 char[] sourceArray = source.array();\r
 365                 int srcIdx = source.arrayOffset() + source.position();\r
 366                 int sourceLimit = source.arrayOffset() + source.limit();\r
 367                 byte[] targetArray = target.array();\r
 368                 int tgtIdx = target.arrayOffset() + target.position();\r
 369                 int targetLimit = target.arrayOffset() + target.limit();\r
 370 \r
 371                 int char32;\r
 372                 CoderResult cr;\r
 373 \r
 374                 /* take care of the special condition of fromUChar32 not being 0 (it is a surrogate) */\r
 375                 if (fromUChar32 != 0) {\r
 376                     /* 4 bytes to encode from char32 and a following char in source */\r
 377 \r
 378                     sourceIndex = srcIdx;\r
 379                     targetIndex = tgtIdx;\r
 380                     cr = encodeFourBytes(sourceArray, targetArray, sourceLimit, targetLimit,\r
 381                             fromUChar32);\r
 382                     srcIdx = sourceIndex;\r
 383                     tgtIdx = targetIndex;\r
 384                     if (cr != null) {\r
 385                         source.position(srcIdx - source.arrayOffset());\r
 386                         target.position(tgtIdx - target.arrayOffset());\r
 387                         return cr;\r
 388                     }\r
 389                 }\r
 390 \r
 391                 while (true) {\r
 392                     if (srcIdx >= sourceLimit) {\r
 393                         /* nothing left to read */\r
 394                         cr = CoderResult.UNDERFLOW;\r
 395                         break;\r
 396                     }\r
 397                     if (tgtIdx >= targetLimit) {\r
 398                         /* no space left to write */\r
 399                         cr = CoderResult.OVERFLOW;\r
 400                         break;\r
 401                     }\r
 402 \r
 403                     /* reach the next char into char32 */\r
 404                     char32 = sourceArray[srcIdx++];\r
 405 \r
 406                     if (char32 <= 0x7f) {\r
 407                         /* 1 byte to encode from char32 */\r
 408 \r
 409                         targetArray[tgtIdx++] = encodeHeadOf1(char32);\r
 410 \r
 411                     } else if (char32 <= 0x7ff) {\r
 412                         /* 2 bytes to encode from char32 */\r
 413 \r
 414                         targetArray[tgtIdx++] = encodeHeadOf2(char32);\r
 415 \r
 416                         if (tgtIdx >= targetLimit) {\r
 417                             errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 418                             cr = CoderResult.OVERFLOW;\r
 419                             break;\r
 420                         }\r
 421                         targetArray[tgtIdx++] = encodeLastTail(char32);\r
 422 \r
 423                     } else if (!UTF16.isSurrogate((char) char32) || isCESU8) {\r
 424                         /* 3 bytes to encode from char32 */\r
 425 \r
 426                         targetArray[tgtIdx++] = encodeHeadOf3(char32);\r
 427 \r
 428                         if (tgtIdx >= targetLimit) {\r
 429                             errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);\r
 430                             errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 431                             cr = CoderResult.OVERFLOW;\r
 432                             break;\r
 433                         }\r
 434                         targetArray[tgtIdx++] = encodeSecondToLastTail(char32);\r
 435 \r
 436                         if (tgtIdx >= targetLimit) {\r
 437                             errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 438                             cr = CoderResult.OVERFLOW;\r
 439                             break;\r
 440                         }\r
 441                         targetArray[tgtIdx++] = encodeLastTail(char32);\r
 442 \r
 443                     } else {\r
 444                         /* 4 bytes to encode from char32 and a following char in source */\r
 445 \r
 446                         sourceIndex = srcIdx;\r
 447                         targetIndex = tgtIdx;\r
 448                         cr = encodeFourBytes(sourceArray, targetArray, sourceLimit, targetLimit,\r
 449                                 char32);\r
 450                         srcIdx = sourceIndex;\r
 451                         tgtIdx = targetIndex;\r
 452                         if (cr != null)\r
 453                             break;\r
 454                     }\r
 455                 }\r
 456 \r
 457                 /* set the new source and target positions and return the CoderResult stored in cr */\r
 458                 source.position(srcIdx - source.arrayOffset());\r
 459                 target.position(tgtIdx - target.arrayOffset());\r
 460                 return cr;\r
 461 \r
 462             } else {\r
 463                 int char32;\r
 464                 CoderResult cr;\r
 465 \r
 466                 /* take care of the special condition of fromUChar32 not being 0 (it is a surrogate) */\r
 467                 if (fromUChar32 != 0) {\r
 468                     /* 4 bytes to encode from char32 and a following char in source */\r
 469 \r
 470                     cr = encodeFourBytes(source, target, fromUChar32);\r
 471                     if (cr != null)\r
 472                         return cr;\r
 473                 }\r
 474 \r
 475                 while (true) {\r
 476                     if (!source.hasRemaining()) {\r
 477                         /* nothing left to read */\r
 478                         cr = CoderResult.UNDERFLOW;\r
 479                         break;\r
 480                     }\r
 481                     if (!target.hasRemaining()) {\r
 482                         /* no space left to write */\r
 483                         cr = CoderResult.OVERFLOW;\r
 484                         break;\r
 485                     }\r
 486 \r
 487                     /* reach the next char into char32 */\r
 488                     char32 = source.get();\r
 489 \r
 490                     if (char32 <= 0x7f) {\r
 491                         /* 1 byte to encode from char32 */\r
 492 \r
 493                         target.put(encodeHeadOf1(char32));\r
 494 \r
 495                     } else if (char32 <= 0x7ff) {\r
 496                         /* 2 bytes to encode from char32 */\r
 497 \r
 498                         target.put(encodeHeadOf2(char32));\r
 499 \r
 500                         if (!target.hasRemaining()) {\r
 501                             errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 502                             cr = CoderResult.OVERFLOW;\r
 503                             break;\r
 504                         }\r
 505                         target.put(encodeLastTail(char32));\r
 506 \r
 507                     } else if (!UTF16.isSurrogate((char) char32) || isCESU8) {\r
 508                         /* 3 bytes to encode from char32 */\r
 509 \r
 510                         target.put(encodeHeadOf3(char32));\r
 511 \r
 512                         if (!target.hasRemaining()) {\r
 513                             errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);\r
 514                             errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 515                             cr = CoderResult.OVERFLOW;\r
 516                             break;\r
 517                         }\r
 518                         target.put(encodeSecondToLastTail(char32));\r
 519 \r
 520                         if (!target.hasRemaining()) {\r
 521                             errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 522                             cr = CoderResult.OVERFLOW;\r
 523                             break;\r
 524                         }\r
 525                         target.put(encodeLastTail(char32));\r
 526 \r
 527                     } else {\r
 528                         /* 4 bytes to encode from char32 and a following char in source */\r
 529 \r
 530                         cr = encodeFourBytes(source, target, char32);\r
 531                         if (cr != null)\r
 532                             break;\r
 533                     }\r
 534                 }\r
 535 \r
 536                 /* set the new source and target positions and return the CoderResult stored in cr */\r
 537                 return cr;\r
 538             }\r
 539         }\r
 540 \r
 541         private final CoderResult encodeFourBytes(char[] sourceArray, byte[] targetArray,\r
 542                 int sourceLimit, int targetLimit, int char32) {\r
 543 \r
 544             /* we need to read another char to match up the surrogate stored in char32 */\r
 545             /* handle the surrogate stuff, returning on a non-null CoderResult */\r
 546             CoderResult cr = handleSurrogates(sourceArray, sourceIndex, sourceLimit, (char)char32);\r
 547             if (cr != null)\r
 548                 return cr;\r
 549             \r
 550             sourceIndex++;\r
 551             char32 = fromUChar32;\r
 552             fromUChar32 = 0;\r
 553 \r
 554             /* the rest is routine -- encode four bytes, stopping on overflow */\r
 555 \r
 556             targetArray[targetIndex++] = encodeHeadOf4(char32);\r
 557 \r
 558             if (targetIndex >= targetLimit) {\r
 559                 errorBuffer[errorBufferLength++] = encodeThirdToLastTail(char32);\r
 560                 errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);\r
 561                 errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 562                 return CoderResult.OVERFLOW;\r
 563             }\r
 564             targetArray[targetIndex++] = encodeThirdToLastTail(char32);\r
 565 \r
 566             if (targetIndex >= targetLimit) {\r
 567                 errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);\r
 568                 errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 569                 return CoderResult.OVERFLOW;\r
 570             }\r
 571             targetArray[targetIndex++] = encodeSecondToLastTail(char32);\r
 572 \r
 573             if (targetIndex >= targetLimit) {\r
 574                 errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 575                 return CoderResult.OVERFLOW;\r
 576             }\r
 577             targetArray[targetIndex++] = encodeLastTail(char32);\r
 578 \r
 579             /* return null for success */\r
 580             return null;\r
 581         }\r
 582 \r
 583         private final CoderResult encodeFourBytes(CharBuffer source, ByteBuffer target, int char32) {\r
 584 \r
 585             /* handle the surrogate stuff, returning on a non-null CoderResult */\r
 586             CoderResult cr = handleSurrogates(source, (char)char32);\r
 587             if (cr != null)\r
 588                 return cr;\r
 589             \r
 590             char32 = fromUChar32;\r
 591             fromUChar32 = 0;\r
 592             \r
 593             /* the rest is routine -- encode four bytes, stopping on overflow */\r
 594 \r
 595             target.put(encodeHeadOf4(char32));\r
 596 \r
 597             if (!target.hasRemaining()) {\r
 598                 errorBuffer[errorBufferLength++] = encodeThirdToLastTail(char32);\r
 599                 errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);\r
 600                 errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 601                 return CoderResult.OVERFLOW;\r
 602             }\r
 603             target.put(encodeThirdToLastTail(char32));\r
 604 \r
 605             if (!target.hasRemaining()) {\r
 606                 errorBuffer[errorBufferLength++] = encodeSecondToLastTail(char32);\r
 607                 errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 608                 return CoderResult.OVERFLOW;\r
 609             }\r
 610             target.put(encodeSecondToLastTail(char32));\r
 611 \r
 612             if (!target.hasRemaining()) {\r
 613                 errorBuffer[errorBufferLength++] = encodeLastTail(char32);\r
 614                 return CoderResult.OVERFLOW;\r
 615             }\r
 616             target.put(encodeLastTail(char32));\r
 617 \r
 618             /* return null for success */\r
 619             return null;\r
 620         }\r
 621 \r
 622         private int sourceIndex;\r
 623 \r
 624         private int targetIndex;\r
 625 \r
 626     }\r
 627 \r
 628     private static final byte encodeHeadOf1(int char32) {\r
 629         return (byte) char32;\r
 630     }\r
 631 \r
 632     private static final byte encodeHeadOf2(int char32) {\r
 633         return (byte) (0xc0 | (char32 >>> 6));\r
 634     }\r
 635 \r
 636     private static final byte encodeHeadOf3(int char32) {\r
 637         return (byte) (0xe0 | ((char32 >>> 12)));\r
 638     }\r
 639 \r
 640     private static final byte encodeHeadOf4(int char32) {\r
 641         return (byte) (0xf0 | ((char32 >>> 18)));\r
 642     }\r
 643 \r
 644     private static final byte encodeThirdToLastTail(int char32) {\r
 645         return (byte) (0x80 | ((char32 >>> 12) & 0x3f));\r
 646     }\r
 647 \r
 648     private static final byte encodeSecondToLastTail(int char32) {\r
 649         return (byte) (0x80 | ((char32 >>> 6) & 0x3f));\r
 650     }\r
 651 \r
 652     private static final byte encodeLastTail(int char32) {\r
 653         return (byte) (0x80 | (char32 & 0x3f));\r
 654     }\r
 655 \r
 656     /* single-code point definitions -------------------------------------------- */\r
 657 \r
 658     /*\r
 659      * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?\r
 660      * @param c 8-bit code unit (byte)\r
 661      * @return TRUE or FALSE\r
 662      */\r
 663     // static final boolean isSingle(byte c) {return (((c)&0x80)==0);}\r
 664     /*\r
 665      * Is this code unit (byte) a UTF-8 lead byte?\r
 666      * @param c 8-bit code unit (byte)\r
 667      * @return TRUE or FALSE\r
 668      */\r
 669     // static final boolean isLead(byte c) {return ((((c)-0xc0) &\r
 670     // UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}\r
 671     /*\r
 672      * Is this code unit (byte) a UTF-8 trail byte?\r
 673      * \r
 674      * @param c\r
 675      *            8-bit code unit (byte)\r
 676      * @return TRUE or FALSE\r
 677      */\r
 678     /*private static final boolean isTrail(byte c) {\r
 679         return (((c) & 0xc0) == 0x80);\r
 680     }*/\r
 681 \r
 682     public CharsetDecoder newDecoder() {\r
 683         return new CharsetDecoderUTF8(this);\r
 684     }\r
 685 \r
 686     public CharsetEncoder newEncoder() {\r
 687         return new CharsetEncoderUTF8(this);\r
 688     }\r
 689     \r
 690     \r
 691     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){\r
 692         getNonSurrogateUnicodeSet(setFillIn);\r
 693     }\r
 694 }\r