jars/icu4j-52_1/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF16.java

   1 /**
   2  *******************************************************************************
   3  * Copyright (C) 2006-2011, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.charset;
   8
   9 import java.nio.ByteBuffer;
  10 import java.nio.CharBuffer;
  11 import java.nio.IntBuffer;
  12 import java.nio.charset.CharsetDecoder;
  13 import java.nio.charset.CharsetEncoder;
  14 import java.nio.charset.CoderResult;
  15
  16 import com.ibm.icu.text.UTF16;
  17 import com.ibm.icu.text.UnicodeSet;
  18 import com.ibm.icu.util.VersionInfo;
  19
  20 /**
  21  * @author Niti Hantaweepant
  22  */
  23 class CharsetUTF16 extends CharsetICU {
  24
  25     private static final int SIGNATURE_LENGTH = 2;
  26     private static final byte[] fromUSubstitution_BE = { (byte) 0xff, (byte) 0xfd };
  27     private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff };
  28     private static final byte[] BOM_BE = { (byte) 0xfe, (byte) 0xff };
  29     private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe };
  30     private static final int ENDIAN_XOR_BE = 0;
  31     private static final int ENDIAN_XOR_LE = 1;
  32     private static final int NEED_TO_WRITE_BOM = 1;
  33
  34     private boolean isEndianSpecified;
  35     private boolean isBigEndian;
  36     private int endianXOR;
  37     private byte[] bom;
  38     private byte[] fromUSubstitution;
  39
  40     private int version;
  41
  42     public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
  43         super(icuCanonicalName, javaCanonicalName, aliases);
  44
  45         /* Get the version number (e.g. UTF-16LE,version=1) */
  46         int versionIndex = icuCanonicalName.indexOf("version=");
  47         if (versionIndex > 0) {
  48             version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
  49         } else {
  50             version = 0;
  51         }
  52
  53         this.isEndianSpecified = (this instanceof CharsetUTF16BE || this instanceof CharsetUTF16LE);
  54         this.isBigEndian = !(this instanceof CharsetUTF16LE);
  55
  56         if (isBigEndian) {
  57             this.bom = BOM_BE;
  58             this.fromUSubstitution = fromUSubstitution_BE;
  59             this.endianXOR = ENDIAN_XOR_BE;
  60         } else {
  61             this.bom = BOM_LE;
  62             this.fromUSubstitution = fromUSubstitution_LE;
  63             this.endianXOR = ENDIAN_XOR_LE;
  64         }
  65
  66         /* UnicodeBig and UnicodeLittle requires maxBytesPerChar set to 4 in Java 5 or less */
  67         if ((VersionInfo.javaVersion().getMajor() == 1 && VersionInfo.javaVersion().getMinor() <= 5)
  68                 && (isEndianSpecified && version == 1)) {
  69             maxBytesPerChar = 4;
  70         } else {
  71             maxBytesPerChar = 2;
  72         }
  73
  74         minBytesPerChar = 2;
  75         maxCharsPerByte = 1;
  76     }
  77
  78     class CharsetDecoderUTF16 extends CharsetDecoderICU {
  79
  80         private boolean isBOMReadYet;
  81         private int actualEndianXOR;
  82         private byte[] actualBOM;
  83
  84         public CharsetDecoderUTF16(CharsetICU cs) {
  85             super(cs);
  86         }
  87
  88         protected void implReset() {
  89             super.implReset();
  90             isBOMReadYet = false;
  91             actualBOM = null;
  92         }
  93
  94         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
  95             /*
  96              * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
  97              * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
  98              * are in the current buffer.
  99              */
 100             if (!isBOMReadYet) {
 101                 while (true) {
 102                     if (!source.hasRemaining())
 103                         return CoderResult.UNDERFLOW;
 104
 105                     toUBytesArray[toULength++] = source.get();
 106
 107                     if (toULength == 1) {
 108                         // on the first byte, we haven't decided whether or not it's bigEndian yet
 109                         if ((!isEndianSpecified || isBigEndian)
 110                                 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
 111                             actualBOM = BOM_BE;
 112                             actualEndianXOR = ENDIAN_XOR_BE;
 113                         } else if ((!isEndianSpecified || !isBigEndian)
 114                                 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
 115                             actualBOM = BOM_LE;
 116                             actualEndianXOR = ENDIAN_XOR_LE;
 117                         } else {
 118                             // we do not have a BOM (and we have toULength==1 bytes)
 119                             if (isEndianSpecified && version == 1) {
 120                                 actualBOM = isBigEndian ? CharsetUTF16.BOM_BE : CharsetUTF16.BOM_LE;
 121                                 actualEndianXOR = isBigEndian ? CharsetUTF16.ENDIAN_XOR_BE : CharsetUTF16.ENDIAN_XOR_LE;
 122                             } else {
 123                                 actualBOM = null;
 124                                 actualEndianXOR = endianXOR;
 125                             }
 126                             break;
 127                         }
 128                     } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
 129                         return CoderResult.malformedForLength(2);
 130                     } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
 131                         // we found a BOM! at last!
 132                         // too bad we have to get ignore it now (like it was unwanted or something)
 133                         toULength = 0;
 134                         break;
 135                     } else if (isEndianSpecified || toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
 136                         // we do not have a BOM (and we have toULength bytes)
 137                         actualBOM = null;
 138                         actualEndianXOR = endianXOR;
 139                         break;
 140                     } else if (toULength == SIGNATURE_LENGTH) {
 141                         // we found a BOM! at last!
 142                         // too bad we have to get ignore it now (like it was unwanted or something)
 143                         toULength = 0;
 144                         break;
 145                     }
 146                 }
 147
 148                 isBOMReadYet = true;
 149             }
 150
 151             // now that we no longer need to look for a BOM, let's do some work
 152
 153             // if we have unfinished business
 154             if (toUnicodeStatus != 0) {
 155                 CoderResult cr = decodeTrail(source, target, offsets, (char) toUnicodeStatus);
 156                 if (cr != null)
 157                     return cr;
 158             }
 159
 160             char char16;
 161
 162             while (true) {
 163                 while (toULength < 2) {
 164                     if (!source.hasRemaining())
 165                         return CoderResult.UNDERFLOW;
 166                     toUBytesArray[toULength++] = source.get();
 167                 }
 168
 169                 if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
 170                     return CoderResult.malformedForLength(2);
 171                 } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
 172                     // we found a BOM! at last!
 173                     // too bad we have to get ignore it now (like it was unwanted or something)
 174                     toULength = 0;
 175                     continue;
 176                 }
 177
 178                 if (!target.hasRemaining())
 179                     return CoderResult.OVERFLOW;
 180
 181                 char16 = (char) (((toUBytesArray[0 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[1 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
 182
 183                 if (!UTF16.isSurrogate(char16)) {
 184                     toULength = 0;
 185                     target.put(char16);
 186                 } else {
 187                     CoderResult cr = decodeTrail(source, target, offsets, char16);
 188                     if (cr != null)
 189                         return cr;
 190                 }
 191             }
 192         }
 193
 194         private final CoderResult decodeTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, char lead) {
 195             if (!UTF16.isLeadSurrogate(lead)) {
 196                 // 2 bytes, lead malformed
 197                 toUnicodeStatus = 0;
 198                 return CoderResult.malformedForLength(2);
 199             }
 200
 201             while (toULength < 4) {
 202                 if (!source.hasRemaining()) {
 203                     // let this be unfinished business
 204                     toUnicodeStatus = lead;
 205                     return CoderResult.UNDERFLOW;
 206                 }
 207                 toUBytesArray[toULength++] = source.get();
 208             }
 209
 210             char trail = (char) (((toUBytesArray[2 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[3 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
 211
 212             if (!UTF16.isTrailSurrogate(trail)) {
 213                 // pretend like we didnt read the last 2 bytes
 214                 toULength = 2;
 215                 source.position(source.position() - 2);
 216
 217                 // 2 bytes, lead malformed
 218                 toUnicodeStatus = 0;
 219                 return CoderResult.malformedForLength(2);
 220             }
 221
 222             toUnicodeStatus = 0;
 223             toULength = 0;
 224
 225             target.put(lead);
 226
 227             if (target.hasRemaining()) {
 228                 target.put(trail);
 229                 return null;
 230             } else {
 231                 /* Put in overflow buffer (not handled here) */
 232                 charErrorBufferArray[0] = trail;
 233                 charErrorBufferLength = 1;
 234                 return CoderResult.OVERFLOW;
 235             }
 236         }
 237     }
 238
 239     class CharsetEncoderUTF16 extends CharsetEncoderICU {
 240         private final byte[] temp = new byte[4];
 241
 242         public CharsetEncoderUTF16(CharsetICU cs) {
 243             super(cs, fromUSubstitution);
 244             fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
 245         }
 246
 247         protected void implReset() {
 248             super.implReset();
 249             fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
 250         }
 251
 252         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
 253             CoderResult cr;
 254
 255             /* write the BOM if necessary */
 256             if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
 257                 if (!target.hasRemaining())
 258                     return CoderResult.OVERFLOW;
 259
 260                 fromUnicodeStatus = 0;
 261                 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
 262                 if (cr.isOverflow())
 263                     return cr;
 264             }
 265
 266             if (fromUChar32 != 0) {
 267                 if (!target.hasRemaining())
 268                     return CoderResult.OVERFLOW;
 269
 270                 // a note: fromUChar32 will either be 0 or a lead surrogate
 271                 cr = encodeChar(source, target, offsets, (char) fromUChar32);
 272                 if (cr != null)
 273                     return cr;
 274             }
 275
 276             while (true) {
 277                 if (!source.hasRemaining())
 278                     return CoderResult.UNDERFLOW;
 279                 if (!target.hasRemaining())
 280                     return CoderResult.OVERFLOW;
 281
 282                 cr = encodeChar(source, target, offsets, source.get());
 283                 if (cr != null)
 284                     return cr;
 285             }
 286         }
 287
 288         private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
 289             int sourceIndex = source.position() - 1;
 290             CoderResult cr;
 291
 292             if (UTF16.isSurrogate(ch)) {
 293                 cr = handleSurrogates(source, ch);
 294                 if (cr != null)
 295                     return cr;
 296
 297                 char trail = UTF16.getTrailSurrogate(fromUChar32);
 298                 fromUChar32 = 0;
 299
 300                 // 4 bytes
 301                 temp[0 ^ endianXOR] = (byte) (ch >>> 8);
 302                 temp[1 ^ endianXOR] = (byte) (ch);
 303                 temp[2 ^ endianXOR] = (byte) (trail >>> 8);
 304                 temp[3 ^ endianXOR] = (byte) (trail);
 305                 cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
 306             } else {
 307                 // 2 bytes
 308                 temp[0 ^ endianXOR] = (byte) (ch >>> 8);
 309                 temp[1 ^ endianXOR] = (byte) (ch);
 310                 cr = fromUWriteBytes(this, temp, 0, 2, target, offsets, sourceIndex);
 311             }
 312             return (cr.isUnderflow() ? null : cr);
 313         }
 314     }
 315
 316     public CharsetDecoder newDecoder() {
 317         return new CharsetDecoderUTF16(this);
 318     }
 319
 320     public CharsetEncoder newEncoder() {
 321         return new CharsetEncoderUTF16(this);
 322     }
 323
 324     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
 325         getNonSurrogateUnicodeSet(setFillIn);
 326     }
 327 }