2 *******************************************************************************
3 * Copyright (C) 2006-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.charset;
9 import java.nio.ByteBuffer;
10 import java.nio.CharBuffer;
11 import java.nio.IntBuffer;
12 import java.nio.charset.CharsetDecoder;
13 import java.nio.charset.CharsetEncoder;
14 import java.nio.charset.CoderResult;
16 import com.ibm.icu.text.UTF16;
17 import com.ibm.icu.text.UnicodeSet;
18 import com.ibm.icu.util.VersionInfo;
21 * @author Niti Hantaweepant
// UTF-16 charset. The constructor distinguishes three runtime types: this base
// class and the CharsetUTF16BE / CharsetUTF16LE subclasses (tested via instanceof).
23 class CharsetUTF16 extends CharsetICU {
// Number of bytes in a byte-order mark; equals the length of BOM_BE and BOM_LE.
25 private static final int SIGNATURE_LENGTH = 2;
// Bytes 0xFF 0xFD, i.e. the code unit U+FFFD written in big-endian order.
26 private static final byte[] fromUSubstitution_BE = { (byte) 0xff, (byte) 0xfd };
// The same code unit U+FFFD with its two bytes swapped (little-endian order).
27 private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff };
// Byte-order mark U+FEFF serialized big-endian.
28 private static final byte[] BOM_BE = { (byte) 0xfe, (byte) 0xff };
// Byte-order mark U+FEFF serialized little-endian.
29 private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe };
// The ENDIAN_XOR values are XORed into byte-array indices: 0 keeps the index,
// 1 swaps the two bytes of each 16-bit unit (0<->1, 2<->3).
30 private static final int ENDIAN_XOR_BE = 0;
31 private static final int ENDIAN_XOR_LE = 1;
// Value stored in the encoder's fromUnicodeStatus while a BOM is still pending.
32 private static final int NEED_TO_WRITE_BOM = 1;
// True only when this instance is a CharsetUTF16BE or CharsetUTF16LE.
34 private boolean isEndianSpecified;
// False only when this instance is a CharsetUTF16LE.
35 private boolean isBigEndian;
// ENDIAN_XOR_BE or ENDIAN_XOR_LE, chosen in the constructor.
36 private int endianXOR;
// Substitution bytes passed to the encoder superclass constructor.
38 private byte[] fromUSubstitution;
// Builds the charset and derives its byte-order settings from the runtime type.
// All three arguments are forwarded to the superclass; icuCanonicalName is also
// searched for a version= marker, and the single character after that marker is
// decoded as the integer version.
42 public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
43 super(icuCanonicalName, javaCanonicalName, aliases);
45 /* Get the version number (e.g. UTF-16LE,version=1) */
46 int versionIndex = icuCanonicalName.indexOf("version=");
// Strictly greater than 0: a marker found at index 0 is not parsed as a version.
47 if (versionIndex > 0) {
// +8 skips the eight characters of the marker; +9 limits the value to one character.
// NOTE(review): a name that ends right after the marker would make substring throw - confirm such names cannot occur.
48 version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
// Endianness counts as specified only for the BE / LE subclasses; every instance
// that is not a CharsetUTF16LE (including this base class) is flagged big-endian.
53 this.isEndianSpecified = (this instanceof CharsetUTF16BE || this instanceof CharsetUTF16LE);
54 this.isBigEndian = !(this instanceof CharsetUTF16LE);
// Big-endian settings: substitution bytes in BE order, index XOR of 0.
58 this.fromUSubstitution = fromUSubstitution_BE;
59 this.endianXOR = ENDIAN_XOR_BE;
// Little-endian settings: substitution bytes in LE order, index XOR of 1.
62 this.fromUSubstitution = fromUSubstitution_LE;
63 this.endianXOR = ENDIAN_XOR_LE;
66 /* UnicodeBig and UnicodeLittle require maxBytesPerChar set to 4 in Java 5 or less */
// Condition: Java runtime 1.5 or older AND an explicit-endian charset with version 1.
67 if ((VersionInfo.javaVersion().getMajor() == 1 && VersionInfo.javaVersion().getMinor() <= 5)
68 && (isEndianSpecified && version == 1)) {
// Decoder for this charset. Its fields hold the byte order and BOM selected
// while decodeLoop inspects the first bytes of the input.
78 class CharsetDecoderUTF16 extends CharsetDecoderICU {
// NOTE(review): presumably becomes true once BOM detection has finished - confirm where it is written.
80 private boolean isBOMReadYet;
// Index XOR applied when two bytes are combined into a char (ENDIAN_XOR_BE or ENDIAN_XOR_LE).
81 private int actualEndianXOR;
// BOM byte sequence the incoming bytes are compared against in decodeLoop.
82 private byte[] actualBOM;
// Constructor taking the owning charset.
// NOTE(review): presumably forwards cs to the superclass and initializes the fields above - confirm.
84 public CharsetDecoderUTF16(CharsetICU cs) {
// Reset hook of the decoder.
// NOTE(review): presumably restores isBOMReadYet, actualEndianXOR and actualBOM to their initial values - confirm.
88 protected void implReset() {
// Decodes UTF-16 bytes from source into chars in target.
// Phase 1 inspects the leading bytes for a byte-order mark and fixes actualEndianXOR.
// Phase 2 combines byte pairs into chars and hands surrogates to decodeTrail.
// Returns UNDERFLOW when source runs dry, OVERFLOW when target is full, and a
// malformed result of length 2 for a byte-swapped BOM on an explicit-endian, version 1 charset.
// NOTE(review): offsets and flush are only passed through or unused in the visible statements - confirm.
94 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
96 * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
97 * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
98 * are in the current buffer.
// Nothing left to inspect: ask the caller for more input.
102 if (!source.hasRemaining())
103 return CoderResult.UNDERFLOW;
// Buffer the next input byte; toULength counts the bytes buffered so far.
105 toUBytesArray[toULength++] = source.get();
107 if (toULength == 1) {
108 // on the first byte, we haven't decided whether or not it's bigEndian yet
// A big-endian BOM is a candidate only when endianness is unspecified or big-endian.
109 if ((!isEndianSpecified || isBigEndian)
110 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
112 actualEndianXOR = ENDIAN_XOR_BE;
// Likewise, a little-endian BOM is a candidate only when unspecified or little-endian.
113 } else if ((!isEndianSpecified || !isBigEndian)
114 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
116 actualEndianXOR = ENDIAN_XOR_LE;
118 // we do not have a BOM (and we have toULength==1 bytes)
// Explicit-endian, version 1: fall back to the BOM and XOR of the declared byte order.
119 if (isEndianSpecified && version == 1) {
120 actualBOM = isBigEndian ? CharsetUTF16.BOM_BE : CharsetUTF16.BOM_LE;
121 actualEndianXOR = isBigEndian ? CharsetUTF16.ENDIAN_XOR_BE : CharsetUTF16.ENDIAN_XOR_LE;
// Otherwise use the byte order configured on the charset.
124 actualEndianXOR = endianXOR;
// Two bytes buffered and they equal actualBOM with its bytes swapped: opposite-endian BOM, reported as malformed.
128 } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
129 return CoderResult.malformedForLength(2);
// Two bytes buffered and they equal actualBOM in order: a matching BOM.
130 } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
131 // we found a BOM! at last!
132 // too bad we have to ignore it now (like it was unwanted or something)
// Explicit endianness, or the latest byte differs from the expected BOM byte: treat the bytes as data.
135 } else if (isEndianSpecified || toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
136 // we do not have a BOM (and we have toULength bytes)
138 actualEndianXOR = endianXOR;
// Every BOM byte matched and the full signature length has been buffered.
140 } else if (toULength == SIGNATURE_LENGTH) {
141 // we found a BOM! at last!
142 // too bad we have to ignore it now (like it was unwanted or something)
151 // now that we no longer need to look for a BOM, let's do some work
153 // if we have unfinished business
// A non-zero toUnicodeStatus is the lead surrogate that decodeTrail saved when input ran out.
154 if (toUnicodeStatus != 0) {
155 CoderResult cr = decodeTrail(source, target, offsets, (char) toUnicodeStatus);
// Buffer bytes until one complete 16-bit unit is available.
163 while (toULength < 2) {
164 if (!source.hasRemaining())
165 return CoderResult.UNDERFLOW;
166 toUBytesArray[toULength++] = source.get();
// Same BOM comparisons as in the detection phase: swapped BOM is malformed, matching BOM is ignored.
169 if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
170 return CoderResult.malformedForLength(2);
171 } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
172 // we found a BOM! at last!
173 // too bad we have to ignore it now (like it was unwanted or something)
// No room for output: stop before consuming the buffered unit.
178 if (!target.hasRemaining())
179 return CoderResult.OVERFLOW;
// Combine the two bytes into a char; XORing the index with actualEndianXOR picks which byte is the high one.
181 char16 = (char) (((toUBytesArray[0 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[1 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
183 if (!UTF16.isSurrogate(char16)) {
// Surrogate code unit: the rest of the pair is handled by decodeTrail.
187 CoderResult cr = decodeTrail(source, target, offsets, char16);
// Completes a surrogate pair whose first unit, lead, has already been read.
// Returns a malformed result of length 2 when lead is not a lead surrogate or the
// following unit is not a trail surrogate, UNDERFLOW when the trail bytes are not
// yet available (lead is then parked in toUnicodeStatus), and OVERFLOW when the
// trail surrogate had to be stashed in charErrorBufferArray.
194 private final CoderResult decodeTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, char lead) {
195 if (!UTF16.isLeadSurrogate(lead)) {
196 // 2 bytes, lead malformed
198 return CoderResult.malformedForLength(2);
// Buffer bytes until four are held: two for the lead unit, two for the trail unit.
201 while (toULength < 4) {
202 if (!source.hasRemaining()) {
203 // let this be unfinished business
// decodeLoop checks toUnicodeStatus and passes the saved lead back in.
204 toUnicodeStatus = lead;
205 return CoderResult.UNDERFLOW;
207 toUBytesArray[toULength++] = source.get();
// Bytes 2 and 3 form the second unit; the index XOR selects the byte order.
210 char trail = (char) (((toUBytesArray[2 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[3 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
212 if (!UTF16.isTrailSurrogate(trail)) {
213 // pretend like we did not read the last 2 bytes
// Rewind the source by two bytes so the non-trail unit can be read again.
215 source.position(source.position() - 2);
217 // 2 bytes, lead malformed
219 return CoderResult.malformedForLength(2);
227 if (target.hasRemaining()) {
231 /* Put in overflow buffer (not handled here) */
// Stash the trail surrogate in charErrorBufferArray and report OVERFLOW.
232 charErrorBufferArray[0] = trail;
233 charErrorBufferLength = 1;
234 return CoderResult.OVERFLOW;
// Encoder for this charset.
239 class CharsetEncoderUTF16 extends CharsetEncoderICU {
// Scratch buffer reused by encodeChar; four bytes hold one surrogate pair.
240 private final byte[] temp = new byte[4];
// Creates the encoder, passing the charset's substitution bytes to the superclass.
242 public CharsetEncoderUTF16(CharsetICU cs) {
243 super(cs, fromUSubstitution);
// A BOM is scheduled unless endianness is specified and version is not 1.
244 fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
// Reset hook of the encoder: recomputes the pending-BOM flag with the same
// expression the constructor uses.
247 protected void implReset() {
249 fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
// Encodes chars from source as UTF-16 bytes in target.
// Order of work: write a pending BOM, finish a lead surrogate left over in
// fromUChar32, then encode the remaining chars one at a time via encodeChar.
// Returns UNDERFLOW when source is exhausted and OVERFLOW when target is full.
// NOTE(review): flush is not referenced by the visible statements - confirm.
252 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
255 /* write the BOM if necessary */
256 if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
257 if (!target.hasRemaining())
258 return CoderResult.OVERFLOW;
// Clear the flag before writing so the BOM is attempted only once.
260 fromUnicodeStatus = 0;
// The BOM corresponds to no source char, hence the source index of -1.
261 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
// A non-zero fromUChar32 is a pending lead surrogate.
266 if (fromUChar32 != 0) {
267 if (!target.hasRemaining())
268 return CoderResult.OVERFLOW;
270 // a note: fromUChar32 will either be 0 or a lead surrogate
271 cr = encodeChar(source, target, offsets, (char) fromUChar32);
// Check input first, then output room, before consuming the next char.
277 if (!source.hasRemaining())
278 return CoderResult.UNDERFLOW;
279 if (!target.hasRemaining())
280 return CoderResult.OVERFLOW;
282 cr = encodeChar(source, target, offsets, source.get());
// Encodes one char, or one surrogate pair that starts with it, into target.
// ch is the code unit to encode; source is consulted for the surrogate handling.
// Returns null when the byte write reports underflow, otherwise that result.
288 private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
// Index of the char just consumed from source, passed on to fromUWriteBytes.
289 int sourceIndex = source.position() - 1;
292 if (UTF16.isSurrogate(ch)) {
// NOTE(review): handleSurrogates presumably fills fromUChar32 with the full code point - confirm.
293 cr = handleSurrogates(source, ch);
297 char trail = UTF16.getTrailSurrogate(fromUChar32);
// Surrogate pair: four bytes, lead unit then trail unit; XORing each index with
// endianXOR swaps the two bytes inside a unit for little-endian output.
301 temp[0 ^ endianXOR] = (byte) (ch >>> 8);
302 temp[1 ^ endianXOR] = (byte) (ch);
303 temp[2 ^ endianXOR] = (byte) (trail >>> 8);
304 temp[3 ^ endianXOR] = (byte) (trail);
305 cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
// Single code unit: two bytes in the configured byte order.
308 temp[0 ^ endianXOR] = (byte) (ch >>> 8);
309 temp[1 ^ endianXOR] = (byte) (ch);
310 cr = fromUWriteBytes(this, temp, 0, 2, target, offsets, sourceIndex);
// Underflow is mapped to null; any other result is returned unchanged.
312 return (cr.isUnderflow() ? null : cr);
// Returns a new CharsetDecoderUTF16 created for this charset.
316 public CharsetDecoder newDecoder() {
317 return new CharsetDecoderUTF16(this);
// Returns a new CharsetEncoderUTF16 created for this charset.
320 public CharsetEncoder newEncoder() {
321 return new CharsetEncoderUTF16(this);
// Fills setFillIn by delegating to getNonSurrogateUnicodeSet.
// The which argument is not referenced by the visible call.
324 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
325 getNonSurrogateUnicodeSet(setFillIn);