2 *******************************************************************************
\r
3 * Copyright (C) 2006-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.charset;
\r
9 import java.nio.ByteBuffer;
\r
10 import java.nio.CharBuffer;
\r
11 import java.nio.IntBuffer;
\r
12 import java.nio.charset.CharsetDecoder;
\r
13 import java.nio.charset.CharsetEncoder;
\r
14 import java.nio.charset.CoderResult;
\r
16 import com.ibm.icu.text.UTF16;
\r
17 import com.ibm.icu.text.UnicodeSet;
\r
20 * @author Niti Hantaweepant
\r
22 class CharsetUTF16 extends CharsetICU {
\r
24 private static final int SIGNATURE_LENGTH = 2;
\r
25 private static final byte[] fromUSubstitution_BE = { (byte) 0xff, (byte) 0xfd };
\r
26 private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff };
\r
27 private static final byte[] BOM_BE = { (byte) 0xfe, (byte) 0xff };
\r
28 private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe };
\r
29 private static final int ENDIAN_XOR_BE = 0;
\r
30 private static final int ENDIAN_XOR_LE = 1;
\r
31 private static final int NEED_TO_WRITE_BOM = 1;
\r
33 private boolean isEndianSpecified;
\r
34 private boolean isBigEndian;
\r
35 private int endianXOR;
\r
37 private byte[] fromUSubstitution;
\r
39 public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
\r
40 super(icuCanonicalName, javaCanonicalName, aliases);
\r
42 this.isEndianSpecified = (this instanceof CharsetUTF16BE || this instanceof CharsetUTF16LE);
\r
43 this.isBigEndian = !(this instanceof CharsetUTF16LE);
\r
47 this.fromUSubstitution = fromUSubstitution_BE;
\r
48 this.endianXOR = ENDIAN_XOR_BE;
\r
51 this.fromUSubstitution = fromUSubstitution_LE;
\r
52 this.endianXOR = ENDIAN_XOR_LE;
\r
55 maxBytesPerChar = 2;
\r
56 minBytesPerChar = 2;
\r
57 maxCharsPerByte = 1;
\r
60 class CharsetDecoderUTF16 extends CharsetDecoderICU {
\r
62 private boolean isBOMReadYet;
\r
63 private int actualEndianXOR;
\r
64 private byte[] actualBOM;
\r
66 public CharsetDecoderUTF16(CharsetICU cs) {
\r
70 protected void implReset() {
\r
72 isBOMReadYet = false;
\r
76 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
78 * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
\r
79 * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
\r
80 * are in the current buffer.
\r
82 if (!isBOMReadYet) {
\r
84 if (!source.hasRemaining())
\r
85 return CoderResult.UNDERFLOW;
\r
87 toUBytesArray[toULength++] = source.get();
\r
89 if (toULength == 1) {
\r
90 // on the first byte, we haven't decided whether or not it's bigEndian yet
\r
91 if ((!isEndianSpecified || isBigEndian)
\r
92 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
\r
94 actualEndianXOR = ENDIAN_XOR_BE;
\r
95 } else if ((!isEndianSpecified || !isBigEndian)
\r
96 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
\r
98 actualEndianXOR = ENDIAN_XOR_LE;
\r
100 // we do not have a BOM (and we have toULength==1 bytes)
\r
102 actualEndianXOR = endianXOR;
\r
105 } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
\r
106 // we do not have a BOM (and we have toULength bytes)
\r
108 actualEndianXOR = endianXOR;
\r
110 } else if (toULength == SIGNATURE_LENGTH) {
\r
111 // we found a BOM! at last!
\r
112 // too bad we have to ignore it now (like it was unwanted or something)
\r
118 isBOMReadYet = true;
\r
121 // now that we no longer need to look for a BOM, let's do some work
\r
123 // if we have unfinished business
\r
124 if (toUnicodeStatus != 0) {
\r
125 CoderResult cr = decodeTrail(source, target, offsets, (char) toUnicodeStatus);
\r
133 while (toULength < 2) {
\r
134 if (!source.hasRemaining())
\r
135 return CoderResult.UNDERFLOW;
\r
136 toUBytesArray[toULength++] = source.get();
\r
139 if (!target.hasRemaining())
\r
140 return CoderResult.OVERFLOW;
\r
142 char16 = (char) (((toUBytesArray[0 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[1 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
\r
144 if (!UTF16.isSurrogate(char16)) {
\r
146 target.put(char16);
\r
148 CoderResult cr = decodeTrail(source, target, offsets, char16);
\r
155 private final CoderResult decodeTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, char lead) {
\r
156 if (!UTF16.isLeadSurrogate(lead)) {
\r
157 // 2 bytes, lead malformed
\r
158 toUnicodeStatus = 0;
\r
159 return CoderResult.malformedForLength(2);
\r
162 while (toULength < 4) {
\r
163 if (!source.hasRemaining()) {
\r
164 // let this be unfinished business
\r
165 toUnicodeStatus = lead;
\r
166 return CoderResult.UNDERFLOW;
\r
168 toUBytesArray[toULength++] = source.get();
\r
171 char trail = (char) (((toUBytesArray[2 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | ((toUBytesArray[3 ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK)));
\r
173 if (!UTF16.isTrailSurrogate(trail)) {
\r
174 // pretend like we didn't read the last 2 bytes
\r
176 source.position(source.position() - 2);
\r
178 // 2 bytes, lead malformed
\r
179 toUnicodeStatus = 0;
\r
180 return CoderResult.malformedForLength(2);
\r
183 toUnicodeStatus = 0;
\r
188 if (target.hasRemaining()) {
\r
192 /* Put in overflow buffer (not handled here) */
\r
193 charErrorBufferArray[0] = trail;
\r
194 charErrorBufferLength = 1;
\r
195 return CoderResult.OVERFLOW;
\r
200 class CharsetEncoderUTF16 extends CharsetEncoderICU {
\r
201 private final byte[] temp = new byte[4];
\r
203 public CharsetEncoderUTF16(CharsetICU cs) {
\r
204 super(cs, fromUSubstitution);
\r
205 fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
\r
208 protected void implReset() {
\r
210 fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
\r
213 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
216 /* write the BOM if necessary */
\r
217 if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
\r
218 if (!target.hasRemaining())
\r
219 return CoderResult.OVERFLOW;
\r
221 fromUnicodeStatus = 0;
\r
222 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
\r
223 if (cr.isOverflow())
\r
227 if (fromUChar32 != 0) {
\r
228 if (!target.hasRemaining())
\r
229 return CoderResult.OVERFLOW;
\r
231 // a note: fromUChar32 will either be 0 or a lead surrogate
\r
232 cr = encodeChar(source, target, offsets, (char) fromUChar32);
\r
238 if (!source.hasRemaining())
\r
239 return CoderResult.UNDERFLOW;
\r
240 if (!target.hasRemaining())
\r
241 return CoderResult.OVERFLOW;
\r
243 cr = encodeChar(source, target, offsets, source.get());
\r
249 private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
\r
250 int sourceIndex = source.position() - 1;
\r
253 if (UTF16.isSurrogate(ch)) {
\r
254 cr = handleSurrogates(source, ch);
\r
258 char trail = UTF16.getTrailSurrogate(fromUChar32);
\r
262 temp[0 ^ endianXOR] = (byte) (ch >>> 8);
\r
263 temp[1 ^ endianXOR] = (byte) (ch);
\r
264 temp[2 ^ endianXOR] = (byte) (trail >>> 8);
\r
265 temp[3 ^ endianXOR] = (byte) (trail);
\r
266 cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
\r
269 temp[0 ^ endianXOR] = (byte) (ch >>> 8);
\r
270 temp[1 ^ endianXOR] = (byte) (ch);
\r
271 cr = fromUWriteBytes(this, temp, 0, 2, target, offsets, sourceIndex);
\r
273 return (cr.isUnderflow() ? null : cr);
\r
277 public CharsetDecoder newDecoder() {
\r
278 return new CharsetDecoderUTF16(this);
\r
281 public CharsetEncoder newEncoder() {
\r
282 return new CharsetEncoderUTF16(this);
\r
285 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
\r
286 getNonSurrogateUnicodeSet(setFillIn);
\r