2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.charset;
9 import java.nio.ByteBuffer;
10 import java.nio.CharBuffer;
11 import java.nio.IntBuffer;
12 import java.nio.charset.CharsetDecoder;
13 import java.nio.charset.CharsetEncoder;
14 import java.nio.charset.CoderResult;
16 import com.ibm.icu.text.UTF16;
17 import com.ibm.icu.text.UnicodeSet;
20 * @author Niti Hantaweepant
// UTF-32 charset built on CharsetICU. The decoder and encoder it hands out are the
// CharsetDecoderUTF32 and CharsetEncoderUTF32 classes declared in this file.
22 class CharsetUTF32 extends CharsetICU {
// Length in bytes of a UTF-32 byte order mark; the decoder compares its buffered byte count with this.
24 private static final int SIGNATURE_LENGTH = 4;
// U+FFFD written as four bytes, most significant byte first (BE) and least significant byte first (LE).
// One of the two ends up in fromUSubstitution and is handed to the encoder as its substitution bytes.
25 private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };
26 private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };
// U+FEFF (the byte order mark) in big-endian and little-endian byte order.
27 private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };
28 private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };
// Masks XORed onto a byte index in 0..3: 0 keeps the index as is, 3 reverses it (0<->3, 1<->2).
29 private static final int ENDIAN_XOR_BE = 0;
30 private static final int ENDIAN_XOR_LE = 3;
// Value of fromUnicodeStatus that tells the encoder a BOM still has to be written.
31 private static final int NEED_TO_WRITE_BOM = 1;
// True only when this object is a CharsetUTF32BE or a CharsetUTF32LE (set in the constructor).
33 private boolean isEndianSpecified;
// False only when this object is a CharsetUTF32LE (set in the constructor).
34 private boolean isBigEndian;
// One of the ENDIAN_XOR_* masks; used by the encoder and as the fallback mask of the decoder.
35 private int endianXOR;
// One of the fromUSubstitution_* arrays; passed to the encoder super constructor.
37 private byte[] fromUSubstitution;
// Creates the charset and records its byte order.
// All three arguments are forwarded unchanged to the CharsetICU constructor.
39 public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
40 super(icuCanonicalName, javaCanonicalName, aliases);
// The byte order counts as specified only for the dedicated BE / LE subclasses.
42 this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE);
// Everything that is not a CharsetUTF32LE is treated as big-endian.
43 this.isBigEndian = !(this instanceof CharsetUTF32LE);
// Big-endian settings.
// NOTE(review): the condition that selects between this pair and the next one is not
// visible in this excerpt - presumably isBigEndian; confirm against the full file.
47 this.fromUSubstitution = fromUSubstitution_BE;
48 this.endianXOR = ENDIAN_XOR_BE;
// Little-endian settings.
51 this.fromUSubstitution = fromUSubstitution_LE;
52 this.endianXOR = ENDIAN_XOR_LE;
// Decoder half of the charset: inspects the leading bytes for a byte order mark, then
// turns each 4-byte UTF-32 unit into one or two UTF-16 chars.
60 class CharsetDecoderUTF32 extends CharsetDecoderICU {
// NOTE(review): no read or write of this flag is visible in this excerpt; by its name it
// records that BOM detection has finished - confirm against the full file.
62 private boolean isBOMReadYet;
// Byte-index mask actually used while decoding: taken from the BOM candidate or, when no
// BOM is present, from the endianXOR of the charset.
63 private int actualEndianXOR;
// BOM candidate that the later signature bytes are compared with.
// NOTE(review): its assignment is not visible here - presumably BOM_BE or BOM_LE; confirm.
64 private byte[] actualBOM;
// NOTE(review): the constructor body is not visible in this excerpt.
66 public CharsetDecoderUTF32(CharsetICU cs) {
// NOTE(review): the body of implReset is not visible in this excerpt.
70 protected void implReset() {
// Decodes UTF-32 bytes from source into UTF-16 chars in target.
// Outcomes visible in this excerpt:
//   UNDERFLOW when source has no more bytes,
//   OVERFLOW when target has no room (a trail surrogate that does not fit is parked in
//   charErrorBufferArray first),
//   malformedForLength(toULength) for a rejected unit.
// NOTE(review): the use of offsets and flush, and the declarations of the locals, are not
// visible in this excerpt.
76 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
78 * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
79 * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
80 * are in the current buffer.
// BOM sniffing: bytes are buffered one at a time in toUBytesArray and compared with the BOM.
84 if (!source.hasRemaining())
85 return CoderResult.UNDERFLOW;
87 toUBytesArray[toULength++] = source.get();
90 // on the first byte, we haven't decided whether or not it's bigEndian yet
// A big-endian BOM is only considered when the byte order is unspecified or big-endian.
91 if ((!isEndianSpecified || isBigEndian)
92 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
94 actualEndianXOR = ENDIAN_XOR_BE;
// Likewise, a little-endian BOM is only considered when unspecified or little-endian.
95 } else if ((!isEndianSpecified || !isBigEndian)
96 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
98 actualEndianXOR = ENDIAN_XOR_LE;
100 // we do not have a BOM (and we have toULength==1 bytes)
// Fall back to the byte order configured on the charset.
102 actualEndianXOR = endianXOR;
// A later byte disagrees with the BOM candidate chosen on the first byte.
105 } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
106 // we do not have a BOM (and we have toULength bytes)
108 actualEndianXOR = endianXOR;
// All SIGNATURE_LENGTH bytes matched the candidate.
110 } else if (toULength == SIGNATURE_LENGTH) {
111 // we found a BOM! at last!
112 // too bad we have to ignore it now (like it was unwanted or something)
121 // now that we no longer need to look for a BOM, let's do some work
// Top the buffered unit up to four bytes; stop with UNDERFLOW if the input ends first.
125 while (toULength < 4) {
126 if (!source.hasRemaining())
127 return CoderResult.UNDERFLOW;
128 toUBytesArray[toULength++] = source.get();
// At least one output char is needed before anything is written.
131 if (!target.hasRemaining())
132 return CoderResult.OVERFLOW;
// Fold the four buffered bytes into char32, first-picked byte ending up most significant.
// XORing the index with actualEndianXOR picks the bytes in reverse order for the
// little-endian mask. The byte is masked with UNSIGNED_BYTE_MASK - presumably 0xff, so
// that a negative byte does not sign-extend; confirm the constant.
135 for (int i = 0; i < 4; i++)
136 char32 = (char32 << 8)
137 | (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);
// Accept only values in 0..MAXIMUM_UTF that are not surrogate code points.
139 if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {
141 if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
142 /* fits in 16 bits */
143 target.put((char) char32);
145 /* write out the surrogates */
146 target.put(UTF16.getLeadSurrogate(char32));
// char32 is reused to hold the trail surrogate from here on.
147 char32 = UTF16.getTrailSurrogate(char32);
148 if (target.hasRemaining()) {
149 target.put((char) char32);
151 /* Put in overflow buffer (not handled here) */
152 charErrorBufferArray[0] = (char) char32;
153 charErrorBufferLength = 1;
154 return CoderResult.OVERFLOW;
// NOTE(review): presumably the else-branch of the range / surrogate check above; the
// branch structure is not visible in this excerpt.
158 return CoderResult.malformedForLength(toULength);
// Encoder half of the charset: optionally writes a byte order mark, then emits every
// code point as one 4-byte UTF-32 unit.
164 class CharsetEncoderUTF32 extends CharsetEncoderICU {
// Scratch buffer for one encoded unit; encodeChar fills three of its four bytes.
165 private final byte[] temp = new byte[4];
// Hands the substitution bytes of the enclosing charset to the super constructor and
// requests a BOM unless the byte order is specified by the charset.
167 public CharsetEncoderUTF32(CharsetICU cs) {
168 super(cs, fromUSubstitution);
169 fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
// Puts the BOM request back into the state the constructor sets up.
// NOTE(review): any other statement of this method is not visible in this excerpt.
172 protected void implReset() {
174 fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
// Encodes UTF-16 chars from source as UTF-32 bytes into target.
// Order of work visible here: a pending BOM is written first, then a lead surrogate left
// over in fromUChar32 is encoded, then chars are read from source.
// Returns UNDERFLOW when source is empty and OVERFLOW when target has no room.
// NOTE(review): the declarations of cr and bom, the loop around the last section and the
// checks made on cr after each call are not visible in this excerpt.
177 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
180 /* write the BOM if necessary */
181 if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
182 if (!target.hasRemaining())
183 return CoderResult.OVERFLOW;
// Clear the request before writing so the BOM is not requested a second time.
185 fromUnicodeStatus = 0;
// -1 is passed where encodeChar passes a source index.
186 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
// A non-zero fromUChar32 is state carried over from an earlier call.
191 if (fromUChar32 != 0) {
192 if (!target.hasRemaining())
193 return CoderResult.OVERFLOW;
195 // a note: fromUChar32 will either be 0 or a lead surrogate
196 cr = encodeChar(source, target, offsets, (char) fromUChar32);
// Regular path: one char is taken from source per encodeChar call.
202 if (!source.hasRemaining())
203 return CoderResult.UNDERFLOW;
204 if (!target.hasRemaining())
205 return CoderResult.OVERFLOW;
207 cr = encodeChar(source, target, offsets, source.get());
// Encodes a single UTF-16 char (or a surrogate pair started by it) as one UTF-32 unit.
// Returns null when fromUWriteBytes reports underflow; any other result from
// fromUWriteBytes is returned unchanged.
// NOTE(review): the declarations of cr and char32 and the handling of the result of
// handleSurrogates are not visible in this excerpt.
213 private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
// Index one before the current source position, forwarded to fromUWriteBytes.
214 int sourceIndex = source.position() - 1;
218 if (UTF16.isSurrogate(ch)) {
219 cr = handleSurrogates(source, ch);
// NOTE(review): presumably handleSurrogates leaves the combined code point in
// fromUChar32 - confirm.
223 char32 = fromUChar32;
229 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
// The index for the top byte is never written in the visible code, so that slot of temp
// keeps the 0 it received when the array was created.
230 // temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0)
// XORing each index with endianXOR places the bytes in the byte order of the charset.
231 temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)
232 temp[2 ^ endianXOR] = (byte) (char32 >>> 8);
233 temp[3 ^ endianXOR] = (byte) (char32);
234 cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
235 return (cr.isUnderflow() ? null : cr);
// Returns a new CharsetDecoderUTF32 constructed with this charset.
239 public CharsetDecoder newDecoder() {
240 return new CharsetDecoderUTF32(this);
// Returns a new CharsetEncoderUTF32 constructed with this charset.
243 public CharsetEncoder newEncoder() {
244 return new CharsetEncoderUTF32(this);
// Fills setFillIn by delegating to getNonSurrogateUnicodeSet.
// The which argument is not used by the statement visible here.
248 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
249 getNonSurrogateUnicodeSet(setFillIn);