/** ******************************************************************************* * Copyright (C) 2006-2008, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.charset; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.IntBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; /** * @author Niti Hantaweepant */ class CharsetUTF32 extends CharsetICU { private static final int SIGNATURE_LENGTH = 4; private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd }; private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 }; private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff }; private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 }; private static final int ENDIAN_XOR_BE = 0; private static final int ENDIAN_XOR_LE = 3; private static final int NEED_TO_WRITE_BOM = 1; private boolean isEndianSpecified; private boolean isBigEndian; private int endianXOR; private byte[] bom; private byte[] fromUSubstitution; public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) { super(icuCanonicalName, javaCanonicalName, aliases); this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE); this.isBigEndian = !(this instanceof CharsetUTF32LE); if (isBigEndian) { this.bom = BOM_BE; this.fromUSubstitution = fromUSubstitution_BE; this.endianXOR = ENDIAN_XOR_BE; } else { this.bom = BOM_LE; this.fromUSubstitution = fromUSubstitution_LE; this.endianXOR = ENDIAN_XOR_LE; } maxBytesPerChar = 4; minBytesPerChar = 4; maxCharsPerByte = 1; } class CharsetDecoderUTF32 extends CharsetDecoderICU { private boolean isBOMReadYet; private int actualEndianXOR; private byte[] actualBOM; public CharsetDecoderUTF32(CharsetICU cs) { super(cs); } protected void implReset() { super.implReset(); isBOMReadYet = false; actualBOM = null; } protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { /* * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that * are in the current buffer. */ if (!isBOMReadYet) { while (true) { if (!source.hasRemaining()) return CoderResult.UNDERFLOW; toUBytesArray[toULength++] = source.get(); if (toULength == 1) { // on the first byte, we haven't decided whether or not it's bigEndian yet if ((!isEndianSpecified || isBigEndian) && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) { actualBOM = BOM_BE; actualEndianXOR = ENDIAN_XOR_BE; } else if ((!isEndianSpecified || !isBigEndian) && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) { actualBOM = BOM_LE; actualEndianXOR = ENDIAN_XOR_LE; } else { // we do not have a BOM (and we have toULength==1 bytes) actualBOM = null; actualEndianXOR = endianXOR; break; } } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) { // we do not have a BOM (and we have toULength bytes) actualBOM = null; actualEndianXOR = endianXOR; break; } else if (toULength == SIGNATURE_LENGTH) { // we found a BOM! at last! // too bad we have to get ignore it now (like it was unwanted or something) toULength = 0; break; } } isBOMReadYet = true; } // now that we no longer need to look for a BOM, let's do some work int char32; while (true) { while (toULength < 4) { if (!source.hasRemaining()) return CoderResult.UNDERFLOW; toUBytesArray[toULength++] = source.get(); } if (!target.hasRemaining()) return CoderResult.OVERFLOW; char32 = 0; for (int i = 0; i < 4; i++) char32 = (char32 << 8) | (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK); if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) { toULength = 0; if (char32 <= UConverterConstants.MAXIMUM_UCS2) { /* fits in 16 bits */ target.put((char) char32); } else { /* write out the surrogates */ target.put(UTF16.getLeadSurrogate(char32)); char32 = UTF16.getTrailSurrogate(char32); if (target.hasRemaining()) { target.put((char) char32); } else { /* Put in overflow buffer (not handled here) */ charErrorBufferArray[0] = (char) char32; charErrorBufferLength = 1; return CoderResult.OVERFLOW; } } } else { return CoderResult.malformedForLength(toULength); } } } } class CharsetEncoderUTF32 extends CharsetEncoderICU { private final byte[] temp = new byte[4]; public CharsetEncoderUTF32(CharsetICU cs) { super(cs, fromUSubstitution); fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM; } protected void implReset() { super.implReset(); fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM; } protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { CoderResult cr; /* write the BOM if necessary */ if (fromUnicodeStatus == NEED_TO_WRITE_BOM) { if (!target.hasRemaining()) return CoderResult.OVERFLOW; fromUnicodeStatus = 0; cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1); if (cr.isOverflow()) return cr; } if (fromUChar32 != 0) { if (!target.hasRemaining()) return CoderResult.OVERFLOW; // a note: fromUChar32 will either be 0 or a lead surrogate cr = encodeChar(source, target, offsets, (char) fromUChar32); if (cr != null) return cr; } while (true) { if (!source.hasRemaining()) return CoderResult.UNDERFLOW; if (!target.hasRemaining()) return CoderResult.OVERFLOW; cr = encodeChar(source, target, offsets, source.get()); if (cr != null) return cr; } } private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) { int sourceIndex = source.position() - 1; CoderResult cr; int char32; if (UTF16.isSurrogate(ch)) { cr = handleSurrogates(source, ch); if (cr != null) return cr; char32 = fromUChar32; fromUChar32 = 0; } else { char32 = ch; } /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ // temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0) temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f) temp[2 ^ endianXOR] = (byte) (char32 >>> 8); temp[3 ^ endianXOR] = (byte) (char32); cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex); return (cr.isUnderflow() ? null : cr); } } public CharsetDecoder newDecoder() { return new CharsetDecoderUTF32(this); } public CharsetEncoder newEncoder() { return new CharsetEncoderUTF32(this); } void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ getNonSurrogateUnicodeSet(setFillIn); } }