/* ******************************************************************************* * Copyright (C) 2008-2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.charset; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.IntBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; class CharsetHZ extends CharsetICU { private static final int UCNV_TILDE = 0x7E; /* ~ */ private static final int UCNV_OPEN_BRACE = 0x7B; /* { */ private static final int UCNV_CLOSE_BRACE = 0x7D; /* } */ private static final byte[] SB_ESCAPE = new byte[] { 0x7E, 0x7D }; private static final byte[] DB_ESCAPE = new byte[] { 0x7E, 0x7B }; private static final byte[] TILDE_ESCAPE = new byte[] { 0x7E, 0x7E }; private static final byte[] fromUSubstitution = new byte[] { (byte) 0x1A }; private CharsetMBCS gbCharset; private boolean isEmptySegment; public CharsetHZ(String icuCanonicalName, String canonicalName, String[] aliases) { super(icuCanonicalName, canonicalName, aliases); gbCharset = (CharsetMBCS) new CharsetProviderICU().charsetForName("GBK"); maxBytesPerChar = 4; minBytesPerChar = 1; maxCharsPerByte = 1; isEmptySegment = false; } class CharsetDecoderHZ extends CharsetDecoderICU { CharsetMBCS.CharsetDecoderMBCS gbDecoder; boolean isStateDBCS = false; public CharsetDecoderHZ(CharsetICU cs) { super(cs); gbDecoder = (CharsetMBCS.CharsetDecoderMBCS) gbCharset.newDecoder(); } protected void implReset() { super.implReset(); gbDecoder.implReset(); isStateDBCS = false; isEmptySegment = false; } protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { CoderResult err = CoderResult.UNDERFLOW; byte[] tempBuf = new byte[2]; int targetUniChar = 0; int mySourceChar = 0; if (!source.hasRemaining()) return CoderResult.UNDERFLOW; else if (!target.hasRemaining()) return CoderResult.OVERFLOW; while (source.hasRemaining()) { if (target.hasRemaining()) { // get the byte as unsigned mySourceChar = source.get() & 0xff; if (mode == UCNV_TILDE) { /* second byte after ~ */ mode = 0; switch (mySourceChar) { case 0x0A: /* no output for ~\n (line-continuation marker) */ continue; case UCNV_TILDE: if (offsets != null) { offsets.put(source.position() - 2); } target.put((char) mySourceChar); continue; case UCNV_OPEN_BRACE: case UCNV_CLOSE_BRACE: isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); if (isEmptySegment) { isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ this.toUBytesArray[0] = UCNV_TILDE; this.toUBytesArray[1] = (byte)mySourceChar; this.toULength = 2; return CoderResult.malformedForLength(1); } isEmptySegment = true; continue; default: /* * if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an * error condition */ /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. */ isEmptySegment = false; /* different error here, reset this to avoid spurious furture error */ err = CoderResult.malformedForLength(1); toUBytesArray[0] = UCNV_TILDE; if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) { /* The current byte could be the start of a character: Back it out. */ toULength = 1; source.position(source.position() - 1); } else { /* Include the current byte in the illegal sequence. */ toUBytesArray[1] = (byte)mySourceChar; toULength = 2; } return err; } } else if (isStateDBCS) { if (toUnicodeStatus == 0) { /* lead byte */ if (mySourceChar == UCNV_TILDE) { mode = UCNV_TILDE; } else { /* * add another bit to distinguish a 0 byte from not having seen a lead byte */ toUnicodeStatus = mySourceChar | 0x100; isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */ } continue; } else { /* trail byte */ boolean leadIsOk, trailIsOk; int leadByte = toUnicodeStatus & 0xff; targetUniChar = 0xffff; /* * Ticket 5691: consistent illegal sequence * - We include at least the first byte in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those * * In HZ DBCS, if the second byte is in the 21..7e range, * we report ony the first byte as the illegal sequence. * Otherwise we convert of report the pair of bytes. */ leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21); trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21); if (leadIsOk && trailIsOk) { tempBuf[0] = (byte)(leadByte + 0x80); tempBuf[1] = (byte)(mySourceChar + 0x80); targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed()); mySourceChar = (leadByte << 8) | mySourceChar; } else if (trailIsOk) { /* report a single illegal byte and continue with the following DBCS starter byte */ source.position(source.position() - 1); mySourceChar = leadByte; } else { /* report a pair of illegal bytes if the second byte is not a DBCS starter */ /* add another bit so that the code below writes 2 bytes in case of error */ mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar; } toUnicodeStatus = 0x00; } } else { if (mySourceChar == UCNV_TILDE) { mode = UCNV_TILDE; continue; } else if (mySourceChar <= 0x7f) { targetUniChar = mySourceChar; /* ASCII */ isEmptySegment = false; /* the segment has something valid */ } else { targetUniChar = 0xffff; isEmptySegment = false; /* different error here, reset this to avoid spurious future error */ } } if (targetUniChar < 0xfffe) { if (offsets != null) { offsets.put(source.position() - 1 - (isStateDBCS ? 1 : 0)); } target.put((char) targetUniChar); } else /* targetUniChar >= 0xfffe */{ if (mySourceChar > 0xff) { toUBytesArray[toUBytesBegin + 0] = (byte) (mySourceChar >> 8); toUBytesArray[toUBytesBegin + 1] = (byte) mySourceChar; toULength = 2; } else { toUBytesArray[toUBytesBegin + 0] = (byte) mySourceChar; toULength = 1; } if (targetUniChar == 0xfffe) { return CoderResult.unmappableForLength(toULength); } else { return CoderResult.malformedForLength(toULength); } } } else { return CoderResult.OVERFLOW; } } return err; } } class CharsetEncoderHZ extends CharsetEncoderICU { CharsetMBCS.CharsetEncoderMBCS gbEncoder; boolean isEscapeAppended = false; boolean isTargetUCharDBCS = false; public CharsetEncoderHZ(CharsetICU cs) { super(cs, fromUSubstitution); gbEncoder = (CharsetMBCS.CharsetEncoderMBCS) gbCharset.newEncoder(); } protected void implReset() { super.implReset(); gbEncoder.implReset(); isEscapeAppended = false; isTargetUCharDBCS = false; } protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { int length = 0; int[] targetUniChar = new int[] { 0 }; int mySourceChar = 0; boolean oldIsTargetUCharDBCS = isTargetUCharDBCS; if (!source.hasRemaining()) return CoderResult.UNDERFLOW; else if (!target.hasRemaining()) return CoderResult.OVERFLOW; if (fromUChar32 != 0 && target.hasRemaining()) { CoderResult cr = handleSurrogates(source, (char) fromUChar32); return (cr != null) ? cr : CoderResult.unmappableForLength(2); } /* writing the char to the output stream */ while (source.hasRemaining()) { targetUniChar[0] = MISSING_CHAR_MARKER; if (target.hasRemaining()) { mySourceChar = source.get(); oldIsTargetUCharDBCS = isTargetUCharDBCS; if (mySourceChar == UCNV_TILDE) { /* * concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex); */ concatEscape(source, target, offsets, TILDE_ESCAPE); continue; } else if (mySourceChar <= 0x7f) { length = 1; targetUniChar[0] = mySourceChar; } else { length = gbEncoder.fromUChar32(mySourceChar, targetUniChar, super.isFallbackUsed()); /* * we can only use lead bytes 21..7D and trail bytes 21..7E */ if (length == 2 && 0xa1a1 <= targetUniChar[0] && targetUniChar[0] <= 0xfdfe && 0xa1 <= (targetUniChar[0] & 0xff) && (targetUniChar[0] & 0xff) <= 0xfe) { targetUniChar[0] -= 0x8080; } else { targetUniChar[0] = MISSING_CHAR_MARKER; } } if (targetUniChar[0] != MISSING_CHAR_MARKER) { isTargetUCharDBCS = (targetUniChar[0] > 0x00FF); if (oldIsTargetUCharDBCS != isTargetUCharDBCS || !isEscapeAppended) { /* Shifting from a double byte to single byte mode */ if (!isTargetUCharDBCS) { concatEscape(source, target, offsets, SB_ESCAPE); isEscapeAppended = true; } else { /* * Shifting from a single byte to double byte mode */ concatEscape(source, target, offsets, DB_ESCAPE); isEscapeAppended = true; } } if (isTargetUCharDBCS) { if (target.hasRemaining()) { target.put((byte) (targetUniChar[0] >> 8)); if (offsets != null) { offsets.put(source.position() - 1); } if (target.hasRemaining()) { target.put((byte) targetUniChar[0]); if (offsets != null) { offsets.put(source.position() - 1); } } else { errorBuffer[errorBufferLength++] = (byte) targetUniChar[0]; // *err = U_BUFFER_OVERFLOW_ERROR; } } else { errorBuffer[errorBufferLength++] = (byte) (targetUniChar[0] >> 8); errorBuffer[errorBufferLength++] = (byte) targetUniChar[0]; // *err = U_BUFFER_OVERFLOW_ERROR; } } else { if (target.hasRemaining()) { target.put((byte) targetUniChar[0]); if (offsets != null) { offsets.put(source.position() - 1); } } else { errorBuffer[errorBufferLength++] = (byte) targetUniChar[0]; // *err = U_BUFFER_OVERFLOW_ERROR; } } } else { /* oops.. the code point is unassigned */ /* Handle surrogates */ /* check if the char is a First surrogate */ if (UTF16.isSurrogate((char) mySourceChar)) { // use that handy handleSurrogates method everyone's been talking about! CoderResult cr = handleSurrogates(source, (char) mySourceChar); return (cr != null) ? cr : CoderResult.unmappableForLength(2); } else { /* callback(unassigned) for a BMP code point */ // *err = U_INVALID_CHAR_FOUND; fromUChar32 = mySourceChar; return CoderResult.unmappableForLength(1); } } } else { // *err = U_BUFFER_OVERFLOW_ERROR; return CoderResult.OVERFLOW; } } return CoderResult.UNDERFLOW; } private CoderResult concatEscape(CharBuffer source, ByteBuffer target, IntBuffer offsets, byte[] strToAppend) { CoderResult cr = null; for (int i=0; i