2 *******************************************************************************
\r
3 * Copyright (C) 2008-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.charset;
\r
9 import java.nio.ByteBuffer;
\r
10 import java.nio.CharBuffer;
\r
11 import java.nio.IntBuffer;
\r
12 import java.nio.charset.CharsetDecoder;
\r
13 import java.nio.charset.CharsetEncoder;
\r
14 import java.nio.charset.CoderResult;
\r
16 import com.ibm.icu.text.UTF16;
\r
17 import com.ibm.icu.text.UnicodeSet;
\r
19 class CharsetHZ extends CharsetICU {
\r
// Byte values of the three characters that appear in HZ escape sequences.
21 private static final int UCNV_TILDE = 0x7E; /* ~ */
\r
22 private static final int UCNV_OPEN_BRACE = 0x7B; /* { */
\r
23 private static final int UCNV_CLOSE_BRACE = 0x7D; /* } */
\r
// Tilde + close brace: passed to concatEscape by encodeLoop when the output shifts to single-byte mode.
24 private static final byte[] SB_ESCAPE = new byte[] { 0x7E, 0x7D };
\r
// Tilde + open brace: passed to concatEscape by encodeLoop when the output shifts to double-byte mode.
25 private static final byte[] DB_ESCAPE = new byte[] { 0x7E, 0x7B };
\r
// Tilde + tilde: passed to concatEscape by encodeLoop when the source char is a tilde.
26 private static final byte[] TILDE_ESCAPE = new byte[] { 0x7E, 0x7E };
\r
// Single byte 0x1A handed to the CharsetEncoderICU constructor by CharsetEncoderHZ.
27 private static final byte[] fromUSubstitution = new byte[] { (byte) 0x1A };
\r
// GB charset looked up in the constructor; the inner decoder and encoder are created from it.
29 private CharsetMBCS gbCharset;
\r
// NOTE(review): this flag is declared on the charset but is read and written by CharsetDecoderHZ
// (implReset and decodeLoop), so every decoder created from the same CharsetHZ appears to share it.
// Confirm whether it should be per-decoder state.
30 private boolean isEmptySegment;
\r
// Builds the HZ charset: forwards the three name arguments to the CharsetICU constructor, looks up
// the GBK charset through a new CharsetProviderICU, and assigns the byte/char size fields.
32 public CharsetHZ(String icuCanonicalName, String canonicalName, String[] aliases) {
\r
33 super(icuCanonicalName, canonicalName, aliases);
\r
// NOTE(review): the cast presumes the provider returns a CharsetMBCS for GBK -- confirm.
34 gbCharset = (CharsetMBCS) new CharsetProviderICU().charsetForName("GBK");
\r
36 maxBytesPerChar = 4;
\r
37 minBytesPerChar = 1;
\r
38 maxCharsPerByte = 1;
\r
// Start with the empty-segment flag cleared.
40 isEmptySegment = false;
\r
43 class CharsetDecoderHZ extends CharsetDecoderICU {
\r
// Inner decoder created from gbCharset; decodeLoop calls its simpleGetNextUChar on a byte pair.
44 CharsetMBCS.CharsetDecoderMBCS gbDecoder;
\r
// Set in decodeLoop from the byte that follows a tilde: true for an open brace, false for a close brace.
45 boolean isStateDBCS = false;
\r
// Creates the HZ decoder and its inner decoder, which is obtained from gbCharset.newDecoder().
47 public CharsetDecoderHZ(CharsetICU cs) {
\r
49 gbDecoder = (CharsetMBCS.CharsetDecoderMBCS) gbCharset.newDecoder();
\r
// Resets the inner GB decoder and clears the DBCS-mode and empty-segment flags.
52 protected void implReset() {
\r
54 gbDecoder.implReset();
\r
56 isStateDBCS = false;
\r
// NOTE(review): isEmptySegment is a field of the enclosing CharsetHZ, not of this decoder.
57 isEmptySegment = false;
\r
// Decodes HZ bytes from source into chars in target, reading one byte per loop pass.
// A byte read while mode == UCNV_TILDE is handled as the second byte of an escape. Otherwise it is
// handled as double-byte data when isStateDBCS is set, or as a single byte when it is not.
// On entry, returns UNDERFLOW when source has nothing remaining and OVERFLOW when target has no room.
// Inside the loop, returns OVERFLOW, or a malformed/unmappable result for bytes that cannot be converted.
// When offsets is non-null, a source index is recorded for each char written to target.
60 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
61 CoderResult err = CoderResult.UNDERFLOW;
\r
// Scratch pair handed to gbDecoder for one double-byte character.
62 byte[] tempBuf = new byte[2];
\r
63 int targetUniChar = 0;
\r
64 int mySourceChar = 0;
\r
66 if (!source.hasRemaining())
\r
67 return CoderResult.UNDERFLOW;
\r
68 else if (!target.hasRemaining())
\r
69 return CoderResult.OVERFLOW;
\r
71 while (source.hasRemaining()) {
\r
73 if (target.hasRemaining()) {
\r
75 // get the byte as unsigned
\r
76 mySourceChar = source.get() & 0xff;
\r
78 if (mode == UCNV_TILDE) {
\r
79 /* second byte after ~ */
\r
81 switch (mySourceChar) {
\r
83 /* no output for ~\n (line-continuation marker) */
\r
86 if (offsets != null) {
\r
87 offsets.put(source.position() - 2);
\r
89 target.put((char) mySourceChar);
\r
91 case UCNV_OPEN_BRACE:
\r
92 case UCNV_CLOSE_BRACE:
\r
// An open brace turns double-byte mode on; a close brace turns it off.
93 isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
\r
// isEmptySegment still set here means no data byte cleared it since the previous brace escape.
94 if (isEmptySegment) {
\r
95 isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
\r
96 this.toUBytesArray[0] = UCNV_TILDE;
\r
97 this.toUBytesArray[1] = (byte)mySourceChar;
\r
99 return CoderResult.malformedForLength(1);
\r
101 isEmptySegment = true;
\r
105 * if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an
\r
109 * Ticket 5691: consistent illegal sequences:
\r
110 * - We include at least the first byte in the illegal sequence.
\r
111 * - If any of the non-initial bytes could be the start of a character,
\r
112 * we stop the illegal sequence before the first one of those.
\r
114 isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
\r
115 err = CoderResult.malformedForLength(1);
\r
116 toUBytesArray[0] = UCNV_TILDE;
\r
117 if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) {
\r
118 /* The current byte could be the start of a character: Back it out. */
\r
120 source.position(source.position() - 1);
\r
122 /* Include the current byte in the illegal sequence. */
\r
123 toUBytesArray[1] = (byte)mySourceChar;
\r
128 } else if (isStateDBCS) {
\r
// toUnicodeStatus == 0 means no lead byte is pending, so this byte is the first of a pair.
129 if (toUnicodeStatus == 0) {
\r
131 if (mySourceChar == UCNV_TILDE) {
\r
135 * add another bit to distinguish a 0 byte from not having seen a lead byte
\r
137 toUnicodeStatus = mySourceChar | 0x100;
\r
138 isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */
\r
143 boolean leadIsOk, trailIsOk;
\r
// Drop the marker bit (0x100) that was added when the lead byte was stored.
144 int leadByte = toUnicodeStatus & 0xff;
\r
145 targetUniChar = 0xffff;
\r
147 * Ticket 5691: consistent illegal sequence
\r
148 * - We include at least the first byte in the illegal sequence.
\r
149 * - If any of the non-initial bytes could be the start of a character,
\r
150 * we stop the illegal sequence before the first one of those
\r
152 * In HZ DBCS, if the second byte is in the 21..7e range,
\r
153 * we report ony the first byte as the illegal sequence.
\r
154 * Otherwise we convert of report the pair of bytes.
\r
// Range tests: lead byte in 0x21..0x7d, trail byte in 0x21..0x7e
// (assumes UNSIGNED_BYTE_MASK is 0xff so values below 0x21 wrap to large numbers -- confirm).
156 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21);
\r
157 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
158 if (leadIsOk && trailIsOk) {
\r
// 0x80 is added to both bytes before the pair is handed to the GB decoder.
159 tempBuf[0] = (byte)(leadByte + 0x80);
\r
160 tempBuf[1] = (byte)(mySourceChar + 0x80);
\r
161 targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed());
\r
// Keep both original bytes in mySourceChar so the error path below can store them.
162 mySourceChar = (leadByte << 8) | mySourceChar;
\r
163 } else if (trailIsOk) {
\r
164 /* report a single illegal byte and continue with the following DBCS starter byte */
\r
165 source.position(source.position() - 1);
\r
166 mySourceChar = leadByte;
\r
168 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
169 /* add another bit so that the code below writes 2 bytes in case of error */
\r
170 mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar;
\r
// The pair has been consumed; no lead byte is pending any more.
172 toUnicodeStatus = 0x00;
\r
175 if (mySourceChar == UCNV_TILDE) {
\r
178 } else if (mySourceChar <= 0x7f) {
\r
179 targetUniChar = mySourceChar; /* ASCII */
\r
180 isEmptySegment = false; /* the segment has something valid */
\r
182 targetUniChar = 0xffff;
\r
183 isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
\r
// Values below 0xfffe are written as a char; 0xfffe is reported as unmappable, anything else as malformed.
187 if (targetUniChar < 0xfffe) {
\r
188 if (offsets != null) {
\r
// In double-byte mode the recorded index is one position earlier than in single-byte mode.
189 offsets.put(source.position() - 1 - (isStateDBCS ? 1 : 0));
\r
192 target.put((char) targetUniChar);
\r
193 } else /* targetUniChar >= 0xfffe */{
\r
// Store the offending byte(s) in toUBytesArray: two bytes when mySourceChar is above 0xff, else one.
194 if (mySourceChar > 0xff) {
\r
195 toUBytesArray[toUBytesBegin + 0] = (byte) (mySourceChar >> 8);
\r
196 toUBytesArray[toUBytesBegin + 1] = (byte) mySourceChar;
\r
199 toUBytesArray[toUBytesBegin + 0] = (byte) mySourceChar;
\r
202 if (targetUniChar == 0xfffe) {
\r
203 return CoderResult.unmappableForLength(toULength);
\r
205 return CoderResult.malformedForLength(toULength);
\r
209 return CoderResult.OVERFLOW;
\r
217 class CharsetEncoderHZ extends CharsetEncoderICU {
\r
// Inner encoder created from gbCharset; encodeLoop calls its fromUChar32 to map a source char.
218 CharsetMBCS.CharsetEncoderMBCS gbEncoder;
\r
// Set to true by encodeLoop after it passes SB_ESCAPE or DB_ESCAPE to concatEscape.
219 boolean isEscapeAppended = false;
\r
// Set by encodeLoop to whether the converted value is above 0x00FF, i.e. needs two bytes.
220 boolean isTargetUCharDBCS = false;
\r
// Creates the HZ encoder with fromUSubstitution as the substitution bytes passed to the superclass,
// and creates the inner encoder from gbCharset.newEncoder().
222 public CharsetEncoderHZ(CharsetICU cs) {
\r
223 super(cs, fromUSubstitution);
\r
224 gbEncoder = (CharsetMBCS.CharsetEncoderMBCS) gbCharset.newEncoder();
\r
// Resets the inner GB encoder and clears the escape-written and double-byte flags.
227 protected void implReset() {
\r
229 gbEncoder.implReset();
\r
231 isEscapeAppended = false;
\r
232 isTargetUCharDBCS = false;
\r
// Encodes chars from source into HZ bytes in target, one char per loop pass.
// A tilde is emitted through concatEscape with TILDE_ESCAPE. Chars up to 0x7f are used as-is, and
// other chars are mapped through gbEncoder.
// SB_ESCAPE or DB_ESCAPE is emitted first whenever the single/double-byte state differs from the
// previous char or no escape has been emitted yet.
// Bytes that do not fit in target are stored in errorBuffer.
// Returns UNDERFLOW or OVERFLOW, or an unmappable result for a char that has no usable mapping.
235 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
// One-element array so that gbEncoder.fromUChar32 can write its result into it.
237 int[] targetUniChar = new int[] { 0 };
\r
238 int mySourceChar = 0;
\r
239 boolean oldIsTargetUCharDBCS = isTargetUCharDBCS;
\r
241 if (!source.hasRemaining())
\r
242 return CoderResult.UNDERFLOW;
\r
243 else if (!target.hasRemaining())
\r
244 return CoderResult.OVERFLOW;
\r
// A non-zero fromUChar32 is passed to handleSurrogates before any new input is read.
246 if (fromUChar32 != 0 && target.hasRemaining()) {
\r
247 CoderResult cr = handleSurrogates(source, (char) fromUChar32);
\r
248 return (cr != null) ? cr : CoderResult.unmappableForLength(2);
\r
250 /* writing the char to the output stream */
\r
251 while (source.hasRemaining()) {
\r
252 targetUniChar[0] = MISSING_CHAR_MARKER;
\r
253 if (target.hasRemaining()) {
\r
255 mySourceChar = source.get();
\r
// Remember the mode of the previous char so a mode change can be detected below.
257 oldIsTargetUCharDBCS = isTargetUCharDBCS;
\r
258 if (mySourceChar == UCNV_TILDE) {
\r
260 * concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);
\r
262 concatEscape(source, target, offsets, TILDE_ESCAPE);
\r
264 } else if (mySourceChar <= 0x7f) {
\r
266 targetUniChar[0] = mySourceChar;
\r
268 length = gbEncoder.fromUChar32(mySourceChar, targetUniChar, super.isFallbackUsed());
\r
271 * we can only use lead bytes 21..7D and trail bytes 21..7E
\r
// Accept only two-byte results whose lead byte is 0xa1..0xfd and whose trail byte is 0xa1..0xfe.
273 if (length == 2 && 0xa1a1 <= targetUniChar[0] && targetUniChar[0] <= 0xfdfe
\r
274 && 0xa1 <= (targetUniChar[0] & 0xff) && (targetUniChar[0] & 0xff) <= 0xfe) {
\r
// Subtracting 0x8080 lowers each of the two bytes by 0x80.
275 targetUniChar[0] -= 0x8080;
\r
277 targetUniChar[0] = MISSING_CHAR_MARKER;
\r
280 if (targetUniChar[0] != MISSING_CHAR_MARKER) {
\r
281 isTargetUCharDBCS = (targetUniChar[0] > 0x00FF);
\r
282 if (oldIsTargetUCharDBCS != isTargetUCharDBCS || !isEscapeAppended) {
\r
283 /* Shifting from a double byte to single byte mode */
\r
284 if (!isTargetUCharDBCS) {
\r
285 concatEscape(source, target, offsets, SB_ESCAPE);
\r
286 isEscapeAppended = true;
\r
288 * Shifting from a single byte to double byte mode
\r
290 concatEscape(source, target, offsets, DB_ESCAPE);
\r
291 isEscapeAppended = true;
\r
// Double-byte result: high byte first, then low byte; whatever does not fit goes to errorBuffer.
296 if (isTargetUCharDBCS) {
\r
297 if (target.hasRemaining()) {
\r
298 target.put((byte) (targetUniChar[0] >> 8));
\r
299 if (offsets != null) {
\r
300 offsets.put(source.position() - 1);
\r
302 if (target.hasRemaining()) {
\r
303 target.put((byte) targetUniChar[0]);
\r
304 if (offsets != null) {
\r
305 offsets.put(source.position() - 1);
\r
308 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];
\r
309 // *err = U_BUFFER_OVERFLOW_ERROR;
\r
312 errorBuffer[errorBufferLength++] = (byte) (targetUniChar[0] >> 8);
\r
313 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];
\r
314 // *err = U_BUFFER_OVERFLOW_ERROR;
\r
// Single-byte result: one byte to target, or to errorBuffer when target is full.
318 if (target.hasRemaining()) {
\r
319 target.put((byte) targetUniChar[0]);
\r
320 if (offsets != null) {
\r
321 offsets.put(source.position() - 1);
\r
325 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];
\r
326 // *err = U_BUFFER_OVERFLOW_ERROR;
\r
331 /* oops.. the code point is unassigned */
\r
332 /* Handle surrogates */
\r
333 /* check if the char is a First surrogate */
\r
335 if (UTF16.isSurrogate((char) mySourceChar)) {
\r
336 // use that handy handleSurrogates method everyone's been talking about!
\r
337 CoderResult cr = handleSurrogates(source, (char) mySourceChar);
\r
338 return (cr != null) ? cr : CoderResult.unmappableForLength(2);
\r
340 /* callback(unassigned) for a BMP code point */
\r
341 // *err = U_INVALID_CHAR_FOUND;
\r
342 fromUChar32 = mySourceChar;
\r
343 return CoderResult.unmappableForLength(1);
\r
347 // *err = U_BUFFER_OVERFLOW_ERROR;
\r
348 return CoderResult.OVERFLOW;
\r
352 return CoderResult.UNDERFLOW;
\r
// Emits the bytes of strToAppend, one per loop pass.
// While target has room, source.position() - 1 is recorded in offsets when offsets is non-null.
// Otherwise the byte is stored in errorBuffer and cr is set to CoderResult.OVERFLOW.
// cr starts as null and is only reassigned on that overflow path.
// NOTE(review): the statement that writes the byte to target is not visible in this listing.
355 private CoderResult concatEscape(CharBuffer source, ByteBuffer target, IntBuffer offsets, byte[] strToAppend) {
\r
356 CoderResult cr = null;
\r
357 for (int i=0; i<strToAppend.length; i++) {
\r
358 byte b = strToAppend[i];
\r
359 if (target.hasRemaining()) {
\r
361 if (offsets != null)
\r
362 offsets.put(source.position() - 1);
\r
364 errorBuffer[errorBufferLength++] = b;
\r
365 cr = CoderResult.OVERFLOW;
\r
// Returns a new CharsetDecoderHZ constructed with this charset.
372 public CharsetDecoder newDecoder() {
\r
373 return new CharsetDecoderHZ(this);
\r
// Returns a new CharsetEncoderHZ constructed with this charset.
376 public CharsetEncoder newEncoder() {
\r
377 return new CharsetEncoderHZ(this);
\r
// Adds the code points 0..0x7f to setFillIn, then lets gbCharset add its own code points through
// MBCSGetFilteredUnicodeSetForUnicode with the UCNV_SET_FILTER_HZ filter; which is passed through unchanged.
380 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
\r
381 setFillIn.add(0,0x7f);
\r
382 // CharsetMBCS mbcshz = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
383 gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ);
\r