2 *******************************************************************************
3 * Copyright (C) 2008-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.charset;
9 import java.nio.ByteBuffer;
10 import java.nio.CharBuffer;
11 import java.nio.IntBuffer;
12 import java.nio.charset.CharsetDecoder;
13 import java.nio.charset.CharsetEncoder;
14 import java.nio.charset.CoderResult;
15 import java.util.Arrays;
17 import com.ibm.icu.charset.CharsetMBCS.CharsetDecoderMBCS;
18 import com.ibm.icu.charset.CharsetMBCS.CharsetEncoderMBCS;
19 import com.ibm.icu.lang.UCharacter;
20 import com.ibm.icu.text.UTF16;
21 import com.ibm.icu.text.UnicodeSet;
23 class CharsetISO2022 extends CharsetICU {
24 private UConverterDataISO2022 myConverterData;
25 private int variant; // one of enum {ISO_2022_JP, ISO_2022_KR, or ISO_2022_CN}
27 private static final byte[] SHIFT_IN_STR = { 0x0f };
28 // private static final byte[] SHIFT_OUT_STR = { 0x0e };
30 private static final byte CR = 0x0D;
31 private static final byte LF = 0x0A;
33 private static final byte H_TAB = 0x09;
34 private static final byte SPACE = 0x20;
36 private static final char HWKANA_START = 0xff61;
37 private static final char HWKANA_END = 0xff9f;
40 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
41 * as bytes 21..7E. (Subtract 0x80.)
42 * 96-character sets with native bit values A0..FF are encoded in ISO 2022
43 * as bytes 20..7F. (Subtract 0x80.)
44 * Do not encode C1 control codes with native bytes 80..9F
45 * as bytes 00..1F (C0 control codes).
48 private static final char GR94_START = 0xa1;
49 private static final char GR94_END = 0xfe;
51 private static final char GR96_START = 0xa0;
52 private static final char GR96_END = 0xff;
54 /* for ISO-2022-JP and -CN implementations */
57 private static final byte INVALID_STATE = -1;
58 private static final byte ASCII = 0;
60 private static final byte SS2_STATE = 0x10;
61 private static final byte SS3_STATE = 0x11;
64 private static final byte ISO8859_1 = 1;
65 private static final byte ISO8859_7 = 2;
66 private static final byte JISX201 = 3;
67 private static final byte JISX208 = 4;
68 private static final byte JISX212 = 5;
69 private static final byte GB2312 = 6;
70 private static final byte KSC5601 = 7;
71 private static final byte HWKANA_7BIT = 8; /* Halfwidth Katakana 7 bit */
74 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
75 private static final byte GB2312_1 = 1;
76 private static final byte ISO_IR_165= 2;
77 private static final byte CNS_11643 = 3;
80 * these are used in StateEnum and ISO2022State variables,
81 * but CNS_11643 must be used to index into myConverterArray[]
83 private static final byte CNS_11643_0 = 0x20;
84 private static final byte CNS_11643_1 = 0x21;
85 private static final byte CNS_11643_2 = 0x22;
86 private static final byte CNS_11643_3 = 0x23;
87 private static final byte CNS_11643_4 = 0x24;
88 private static final byte CNS_11643_5 = 0x25;
89 private static final byte CNS_11643_6 = 0x26;
90 private static final byte CNS_11643_7 = 0x27;
94 public CharsetISO2022(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
95 super(icuCanonicalName, javaCanonicalName, aliases);
97 myConverterData = new UConverterDataISO2022();
99 int versionIndex = icuCanonicalName.indexOf("version=");
100 int version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
102 myConverterData.version = version;
104 if (icuCanonicalName.indexOf("locale=ja") > 0) {
105 ISO2022InitJP(version);
106 } else if (icuCanonicalName.indexOf("locale=zh") > 0) {
107 ISO2022InitCN(version);
108 } else /* if (icuCanonicalName.indexOf("locale=ko") > 0) */ {
109 ISO2022InitKR(version);
112 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder();
113 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder();
116 private void ISO2022InitJP(int version) {
117 variant = ISO_2022_JP;
122 // open the required converters and cache them
123 if((jpCharsetMasks[version]&CSM(ISO8859_7)) != 0) {
124 myConverterData.myConverterArray[ISO8859_7] = ((CharsetMBCS)CharsetICU.forNameICU("ISO8859_7")).sharedData;
126 // myConverterData.myConverterArray[JISX201] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-201")).sharedData;
127 myConverterData.myConverterArray[JISX208] = ((CharsetMBCS)CharsetICU.forNameICU("Shift-JIS")).sharedData;
128 if ((jpCharsetMasks[version]&CSM(JISX212)) != 0) {
129 myConverterData.myConverterArray[JISX212] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-212")).sharedData;
131 if ((jpCharsetMasks[version]&CSM(GB2312)) != 0) {
132 myConverterData.myConverterArray[GB2312] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData;
134 if ((jpCharsetMasks[version]&CSM(KSC5601)) != 0) {
135 myConverterData.myConverterArray[KSC5601] = ((CharsetMBCS)CharsetICU.forNameICU("ksc_5601")).sharedData;
138 // create a generic CharsetMBCS object
139 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
142 private void ISO2022InitCN(int version) {
143 variant = ISO_2022_CN;
148 // open the required converters and cache them.
149 myConverterData.myConverterArray[GB2312_1] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData;
151 myConverterData.myConverterArray[ISO_IR_165] = ((CharsetMBCS)CharsetICU.forNameICU("iso-ir-165")).sharedData;
153 myConverterData.myConverterArray[CNS_11643] = ((CharsetMBCS)CharsetICU.forNameICU("cns-11643-1992")).sharedData;
155 // create a generic CharsetMBCS object
156 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
159 private void ISO2022InitKR(int version) {
160 variant = ISO_2022_KR;
167 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
168 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0];
170 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("ibm-949");
173 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder();
174 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder();
178 * ISO 2022 control codes must not be converted from Unicode
179 * because they would mess up the byte stream.
180 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
181 * corresponding to SO, SI, and ESC.
183 private static boolean IS_2022_CONTROL(int c) {
184 return (c<0x20) && (((1<<c) & 0x0800c000) != 0);
188 * Check that the result is a 2-byte value with each byte in the range A1..FE
189 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
190 * to move it to the ISO 2022 range 21..7E.
191 * return 0 if out of range.
193 private static int _2022FromGR94DBCS(int value) {
194 if ((value <= 0xfefe && value >= 0xa1a1) &&
195 ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) {
196 return (value - 0x8080); /* shift down to 21..7e byte range */
198 return 0; /* not valid for ISO 2022 */
203 * Commented out because Ticket 5691: Call sites now check for validity. They can just += 0x8080 after that.
205 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
206 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
209 private static int _2022ToGR94DBCS(int value) {
210 int returnValue = value + 0x8080;
212 if ((returnValue <= 0xfefe && returnValue >= 0xa1a1) &&
213 ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) {
220 /* is the StateEnum charset value for a DBCS charset? */
221 private static boolean IS_JP_DBCS(byte cs) {
222 return ((JISX208 <= cs) && (cs <= KSC5601));
225 private static short CSM(short cs) {
226 return (short)(1<<cs);
229 /* This gets the valid index of the end of buffer when decoding. */
230 private static int getEndOfBuffer_2022(ByteBuffer source) {
231 int sourceIndex = source.position();
233 mySource = source.get(sourceIndex);
235 while (source.hasRemaining() && mySource != ESC_2022) {
236 mySource = source.get();
237 if (mySource == ESC_2022) {
246 * This is a simple version of _MBCSGetNextUChar(): it calls the method in CharsetDecoderMBCS and returns
252 * otherwise the Unicode code point
254 private int MBCSSimpleGetNextUChar(UConverterSharedData sharedData,
256 boolean useFallback) {
258 UConverterSharedData tempSharedData = myConverterData.currentConverter.sharedData;
259 myConverterData.currentConverter.sharedData = sharedData;
260 returnValue = myConverterData.currentDecoder.simpleGetNextUChar(source, useFallback);
261 myConverterData.currentConverter.sharedData = tempSharedData;
267 * @param retval retval[0] receives the output byte
268 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
270 static int MBCSSingleFromUChar32(UConverterSharedData sharedData, int c, int[] retval, boolean useFallback) {
273 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
274 if (c >= 0x10000 && (sharedData.mbcs.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
277 /* convert the Unicode code point in c into codepage bytes */
278 table = sharedData.mbcs.fromUnicodeTable;
279 /* get the byte for the output */
280 value = CharsetMBCS.MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c);
281 /* get the byte for the output */
282 retval[0] = value & 0xff;
283 if (value >= 0xf00) {
284 return 1; /* roundtrip */
285 } else if (useFallback ? value>=0x800 : value>=0xc00) {
286 return -1; /* fallback taken */
288 return 0; /* no mapping */
293 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
294 * to whether that charset is used in the corresponding version x of ISO_2022, locale=ja,version=x
296 * Note: The converter uses some leniency:
297 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
298 * all versions, not just JIS7 and JIS8.
299 * - ICU does not distinguish between different versions of JIS X 0208.
301 private static final short jpCharsetMasks[] = {
302 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)),
303 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)),
304 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)),
305 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)),
306 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7))
311 private static final byte ASCII1 = 0;
312 private static final byte LATIN1 = 1;
313 private static final byte SBCS = 2;
314 private static final byte DBCS = 3;
315 private static final byte MBCS = 4;
316 private static final byte HWKANA = 5;
320 private static class ISO2022State {
321 private byte []cs; /* Charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
322 private byte g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
323 private byte prevG; /* g before single shift (SS2 or SS3) */
330 Arrays.fill(cs, (byte)0);
336 // private static final byte UCNV_OPTIONS_VERSION_MASK = 0xf;
337 private static final byte UCNV_2022_MAX_CONVERTERS = 10;
339 @SuppressWarnings("unused")
340 private static class UConverterDataISO2022 {
341 UConverterSharedData []myConverterArray;
342 CharsetEncoderMBCS currentEncoder;
343 CharsetDecoderMBCS currentDecoder;
344 CharsetMBCS currentConverter;
345 int currentType; // Cnv2022Type;
346 ISO2022State toU2022State;
347 ISO2022State fromU2022State;
350 boolean isEmptySegment;
352 UConverterDataISO2022() {
353 myConverterArray = new UConverterSharedData[UCNV_2022_MAX_CONVERTERS];
354 toU2022State = new ISO2022State();
355 fromU2022State = new ISO2022State();
359 isEmptySegment = false;
363 toU2022State.reset();
364 fromU2022State.reset();
365 isEmptySegment = false;
369 private static final byte ESC_2022 = 0x1B; /* ESC */
372 private static final byte INVALID_2022 = -1; /* Doesn't correspond to a valid iso 2022 escape sequence */
373 private static final byte VALID_NON_TERMINAL_2022 = 0; /* so far corresponds to a valid iso 2022 escape sequence */
374 private static final byte VALID_TERMINAL_2022 = 1; /* corresponds to a valid iso 2022 escape sequence */
375 private static final byte VALID_MAYBE_TERMINAL_2022 = 2; /* so far matches one iso 2022 escape sequence, but by adding
376 more characters might match another escape sequence */
377 // } UCNV_TableStates_2022;
380 * The way these state transition arrays work is:
381 * ex : ESC$B is the sequence for JISX208
382 * a) First Iteration: char is ESC
383 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
384 * int x = normalize_esq_chars_2022[27] which is equal to 1
385 * ii) Search for this value in escSeqStateTable_Key_2022[]
386 * value of x is stored at escSeqStateTable_Key_2022[0]
387 * iii) Save this index as offset
388 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
389 * escSeqStateTable_value_2022[offset], which is VALID_NON_TERMINAL_2022
390 * b) Switch on this state and continue to next char
391 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
392 * which is normalize_esq_chars_2022[36] == 4
393 * ii) x is currently 1(from above)
394 * x<<=5 -- x is now 32
395 * x+=normalize_esq_chars_2022[36]
397 * iii) Search for this value in escSeqStateTable_Key_2022[]
398 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
399 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
400 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
401 * c) Switch on this state and continue to next char
402 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
403 * ii) x is currently 36 (from above)
404 * x<<=5 -- x is now 1152
405 * x+= normalize_esq_chars_2022[66]
407 * iii) Search for this value in escSeqStateTable_Key_2022[]
408 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
409 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
410 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
411 * v) Get the charset state from nextStateToUnicodeJP[24] which is JISX208
413 /* Below are the 3 arrays depicting a state transition table */
414 private static final byte normalize_esq_chars_2022[] = {
415 /* 0 1 2 3 4 5 6 7 8 9 */
416 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
417 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
418 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
419 0, 0, 0, 0, 0, 0, 4, 7, 29, 0,
420 2, 24, 26, 27, 0, 3, 23, 6, 0, 0,
421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
422 0, 0, 0, 0, 5, 8, 9, 10, 11, 12,
423 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
424 0, 0, 21, 0, 0, 0, 0, 0, 0, 0,
425 22, 0, 0, 0, 0, 0, 0, 0, 0, 0,
426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
434 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
438 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
439 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
441 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
445 private static final short MAX_STATES_2022 = 74;
446 private static final int escSeqStateTable_Key_2022[/* MAX_STATES_2022 */] = {
447 /* 0 1 2 3 4 5 6 7 8 9 */
448 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
449 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
450 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
451 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
452 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
453 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
454 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
455 35947631, 35947635, 35947636, 35947638
458 private static final byte escSeqStateTable_Value_2022[/* MAX_STATES_2022 */] = {
460 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022,
461 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
462 VALID_MAYBE_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
463 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
464 VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
465 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
466 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
467 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
468 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
469 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
470 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
471 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
472 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
473 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
474 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022
477 /* Type def for refactoring changeState_2022 code */
479 private static final byte ISO_2022_JP = 1;
480 private static final byte ISO_2022_KR = 2;
481 private static final byte ISO_2022_CN = 3;
484 /* const UConverterSharedData _ISO2022Data; */
485 //private UConverterSharedData _ISO2022JPData;
486 //private UConverterSharedData _ISO2022KRData;
487 //private UConverterSharedData _ISO2022CNData;
489 /******************** to unicode ********************/
490 /****************************************************
491 * Recognized escape sequences are
503 private final static byte nextStateToUnicodeJP[/* MAX_STATES_2022 */] = {
504 /* 0 1 2 3 4 5 6 7 8 9 */
505 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
506 ASCII, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, JISX201, HWKANA_7BIT, JISX201, INVALID_STATE,
507 INVALID_STATE, INVALID_STATE, JISX208, GB2312, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
508 ISO8859_1, ISO8859_7, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, KSC5601, JISX212, INVALID_STATE,
509 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
510 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
511 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
512 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
515 private final static byte nextStateToUnicodeCN[/* MAX_STATES_2022 */] = {
516 /* 0 1 2 3 4 5 6 7 8 9 */
517 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, SS3_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
518 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
519 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
520 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
521 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, GB2312_1, INVALID_STATE, ISO_IR_165,
522 CNS_11643_1, CNS_11643_2, CNS_11643_3, CNS_11643_4, CNS_11643_5, CNS_11643_6, CNS_11643_7, INVALID_STATE, INVALID_STATE, INVALID_STATE,
523 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
524 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
527 /* runs through a state machine to determine the escape sequence - codepage correspondence */
528 @SuppressWarnings("fallthrough")
529 private CoderResult changeState_2022(CharsetDecoderICU decoder, ByteBuffer source, int var) {
530 CoderResult err = CoderResult.UNDERFLOW;
531 boolean DONE = false;
533 int key[] = {myConverterData.key};
535 int initialToULength = decoder.toULength;
537 int malformLength = 0;
539 value = VALID_NON_TERMINAL_2022;
540 while (source.hasRemaining()) {
543 decoder.toUBytesArray[decoder.toULength++] = c;
544 value = getKey_2022(c, key, offset);
548 case VALID_NON_TERMINAL_2022:
549 /* continue with the loop */
552 case VALID_TERMINAL_2022:
561 case VALID_MAYBE_TERMINAL_2022:
562 /* not ISO_2022 itself, finish here */
563 value = VALID_TERMINAL_2022;
573 myConverterData.key = key[0];
575 if (value == VALID_NON_TERMINAL_2022) {
576 /* indicate that the escape sequence is incomplete: key !=0 */
578 } else if (value == INVALID_2022) {
579 err = CoderResult.malformedForLength(malformLength);
580 } else /* value == VALID_TERMINAL_2022 */ {
583 byte tempState = nextStateToUnicodeJP[offset[0]];
586 err = CoderResult.malformedForLength(malformLength);
589 if (myConverterData.toU2022State.cs[2] != 0) {
590 if (myConverterData.toU2022State.g < 2) {
591 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
593 myConverterData.toU2022State.g = 2;
595 /* illegal to have SS2 before a matching designator */
596 err = CoderResult.malformedForLength(malformLength);
599 /* case SS3_STATE: not used in ISO-2022-JP-x */
602 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) {
603 err = CoderResult.unmappableForLength(malformLength);
605 /* G2 charset for SS2 */
606 myConverterData.toU2022State.cs[2] = tempState;
610 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) {
611 err = CoderResult.unmappableForLength(source.position() - 1);
614 myConverterData.toU2022State.cs[0] = tempState;
621 byte tempState = nextStateToUnicodeCN[offset[0]];
624 err = CoderResult.unmappableForLength(malformLength);
627 if (myConverterData.toU2022State.cs[2] != 0) {
628 if (myConverterData.toU2022State.g < 2) {
629 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
631 myConverterData.toU2022State.g = 2;
633 /* illegal to have SS2 before a matching designator */
634 err = CoderResult.malformedForLength(malformLength);
638 if (myConverterData.toU2022State.cs[3] != 0) {
639 if (myConverterData.toU2022State.g < 2) {
640 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
642 myConverterData.toU2022State.g = 3;
644 /* illegal to have SS3 before a matching designator */
645 err = CoderResult.malformedForLength(malformLength);
649 if (myConverterData.version == 0) {
650 err = CoderResult.unmappableForLength(malformLength);
657 myConverterData.toU2022State.cs[1] = tempState;
660 myConverterData.toU2022State.cs[2] = tempState;
663 /* other CNS 11643 planes */
664 if (myConverterData.version == 0) {
665 err = CoderResult.unmappableForLength(source.position() - 1);
667 myConverterData.toU2022State.cs[3] = tempState;
674 if (offset[0] == 0x30) {
675 /* nothing to be done, just accept this one escape sequence */
677 err = CoderResult.unmappableForLength(malformLength);
681 err = CoderResult.malformedForLength(malformLength);
685 if (!err.isError()) {
686 decoder.toULength = 0;
687 } else if (err.isMalformed()) {
688 if (decoder.toULength > 1) {
690 * Ticket 5691: consistent illegal sequences:
691 * - We include at least the first byte (ESC) in the illegal sequence.
692 * - If any of the non-initial bytes could be the start of a character,
693 * we stop the illegal sequence before the first one of those.
694 * In escape sequences, all following bytes are "printable", that is,
695 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
696 * they are valid single/lead bytes.
697 * For simplicity, we always only report the initial ESC byte as the
698 * illegal sequence and back out all other bytes we looked at.
700 /* Back out some bytes. */
701 int backOutDistance = decoder.toULength - 1;
702 int bytesFromThisBuffer = decoder.toULength - initialToULength;
703 if (backOutDistance <= bytesFromThisBuffer) {
704 /* same as initialToULength<=1 */
705 source.position(source.position() - backOutDistance);
707 /* Back out bytes from the previous buffer: Need to replay them. */
708 decoder.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
709 /* same as -(initialToULength-1) */
710 /* preToULength is negative! */
711 for (int i = 0; i < -(decoder.preToULength); i++) {
712 decoder.preToUArray[i] = decoder.toUBytesArray[i+1];
714 source.position(source.position() - bytesFromThisBuffer);
716 decoder.toULength = 1;
723 private static byte getKey_2022(byte c, int[]key, int[]offset) {
726 int hi = MAX_STATES_2022;
729 togo = normalize_esq_chars_2022[(short)c&UConverterConstants.UNSIGNED_BYTE_MASK];
732 /* not a valid character anywhere in an escape sequence */
737 togo = (key[0] << 5) + togo;
739 while (hi != low) { /* binary search */
740 int mid = (hi+low) >> 1; /* Finds median */
746 if (escSeqStateTable_Key_2022[mid] > togo) {
748 } else if (escSeqStateTable_Key_2022[mid] < togo) {
750 } else /* we found it */ {
753 return escSeqStateTable_Value_2022[mid];
761 * To Unicode Callback helper function
763 private static CoderResult toUnicodeCallback(CharsetDecoderICU cnv, int sourceChar, int targetUniChar) {
764 CoderResult err = CoderResult.UNDERFLOW;
765 if (sourceChar > 0xff) {
766 cnv.toUBytesArray[0] = (byte)(sourceChar>>8);
767 cnv.toUBytesArray[1] = (byte)sourceChar;
770 cnv.toUBytesArray[0] = (byte)sourceChar;
774 if (targetUniChar == (UConverterConstants.missingCharMarker-1/* 0xfffe */)) {
775 err = CoderResult.unmappableForLength(1);
777 err = CoderResult.malformedForLength(1);
783 /****************************ISO-2022-JP************************************/
784 private class CharsetDecoderISO2022JP extends CharsetDecoderICU {
785 public CharsetDecoderISO2022JP(CharsetICU cs) {
789 protected void implReset() {
791 myConverterData.reset();
794 * Map 00..7F to Unicode according to JIS X 0201.
796 private int jisx201ToU(int value) {
799 } else if (value == 0x5c) {
801 } else if (value == 0x7e) {
803 } else { /* value <= 0x7f */
808 * Convert a pair of JIS X 208 21..7E bytes to Shift-JIS.
809 * If either byte is outside 21..7E make sure that the result is not valid
810 * for Shift-JIS so that the converter catches it.
811 * Some invalid byte values already turn into equally invalid Shift-JIS
812 * byte values and need not be tested explicitly.
814 private void _2022ToSJIS(char c1, char c2, byte []bytes) {
819 } else if (c2 <= 0x7e) {
822 c2 = 0; /* invalid */
825 if ((c2 >= 0x21) && (c2 <= 0x7e)) {
828 c2 = 0; /* invalid */
835 } else if (c1 <= 0x3f) {
838 c1 = 0; /* invalid */
840 bytes[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c1);
841 bytes[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c2);
844 @SuppressWarnings("fallthrough")
845 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
846 boolean gotoGetTrail = false;
847 boolean gotoEscape = false;
848 CoderResult err = CoderResult.UNDERFLOW;
849 byte []tempBuf = new byte[2];
850 int targetUniChar = 0x0000;
851 int mySourceChar = 0x0000;
852 int mySourceCharTemp = 0x0000; // use for getTrail label call.
853 byte cs; /* StateEnum */
854 byte csTemp= 0; // use for getTrail label call.
856 if (myConverterData.key != 0) {
857 /* continue with a partial escape sequence */
860 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
861 /* continue with a partial double-byte character */
862 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
864 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
865 // goto getTrailByte;
866 mySourceCharTemp = 0x99;
870 while (source.hasRemaining() || gotoEscape || gotoGetTrail) {
871 // This code is here for the goto escape label call above.
873 mySourceCharTemp = ESC_2022;
876 targetUniChar = UConverterConstants.missingCharMarker;
878 if (gotoEscape || gotoGetTrail || target.hasRemaining()) {
879 if (!gotoEscape && !gotoGetTrail) {
880 mySourceChar = source.get() & UConverterConstants.UNSIGNED_BYTE_MASK;
881 mySourceCharTemp = mySourceChar;
884 switch (mySourceCharTemp) {
885 case UConverterConstants.SI:
886 if (myConverterData.version == 3) {
887 myConverterData.toU2022State.g = 0;
890 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
891 myConverterData.isEmptySegment = false;
895 case UConverterConstants.SO:
896 if (myConverterData.version == 3) {
897 /* JIS7: switch to G1 half-width Katakana */
898 myConverterData.toU2022State.cs[1] = HWKANA_7BIT;
899 myConverterData.toU2022State.g = 1;
902 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
903 myConverterData.isEmptySegment = false; /* reset this, we have a different error */
909 source.position(source.position() - 1);
915 int mySourceBefore = source.position();
916 int toULengthBefore = this.toULength;
918 err = changeState_2022(this, source, variant);
920 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
921 if(myConverterData.version == 0 && myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) {
922 err = CoderResult.malformedForLength(source.position() - mySourceBefore);
923 this.toULength = toULengthBefore + (source.position() - mySourceBefore);
927 /* invalid or illegal escape sequence */
929 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */
932 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
933 if(myConverterData.key == 0) {
934 myConverterData.isEmptySegment = true;
938 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
942 /* automatically reset to single-byte mode */
943 if (myConverterData.toU2022State.cs[0] != ASCII && myConverterData.toU2022State.cs[0] != JISX201) {
944 myConverterData.toU2022State.cs[0] = ASCII;
946 myConverterData.toU2022State.cs[2] = 0;
947 myConverterData.toU2022State.g = 0;
950 /* convert one or two bytes */
951 myConverterData.isEmptySegment = false;
952 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
957 if (!gotoGetTrail && ((mySourceChar >= 0xa1) && (mySourceChar <= 0xdf) && myConverterData.version == 4 && !IS_JP_DBCS(cs))) {
958 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
959 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
961 /* return from a single-shift state to the previous one */
962 if (myConverterData.toU2022State.g >= 2) {
963 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
968 if (mySourceChar <= 0x7f) {
969 targetUniChar = mySourceChar;
973 if (mySourceChar <= 0x7f) {
974 targetUniChar = mySourceChar + 0x80;
976 /* return from a single-shift state to the previous one */
977 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
980 if (mySourceChar <= 0x7f) {
981 /* convert mySourceChar+0x80 to use a normal 8-bit table */
982 targetUniChar = CharsetMBCS.MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myConverterData.myConverterArray[cs].mbcs,
985 /* return from a single-shift state to the previous one */
986 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
989 if (mySourceChar <= 0x7f) {
990 targetUniChar = jisx201ToU(mySourceChar);
994 if ((mySourceChar >= 0x21) && (mySourceChar <= 0x5f)) {
995 /* 7-bit halfwidth Katakana */
996 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
1001 if (gotoGetTrail || source.hasRemaining()) {
1004 gotoGetTrail = false;
1006 boolean leadIsOk, trailIsOk;
1008 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
1010 * Ticket 5691: consistent illegal sequences:
1011 * - We include at least the first byte in the illegal sequence.
1012 * - If any of the non-initial bytes could be the start of a character,
1013 * we stop the illegal sequence before the first one of those.
1015 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
1016 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
1017 * Otherwise we convert or report the pair of bytes.
1019 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
1020 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
1021 if (leadIsOk && trailIsOk) {
1023 tmpSourceChar = (mySourceChar << 8) | trailByte;
1024 if (cs == JISX208) {
1025 _2022ToSJIS((char)mySourceChar, (char)trailByte, tempBuf);
1026 mySourceChar = tmpSourceChar;
1028 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
1029 mySourceChar = tmpSourceChar;
1030 if (cs == KSC5601) {
1031 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
1033 tempBuf[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (tmpSourceChar >> 8));
1034 tempBuf[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & tmpSourceChar);
1036 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.myConverterArray[cs], ByteBuffer.wrap(tempBuf), false);
1037 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
1038 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
1040 /* add another bit so that the code below writes 2 bytes in case of error */
1041 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
1044 toUBytesArray[0] = (byte)mySourceChar;
1049 } /* end of inner switch */
1052 } /* end of outer switch */
1054 if (targetUniChar < (UConverterConstants.missingCharMarker-1/*0xfffe*/)) {
1055 if (offsets != null) {
1056 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
1058 target.put((char)targetUniChar);
1059 } else if (targetUniChar > UConverterConstants.missingCharMarker) {
1060 /* disassemble the surrogate pair and write to output */
1061 targetUniChar -= 0x0010000;
1062 target.put((char)(0xd800 + (char)(targetUniChar>>10)));
1063 target.position(target.position()-1);
1064 if (offsets != null) {
1065 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
1068 if (target.hasRemaining()) {
1069 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff)));
1070 target.position(target.position()-1);
1071 if (offsets != null) {
1072 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
1076 charErrorBufferArray[charErrorBufferLength++] =
1077 (char)(0xdc00+(char)(targetUniChar&0x3ff));
1080 /* Call the callback function */
1081 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
1084 } else { /* goes with "if (target.hasRemaining())" way up near the top of the function */
1085 err = CoderResult.OVERFLOW;
1092 } // end of class CharsetDecoderISO2022JP
1094 /****************************ISO-2022-CN************************************/
1095 private class CharsetDecoderISO2022CN extends CharsetDecoderICU {
1096 public CharsetDecoderISO2022CN(CharsetICU cs) {
1100 protected void implReset() {
1102 myConverterData.reset();
1105 @SuppressWarnings("fallthrough")
1106 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
1107 CoderResult err = CoderResult.UNDERFLOW;
1108 byte[] tempBuf = new byte[3];
1109 int targetUniChar = 0x0000;
1110 int mySourceChar = 0x0000;
1111 int mySourceCharTemp = 0x0000;
1112 boolean gotoEscape = false;
1113 boolean gotoGetTrailByte = false;
1115 if (myConverterData.key != 0) {
1116 /* continue with a partial escape sequence */
1119 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
1120 /* continue with a partial double-byte character */
1121 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
1123 targetUniChar = UConverterConstants.missingCharMarker;
1124 // goto getTrailByte
1125 gotoGetTrailByte = true;
1128 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
1129 targetUniChar = UConverterConstants.missingCharMarker;
1131 if (target.hasRemaining() || gotoEscape) {
1133 mySourceChar = ESC_2022; // goto escape label
1134 mySourceCharTemp = mySourceChar;
1135 } else if (gotoGetTrailByte) {
1136 mySourceCharTemp = 0xff; // goto getTrailByte; set mySourceCharTemp to go to default
1138 mySourceChar = UConverterConstants.UNSIGNED_BYTE_MASK & source.get();
1139 mySourceCharTemp = mySourceChar;
1142 switch (mySourceCharTemp) {
1143 case UConverterConstants.SI:
1144 myConverterData.toU2022State.g = 0;
1145 if (myConverterData.isEmptySegment) {
1146 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
1147 err = CoderResult.malformedForLength(1);
1148 this.toUBytesArray[0] = (byte)mySourceChar;
1154 case UConverterConstants.SO:
1155 if (myConverterData.toU2022State.cs[1] != 0) {
1156 myConverterData.toU2022State.g = 1;
1157 myConverterData.isEmptySegment = true; /* Begin a new segment, empty so far */
1160 /* illegal to have SO before a matching designator */
1161 myConverterData.isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
1167 source.position(source.position()-1);
1172 int mySourceBefore = source.position();
1173 int toULengthBefore = this.toULength;
1175 err = changeState_2022(this, source, ISO_2022_CN);
1177 /* After SO there must be at least one character before a designator (designator error handled separately) */
1178 if(myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) {
1179 err = CoderResult.malformedForLength(source.position() - mySourceBefore);
1180 this.toULength = toULengthBefore + (source.position() - mySourceBefore);
1184 /* invalid or illegal escape sequence */
1186 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */
1191 /*ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
1195 myConverterData.toU2022State.reset();
1198 /* convert one or two bytes */
1199 myConverterData.isEmptySegment = false;
1200 if (myConverterData.toU2022State.g != 0 || gotoGetTrailByte) {
1201 if (source.hasRemaining() || gotoGetTrailByte) {
1202 UConverterSharedData cnv;
1205 boolean leadIsOk, trailIsOk;
1207 // getTrailByte: label
1208 gotoGetTrailByte = false; // reset gotoGetTrailByte
1210 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
1212 * Ticket 5691: consistent illegal sequences:
1213 * - We include at least the first byte in the illegal sequence.
1214 * - If any of the non-initial bytes could be the start of a character,
1215 * we stop the illegal sequence before the first one of those.
1217 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
1218 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
1219 * Otherwise we convert or report the pair of bytes.
1221 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
1222 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
1223 if (leadIsOk && trailIsOk) {
1225 tempState = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
1226 if (tempState > CNS_11643_0) {
1227 cnv = myConverterData.myConverterArray[CNS_11643];
1228 tempBuf[0] = (byte)(0x80 + (tempState - CNS_11643_0));
1229 tempBuf[1] = (byte)mySourceChar;
1230 tempBuf[2] = (byte)trailByte;
1233 cnv = myConverterData.myConverterArray[tempState];
1234 tempBuf[0] = (byte)mySourceChar;
1235 tempBuf[1] = (byte)trailByte;
1238 ByteBuffer tempBuffer = ByteBuffer.wrap(tempBuf);
1239 tempBuffer.limit(tempBufLen);
1240 targetUniChar = MBCSSimpleGetNextUChar(cnv, tempBuffer, false);
1241 mySourceChar = (mySourceChar << 8) | trailByte;
1243 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
1244 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
1246 /* add another bit so that the code below writes 2 bytes in case of error */
1247 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
1249 if (myConverterData.toU2022State.g >= 2) {
1250 /* return from a single-shift state to the previous one */
1251 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
1254 toUBytesArray[0] = (byte)mySourceChar;
1260 if (mySourceChar <= 0x7f) {
1261 targetUniChar = (char)mySourceChar;
1266 if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) < (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker-1))) {
1267 if (offsets != null) {
1268 offsets.array()[target.position()] = source.remaining() - (mySourceChar <= 0xff ? 1 : 2);
1270 target.put((char)targetUniChar);
1271 } else if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) > (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker))) {
1272 /* disassemble the surrogate pair and write to output */
1273 targetUniChar -= 0x0010000;
1274 target.put((char)(0xd800+(char)(targetUniChar>>10)));
1275 if (offsets != null) {
1276 offsets.array()[target.position()-1] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
1278 if (target.hasRemaining()) {
1279 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff)));
1280 if (offsets != null) {
1281 offsets.array()[target.position()-1] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
1284 charErrorBufferArray[charErrorBufferLength++] = (char)(0xdc00+(char)(targetUniChar&0x3ff));
1287 /* Call the callback function */
1288 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
1293 err = CoderResult.OVERFLOW;
1302 /************************ ISO-2022-KR ********************/
1303 private class CharsetDecoderISO2022KR extends CharsetDecoderICU {
1304 public CharsetDecoderISO2022KR(CharsetICU cs) {
1308 protected void implReset() {
1310 setInitialStateToUnicodeKR();
1311 myConverterData.reset();
1314 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
1315 CoderResult err = CoderResult.UNDERFLOW;
1316 int mySourceChar = 0x0000;
1317 int targetUniChar = 0x0000;
1318 byte[] tempBuf = new byte[2];
1319 boolean usingFallback;
1320 boolean gotoGetTrailByte = false;
1321 boolean gotoEscape = false;
1323 if (myConverterData.version == 1) {
1324 return decodeLoopIBM(myConverterData.currentDecoder, source, target, offsets, flush);
1327 /* initialize state */
1328 usingFallback = isFallbackUsed();
1330 if (myConverterData.key != 0) {
1331 /* continue with a partial escape sequence */
1333 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
1334 /* continue with a partial double-byte character */
1335 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
1337 gotoGetTrailByte = true;
1340 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
1341 if (target.hasRemaining() || gotoGetTrailByte || gotoEscape) {
1342 if (!gotoGetTrailByte && !gotoEscape) {
1343 mySourceChar = (char)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
1346 if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SI) {
1347 myConverterData.toU2022State.g = 0;
1348 if (myConverterData.isEmptySegment) {
1349 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
1350 err = CoderResult.malformedForLength(1);
1351 this.toUBytesArray[0] = (byte)mySourceChar;
1355 /* consume the source */
1357 } else if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SO) {
1358 myConverterData.toU2022State.g = 1;
1359 myConverterData.isEmptySegment = true;
1360 /* consume the source */
1362 } else if (!gotoGetTrailByte && (gotoEscape || mySourceChar == ESC_2022)) {
1364 source.position(source.position()-1);
1367 gotoEscape = false; // reset gotoEscape flag
1368 myConverterData.isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
1369 err = changeState_2022(this, source, ISO_2022_KR);
1370 if (err.isError()) {
1375 myConverterData.isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
1376 if (myConverterData.toU2022State.g == 1 || gotoGetTrailByte) {
1377 if (source.hasRemaining() || gotoGetTrailByte) {
1378 boolean leadIsOk, trailIsOk;
1380 // getTrailByte label
1381 gotoGetTrailByte = false; // reset gotoGetTrailByte flag
1383 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
1384 targetUniChar = UConverterConstants.missingCharMarker;
1386 * Ticket 5691: consistent illegal sequences:
1387 * - We include at least the first byte in the illegal sequence.
1388 * - If any of the non-initial bytes could be the start of a character,
1389 * we stop the illegal sequence before the first one of those.
1391 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
1392 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
1393 * Otherwise we convert or report the pair of bytes.
1395 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
1396 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
1397 if (leadIsOk && trailIsOk) {
1399 tempBuf[0] = (byte)(mySourceChar + 0x80);
1400 tempBuf[1] = (byte)(trailByte + 0x80);
1401 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, ByteBuffer.wrap(tempBuf), usingFallback);
1402 mySourceChar = (char)((mySourceChar << 8) | trailByte);
1403 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
1404 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
1406 /* add another bit so that the code below writes 2 bytes in case of error */
1407 mySourceChar = (char)(0x10000 | (mySourceChar << 8) | trailByte);
1410 toUBytesArray[0] = (byte)mySourceChar;
1414 } else if (mySourceChar <= 0x7f) {
1415 int savedSourceLimit = source.limit();
1416 int savedSourcePosition = source.position();
1417 source.limit(source.position());
1418 source.position(source.position()-1);
1419 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, source, usingFallback);
1420 source.limit(savedSourceLimit);
1421 source.position(savedSourcePosition);
1423 targetUniChar = 0xffff;
1425 if (targetUniChar < 0xfffe) {
1426 target.put((char)targetUniChar);
1427 if (offsets != null) {
1428 offsets.array()[target.position()] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
1431 /* Call the callback function */
1432 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
1436 err = CoderResult.OVERFLOW;
1444 protected CoderResult decodeLoopIBM(CharsetDecoderMBCS cnv, ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
1445 CoderResult err = CoderResult.UNDERFLOW;
1450 boolean gotoEscape = false;
1453 /* remember the original start of the input for offsets */
1454 sourceStart = argSource = source.position();
1456 if (myConverterData.key != 0) {
1457 /* continue with a partial escape sequence */
1461 while (gotoEscape || (!err.isError() && source.hasRemaining())) {
1463 /* Find the end of the buffer e.g : Next Escape Seq | end of Buffer */
1464 int oldSourcePos = source.position();
1465 sourceLimit = getEndOfBuffer_2022(source);
1466 source.position(oldSourcePos);
1467 if (source.position() != sourceLimit) {
1469 * get the current partial byte sequence
1471 * it needs to be moved between the public and the subconverter
1472 * so that the conversion framework, which only sees the public
1473 * converter, can handle truncated and illegal input etc.
1475 if (toULength > 0) {
1476 cnv.toUBytesArray = toUBytesArray.clone();
1478 cnv.toULength = toULength;
1481 * Convert up to the end of the input, or to before the next escape character.
1482 * Does not handle conversion extensions because the preToU[] state etc.
1485 argTarget = target.position();
1486 oldSourceLimit = source.limit(); // save the old source limit change to new one
1487 source.limit(sourceLimit);
1488 err = myConverterData.currentDecoder.cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush);
1489 source.limit(oldSourceLimit); // restore source limit;
1490 if (offsets != null && sourceStart != argSource) {
1491 /* update offsets to base them on the actual start of the input */
1492 int delta = argSource - sourceStart;
1493 while (argTarget < target.position()) {
1494 int currentOffset = offsets.get();
1495 offsets.position(offsets.position()-1);
1496 if (currentOffset >= 0) {
1497 offsets.put(currentOffset + delta);
1498 offsets.position(offsets.position()-1);
1504 argSource = source.position();
1506 /* copy input/error/overflow buffers */
1507 if (cnv.toULength > 0) {
1508 toUBytesArray = cnv.toUBytesArray.clone();
1510 toULength = cnv.toULength;
1512 if (err.isOverflow()) {
1513 if (cnv.charErrorBufferLength > 0) {
1514 charErrorBufferArray = cnv.charErrorBufferArray.clone();
1516 charErrorBufferLength = cnv.charErrorBufferLength;
1517 cnv.charErrorBufferLength = 0;
1521 if (err.isError() || err.isOverflow() || (source.position() == source.limit())) {
1527 err = changeState_2022(this, source, ISO_2022_KR);
1533 /******************** from unicode **********************/
1534 /* preference order of JP charsets */
1535 private final static byte []jpCharsetPref = {
1547 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1548 * not in order of jpCharsetPref[]!
1550 private final static byte [][]escSeqChars = {
1551 { 0x1B, 0x28, 0x42}, /* <ESC>(B ASCII */
1552 { 0x1B, 0x2E, 0x41}, /* <ESC>.A ISO-8859-1 */
1553 { 0x1B, 0x2E, 0x46}, /* <ESC>.F ISO-8859-7 */
1554 { 0x1B, 0x28, 0x4A}, /* <ESC>(J JISX-201 */
1555 { 0x1B, 0x24, 0x42}, /* <ESC>$B JISX-208 */
1556 { 0x1B, 0x24, 0x28, 0x44}, /* <ESC>$(D JISX-212 */
1557 { 0x1B, 0x24, 0x41}, /* <ESC>$A GB2312 */
1558 { 0x1B, 0x24, 0x28, 0x43}, /* <ESC>$(C KSC5601 */
1559 { 0x1B, 0x28, 0x49} /* <ESC>(I HWKANA_7BIT */
1562 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1564 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1565 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1566 * These were the only fallbacks in ICU's jisx-208.ucm file.
1568 private final static char []hwkana_fb = {
1569 0x2123, /* U+FF61 */
1584 0x213C, /* U+FF70 */
1600 0x253F, /* U+FF80 */
1616 0x255F, /* U+FF90 */
1634 protected byte [][]fromUSubstitutionChar = new byte[][]{ { (byte)0x1A }, { (byte)0x2F, (byte)0x7E} };
1635 /****************************ISO-2022-JP************************************/
1636 private class CharsetEncoderISO2022JP extends CharsetEncoderICU {
1637 public CharsetEncoderISO2022JP(CharsetICU cs) {
1638 super(cs, fromUSubstitutionChar[0]);
1641 protected void implReset() {
1643 myConverterData.reset();
1645 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
1646 private int jisx201FromU(int value) {
1647 if (value <= 0x7f) {
1648 if (value != 0x5c && value != 0x7e) {
1651 } else if (value == 0xa5) {
1653 } else if (value == 0x203e) {
1656 return (int)(UConverterConstants.UNSIGNED_INT_MASK & 0xfffe);
1660 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1661 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1662 * Return 0 if the byte pair is out of range.
1664 private int _2022FromSJIS(int value) {
1667 if (value > 0xEFFC) {
1668 return 0; /* beyond JIS X 0208 */
1671 trail = (short)(value & UConverterConstants.UNSIGNED_BYTE_MASK);
1673 value &= 0xff00; /* lead byte */
1674 if (value <= 0x9f00) {
1676 } else { /* 0xe000 <= value <= 0xef00 */
1682 if (trail <= 0x9e) {
1684 if (trail <= 0x7e) {
1685 value |= ((trail - 0x1f) & UConverterConstants.UNSIGNED_BYTE_MASK);
1687 value |= ((trail - 0x20) & UConverterConstants.UNSIGNED_BYTE_MASK);
1689 } else { /* trail <= 0xfc */
1690 value |= ((trail - 0x7e) & UConverterConstants.UNSIGNED_BYTE_MASK);
1695 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
1696 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
1697 CharBuffer source, ByteBuffer target, IntBuffer offsets){
1698 CoderResult err = CoderResult.UNDERFLOW;
1699 byte[] buffer = new byte[8];
1702 subchar = encoder.replacement();
1705 if (myConverterData.fromU2022State.g == 1) {
1706 /* JIS7: switch from G1 to G0 */
1707 myConverterData.fromU2022State.g = 0;
1708 buffer[i++] = UConverterConstants.SI;
1710 cs = myConverterData.fromU2022State.cs[0];
1712 if (cs != ASCII && cs != JISX201) {
1713 /* not in ASCII or JIS X 0201: switch to ASCII */
1714 myConverterData.fromU2022State.cs[0] = ASCII;
1720 buffer[i++] = subchar[0];
1722 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
1727 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
1728 CoderResult err = CoderResult.UNDERFLOW;
1733 byte[] choices = new byte[10];
1734 int targetValue = 0;
1735 boolean usingFallback;
1736 byte[] buffer = new byte[8];
1737 boolean getTrail = false; // use for getTrail label
1738 int oldSourcePos; // for proper error handling
1742 /* check if the last codepoint of previous buffer was a lead surrogate */
1743 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
1747 while (getTrail || source.hasRemaining()) {
1748 if (getTrail || target.hasRemaining()) {
1749 oldSourcePos = source.position();
1750 if (!getTrail) { /* skip if going to getTrail label */
1751 sourceChar = source.get();
1753 /* check if the char is a First surrogate */
1754 if (getTrail || UTF16.isSurrogate((char)sourceChar)) {
1755 if (getTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
1760 /* look ahead to find the trail surrogate */
1761 if (source.hasRemaining()) {
1762 /* test the following code unit */
1763 char trail = source.get();
1764 /* go back to the previous position */
1765 source.position(source.position()-1);
1766 if (UTF16.isTrailSurrogate(trail)) {
1768 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
1770 /* convert this supplementary code point */
1771 /* exit this condition tree */
1773 /* this is an unmatched lead code unit (1st surrogate) */
1774 /* callback(illegal) */
1775 err = CoderResult.malformedForLength(1);
1776 fromUChar32 = sourceChar;
1781 fromUChar32 = sourceChar;
1785 /* this is an unmatched trail code unit (2nd surrogate) */
1786 /* callback(illegal) */
1787 err = CoderResult.malformedForLength(1);
1788 fromUChar32 = sourceChar;
1793 /* do not convert SO/SI/ESC */
1794 if (IS_2022_CONTROL(sourceChar)) {
1795 /* callback(illegal) */
1796 err = CoderResult.malformedForLength(1);
1797 fromUChar32 = sourceChar;
1801 /* do the conversion */
1803 if (choiceCount == 0) {
1806 * The csm variable keeps track of which charsets are allowed
1807 * and not used yet while building the choices[].
1809 csm = (char)jpCharsetMasks[myConverterData.version];
1812 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1813 if (myConverterData.version == 3 || myConverterData.version == 4) {
1814 choices[choiceCount++] = HWKANA_7BIT;
1816 /* Do not try single-byte half-width Katakana for other versions. */
1817 csm &= ~CSM(HWKANA_7BIT);
1819 /* try the current G0 charset */
1820 choices[choiceCount++] = cs = myConverterData.fromU2022State.cs[0];
1823 /* try the current G2 charset */
1824 if ((cs = myConverterData.fromU2022State.cs[2]) != 0) {
1825 choices[choiceCount++] = cs;
1829 /* try all the other charsets */
1830 for (int i = 0; i < jpCharsetPref.length; i++) {
1831 cs = jpCharsetPref[i];
1832 if ((CSM(cs) & csm) != 0) {
1833 choices[choiceCount++] = cs;
1841 * len==0: no mapping found yet
1842 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1843 * len>0: found a roundtrip result, done
1847 * We will turn off usingFallBack after finding a fallback,
1848 * but we still get fallbacks from PUA code points as usual.
1849 * Therefore, we will also need to check that we don't overwrite
1850 * an early fallback with a later one.
1852 usingFallback = useFallback;
1854 for (int i = 0; i < choiceCount && len <= 0; i++) {
1855 int[] value = new int[1];
1857 byte cs0 = choices[i];
1860 if (sourceChar <= 0x7f) {
1861 targetValue = sourceChar;
1868 if (GR96_START <= sourceChar && sourceChar <= GR96_END) {
1869 targetValue = sourceChar - 0x80;
1876 if (sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) {
1877 if (myConverterData.version == 3) {
1878 /* JIS7: use G1 (SO) */
1879 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1880 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0x21)));
1882 myConverterData.fromU2022State.cs[1] = cs = cs0; /* do not output an escape sequence */
1884 } else if (myConverterData.version == 4) {
1885 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1886 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1887 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0xa1)));
1890 cs = myConverterData.fromU2022State.cs[0];
1891 if (IS_JP_DBCS(cs)) {
1892 /* switch from a DBCS charset to JISX201 */
1895 /* else stay in the current G0 charset */
1898 /* else do not use HWKANA_7BIT with other versions */
1903 value[0] = jisx201FromU(sourceChar);
1904 if (value[0] <= 0x7f) {
1905 targetValue = value[0];
1909 usingFallback = false;
1913 /* G0 DBCS from JIS table */
1914 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
1915 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
1916 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
1917 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
1918 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len) == 2 */
1919 value[0] = _2022FromSJIS(value[0]);
1920 if (value[0] != 0) {
1921 targetValue = value[0];
1925 usingFallback = false;
1927 } else if (len == 0 && usingFallback && sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) {
1928 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1932 usingFallback = false;
1936 /* G0 SBCS forced to 7-bit output */
1937 len2 = MBCSSingleFromUChar32(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback);
1938 if (len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value[0] && value[0] <= GR96_END) {
1939 targetValue = value[0] - 0x80;
1943 usingFallback = false;
1948 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
1949 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
1950 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
1951 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
1952 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1953 if (cs0 == KSC5601) {
1955 * Check for valid bytes for the encoding scheme.
1956 * This is necessary because the sub-converter (windows-949)
1957 * has a broader encoding scheme than is valid for 2022.
1959 value[0] = _2022FromGR94DBCS(value[0]);
1960 if (value[0] == 0) {
1964 targetValue = value[0];
1968 usingFallback = false;
1976 len = -len; /* fallback */
1980 /* write SI if necessary (only for JIS7) */
1981 if (myConverterData.fromU2022State.g == 1 && g == 0) {
1982 buffer[outLen++] = UConverterConstants.SI;
1983 myConverterData.fromU2022State.g = 0;
1986 /* write the designation sequence if necessary */
1987 if (cs != myConverterData.fromU2022State.cs[g]) {
1988 for (int i = 0; i < escSeqChars[cs].length; i++) {
1989 buffer[outLen++] = escSeqChars[cs][i];
1991 myConverterData.fromU2022State.cs[g] = cs;
1993 /* invalidate the choices[] */
1997 /* write the shift sequence if necessary */
1998 if (g != myConverterData.fromU2022State.g) {
2000 /* case 0 handled before writing escapes */
2002 buffer[outLen++] = UConverterConstants.SO;
2003 myConverterData.fromU2022State.g = 1;
2005 default : /* case 2 */
2006 buffer[outLen++] = 0x1b;
2007 buffer[outLen++] = 0x4e;
2009 /* case 3: no SS3 in ISO-2022-JP-x */
2013 /* write the output bytes */
2015 buffer[outLen++] = (byte)targetValue;
2016 } else { /* len == 2 */
2017 buffer[outLen++] = (byte)(targetValue >> 8);
2018 buffer[outLen++] = (byte)targetValue;
2022 * if we cannot find the character after checking all codepages
2023 * then this is an error.
2025 err = CoderResult.unmappableForLength(source.position()-oldSourcePos);
2026 fromUChar32 = sourceChar;
2030 if (sourceChar == CR || sourceChar == LF) {
2031 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
2032 myConverterData.fromU2022State.cs[2] = 0;
2036 /* output outLen>0 bytes in buffer[] */
2038 target.put(buffer[0]);
2039 if (offsets != null) {
2040 offsets.put(source.remaining() - 1); /* -1 known to be ASCII */
2042 } else if (outLen == 2 && (target.position() + 2) <= target.limit()) {
2043 target.put(buffer[0]);
2044 target.put(buffer[1]);
2045 if (offsets != null) {
2046 int sourceIndex = source.position() - 1;
2047 offsets.put(sourceIndex);
2048 offsets.put(sourceIndex);
2051 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, source.position()-1);
2054 err = CoderResult.OVERFLOW;
2060 * the end of the input stream and detection of truncated input
2061 * are handled by the framework, but for ISO-2022-JP conversion
2062 * we need to be in ASCII mode at the very end
2066 * in SO mode or not in ASCII mode
2067 * end of input and no truncated input
2069 if (!err.isError() &&
2070 (myConverterData.fromU2022State.g != 0 || myConverterData.fromU2022State.cs[0] != ASCII) &&
2071 flush && !source.hasRemaining() && fromUChar32 == 0) {
2076 if (myConverterData.fromU2022State.g != 0) {
2077 buffer[outLen++] = UConverterConstants.SI;
2078 myConverterData.fromU2022State.g = 0;
2081 if (myConverterData.fromU2022State.cs[0] != ASCII) {
2082 for (int i = 0; i < escSeqChars[ASCII].length; i++) {
2083 buffer[outLen++] = escSeqChars[ASCII][i];
2085 myConverterData.fromU2022State.cs[0] = ASCII;
2088 /* get the source index of the last input character */
2089 sourceIndex = source.position();
2090 if (sourceIndex > 0) {
2092 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) &&
2093 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) {
2100 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, sourceIndex);
2105 /****************************ISO-2022-CN************************************/
2107 * Rules for ISO-2022-CN Encoding:
2108 * i) The designator sequence must appear once on a line before any instance
2109 * of the character set it designates.
2110 * ii) If two lines contain characters from the same character set, both lines
2111 * must include the designator sequence.
2112 * iii) Once the designator sequence is known, a shifting sequence has to be found
2113 * to invoke the shifting
2114 * iv) All lines start in ASCII and end in ASCII.
2115 * v) Four shifting sequences are employed for this purpose:
2116 * Sequence ASCII Eq Charsets
2117 * --------- --------- --------
2119 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2120 * SS2 <ESC>N CNS-11643-1992 Plane 2
2121 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2123 * SOdesignator : ESC "$" ")" finalchar_for_SO
2124 * SS2designator : ESC "$" "*" finalchar_for_SS2
2125 * SS3designator : ESC "$" "+" finalchar_for_SS3
2127 * ESC $ ) A Indicates the bytes following SO are Chinese
2128 * characters as defined in GB 2312-80, until
2129 * another SOdesignation appears
2131 * ESC $ ) E Indicates the bytes following SO are as defined
2132 * in ISO-IR-165 (for details, see section 2.1),
2133 * until another SOdesignation appears
2135 * ESC $ ) G Indicates the bytes following SO are as defined
2136 * in CNS 11643-plane-1, until another SOdesignation appears
2138 * ESC $ * H Indicates the two bytes immediately following
2139 * SS2 is a Chinese character as defined in CNS
2140 * 11643-plane-2, until another SS2designation
2142 * (Meaning <ESC>N must precede every 2 byte sequence.)
2144 * ESC $ + I Indicates the immediate two bytes following SS3
2145 * is a Chinese character as defined in CNS
2146 * 11643-plane-3, until another SS3designation
2148 * (Meaning <ESC>O must precede every 2 byte sequence.)
2150 * ESC $ + J Indicates the immediate two bytes following SS3
2151 * is a Chinese character as defined in CNS
2152 * 11643-plane-4, until another SS3designation
2154 * (In English: <ESC>O must precede every 2 byte sequence.)
2156 * ESC $ + K Indicates the immediate two bytes following SS3
2157 * is a Chinese character as defined in CNS
2158 * 11643-plane-5, until another SS3designation
2161 * ESC $ + L Indicates the immediate two bytes following SS3
2162 * is a Chinese character as defined in CNS
2163 * 11643-plane-6, until another SS3designation
2166 * ESC $ + M Indicates the immediate two bytes following SS3
2167 * is a Chinese character as defined in CNS
2168 * 11643-plane-7, until another SS3designation
2171 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2172 * has its own designation information before any Chinese characters
// ISO-2022-CN designator escape sequences. Each array holds four bytes:
// ESC (0x1B), 0x24, an intermediate byte (0x29, 0x2A or 0x2B) and a final
// byte that selects the character set named by the constant.
2176 /* The following are defined this way to make strings truly read-only */
2177 private final static byte[] GB_2312_80_STR = { 0x1B, 0x24, 0x29, 0x41 };
2178 private final static byte[] ISO_IR_165_STR = { 0x1B, 0x24, 0x29, 0x45 };
2179 private final static byte[] CNS_11643_1992_Plane_1_STR = { 0x1B, 0x24, 0x29, 0x47 };
2180 private final static byte[] CNS_11643_1992_Plane_2_STR = { 0x1B, 0x24, 0x2A, 0x48 };
2181 private final static byte[] CNS_11643_1992_Plane_3_STR = { 0x1B, 0x24, 0x2B, 0x49 };
2182 private final static byte[] CNS_11643_1992_Plane_4_STR = { 0x1B, 0x24, 0x2B, 0x4A };
2183 private final static byte[] CNS_11643_1992_Plane_5_STR = { 0x1B, 0x24, 0x2B, 0x4B };
2184 private final static byte[] CNS_11643_1992_Plane_6_STR = { 0x1B, 0x24, 0x2B, 0x4C };
2185 private final static byte[] CNS_11643_1992_Plane_7_STR = { 0x1B, 0x24, 0x2B, 0x4D };
2187 /************************ ISO2022-CN Data *****************************/
// Table of designator sequences. The ISO-2022-CN encoder copies an entry of
// this table into its output buffer when the designated charset changes.
// NOTE(review): the entries between source lines 2188 and 2192 are not shown
// in this listing; confirm the leading entries and their order.
2188 private final static byte[][] escSeqCharsCN = {
2192 CNS_11643_1992_Plane_1_STR,
2193 CNS_11643_1992_Plane_2_STR,
2194 CNS_11643_1992_Plane_3_STR,
2195 CNS_11643_1992_Plane_4_STR,
2196 CNS_11643_1992_Plane_5_STR,
2197 CNS_11643_1992_Plane_6_STR,
2198 CNS_11643_1992_Plane_7_STR,
2201 private class CharsetEncoderISO2022CN extends CharsetEncoderICU {
// Creates an ISO-2022-CN encoder for the given charset. The second argument
// passed to the superclass constructor is fromUSubstitutionChar[0]
// (presumably the default substitution bytes; confirm in CharsetEncoderICU).
2202 public CharsetEncoderISO2022CN(CharsetICU cs) {
2203 super(cs, fromUSubstitutionChar[0]);
// Resets this encoder; the visible step resets the converter data
// (myConverterData) held by the enclosing charset.
2206 protected void implReset() {
2208 myConverterData.reset();
2211 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
// Writes the substitution (replacement) byte for ISO-2022-CN.
// If the encoder is not in ASCII mode (fromU2022State.g != 0) the state is set
// back to 0 and an SI byte is queued first. Then the first byte of
// encoder.replacement() is appended and the buffer is handed to
// fromUWriteBytes, using source.position() - 1 as the offset index.
// err receives the CoderResult produced by fromUWriteBytes.
2212 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
2213 CharBuffer source, ByteBuffer target, IntBuffer offsets){
2214 CoderResult err = CoderResult.UNDERFLOW;
2215 byte[] buffer = new byte[8];
2218 subchar = encoder.replacement();
2220 if (myConverterData.fromU2022State.g != 0) {
2221 /* not in ASCII mode: switch to ASCII */
2222 myConverterData.fromU2022State.g = 0;
2223 buffer[i++] = UConverterConstants.SI;
// only the first replacement byte is written here
2225 buffer[i++] = subchar[0];
2227 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
// Encodes UTF-16 code units from source into ISO-2022-CN bytes in target.
// Visible flow:
//  - a lead surrogate saved in fromUChar32 by a previous call is resumed first;
//  - surrogate pairs are combined into one code point, unpaired surrogates
//    produce malformedForLength(1);
//  - code points up to 0x7f are written as-is (preceded by SI when the current
//    state g is not 0), except the ISO 2022 control characters, which produce
//    malformedForLength(1); CR and LF reset fromU2022State;
//  - other code points are looked up in the candidate charsets in choices[];
//    on success the designator (if the charset changed), the shift sequence
//    (if needed) and the two output bytes are queued in buffer[];
//  - a code point found in no charset produces an unmappable result;
//  - on flush with no pending input, SHIFT_IN_STR is written when g != 0 so
//    that the output ends in ASCII mode.
// When offsets is non-null, source indexes are recorded for the output bytes.
2232 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
2233 CoderResult err = CoderResult.UNDERFLOW;
2235 byte[] buffer = new byte[8];
2237 byte[] choices = new byte[3];
2239 int targetValue = 0;
2240 boolean usingFallback;
2241 boolean gotoGetTrail = false;
2242 int oldSourcePos; // For proper error handling
2246 /* check if the last codepoint of previous buffer was a lead surrogate */
2247 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
2248 // goto getTrail label
2249 gotoGetTrail = true;
2252 while (source.hasRemaining() || gotoGetTrail) {
2253 if (target.hasRemaining() || gotoGetTrail) {
2254 oldSourcePos = source.position();
2255 if (!gotoGetTrail) {
2256 sourceChar = source.get();
2258 /* check if the char is a First surrogate */
2259 if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) {
2260 if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) {
2262 /* reset gotoGetTrail flag*/
2263 gotoGetTrail = false;
2265 /* look ahead to find the trail surrogate */
2266 if (source.hasRemaining()) {
2267 /* test the following code unit */
// peek: read the next unit, then step the position back by one
2268 char trail = source.get();
2269 source.position(source.position()-1);
2270 if (UTF16.isTrailSurrogate(trail)) {
2272 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
2274 /* convert this supplementary code point */
2275 /* exit this condition tree */
2277 /* this is an unmatched lead code unit (1st surrogate) */
2278 /* callback(illegal) */
2279 err = CoderResult.malformedForLength(1);
2280 fromUChar32 = sourceChar;
2285 fromUChar32 = sourceChar;
2289 /* this is an unmatched trail code unit (2nd surrogate) */
2290 /* callback(illegal) */
2291 err = CoderResult.malformedForLength(1);
2292 fromUChar32 = sourceChar;
2297 /* do the conversion */
2298 if (sourceChar <= 0x007f) {
2299 /* do not convert SO/SI/ESC */
2300 if (IS_2022_CONTROL(sourceChar)) {
2301 /* callback(illegal) */
2302 err = CoderResult.malformedForLength(1);
2303 fromUChar32 = sourceChar;
2308 if (myConverterData.fromU2022State.g == 0) {
2309 buffer[0] = (byte)sourceChar;
2312 buffer[0] = UConverterConstants.SI;
2313 buffer[1] = (byte)sourceChar;
2315 myConverterData.fromU2022State.g = 0;
2319 if (sourceChar == CR || sourceChar == LF) {
2320 /* reset the state at the end of a line */
2321 myConverterData.fromU2022State.reset();
2325 /* convert U+0080..U+10ffff */
2329 if (choiceCount == 0) {
2330 /* try the current SO/G1 converter first */
2331 choices[0] = myConverterData.fromU2022State.cs[1];
2333 /* default to GB2312_1 if none is designated yet */
2334 if (choices[0] == 0) {
2335 choices[0] = GB2312_1;
2337 if (myConverterData.version == 0) {
2339 /* try other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2340 if (choices[0] == GB2312_1) {
2341 choices[1] = CNS_11643_1;
2343 choices[1] = GB2312_1;
2347 } else if (myConverterData.version == 1) {
2348 /* ISO-2022-CN-EXT */
2350 /* try one of the other converters */
2351 switch (choices[0]) {
2353 choices[1] = CNS_11643_1;
2354 choices[2] = ISO_IR_165;
2357 choices[1] = GB2312_1;
2358 choices[2] = CNS_11643_1;
2361 choices[1] = GB2312_1;
2362 choices[2] = ISO_IR_165;
2368 /* ISO-2022-CN-CNS */
2369 choices[0] = CNS_11643_1;
2370 choices[1] = GB2312_1;
2378 * len==0: no mapping found yet
2379 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2380 * len>0: found a roundtrip result, done
2384 * We will turn off usingFallback after finding a fallback,
2385 * but we still get fallbacks from PUA code points as usual.
2386 * Therefore, we will also need to check that we don't overwrite
2387 * an early fallback with a later one.
2389 usingFallback = useFallback;
2391 for (i = 0; i < choiceCount && len <= 0; ++i) {
2392 byte cs0 = choices[i];
2394 int[] value = new int[1];
2396 if (cs0 > CNS_11643_0) {
2397 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[CNS_11643];
2398 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_3;
2399 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
2400 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[CNS_11643],
2401 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_3);
2402 if (len2 == 3 || (len2 == -3 && len == 0)) {
2403 targetValue = value[0];
// the byte above the two output bytes selects the CNS plane
2404 cs = (byte)(CNS_11643_0 + (value[0] >> 16) - 0x80);
2409 usingFallback = false;
2411 if (cs == CNS_11643_1) {
2413 } else if (cs == CNS_11643_2) {
2415 } else if (myConverterData.version == 1) { /* plane 3..7 */
2418 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2423 /* GB2312_1 or ISO-IR-165 */
2424 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
2425 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
2426 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
2427 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0],
2428 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
2429 if (len2 == 2 || (len2 == -2 && len == 0)) {
2430 targetValue = value[0];
2434 usingFallback = false;
2441 len = 0; /* count output bytes; it must have been abs(len) == 2 */
2443 /* write the designation sequence if necessary */
2444 if (cs != myConverterData.fromU2022State.cs[g]) {
2445 if (cs < CNS_11643) {
2446 for (int n = 0; n < escSeqCharsCN[cs].length; n++) {
2447 buffer[n] = escSeqCharsCN[cs][n];
2450 for (int n = 0; n < escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)].length; n++) {
2451 buffer[n] = escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)][n];
2455 myConverterData.fromU2022State.cs[g] = cs;
2457 /* changing the SO/G1 charset invalidates the choices[] */
2462 /* write the shift sequence if necessary */
2463 if (g != myConverterData.fromU2022State.g) {
2466 buffer[len++] = UConverterConstants.SO;
2468 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2469 myConverterData.fromU2022State.g = 1;
2472 buffer[len++] = 0x1b;
2473 buffer[len++] = 0x4e;
2475 default: /* case 3 */
2476 buffer[len++] = 0x1b;
2477 buffer[len++] = 0x4f;
2482 /* write the two output bytes */
2483 buffer[len++] = (byte)(targetValue >> 8);
2484 buffer[len++] = (byte)targetValue;
2486 /* if we cannot find the character after checking all codepages
2487 * then this is an error
2489 err = CoderResult.unmappableForLength(source.position()-oldSourcePos);
2490 fromUChar32 = sourceChar;
2494 /* output len>0 bytes in buffer[] */
2496 target.put(buffer[0]);
2497 if (offsets != null) {
2498 offsets.put(source.position()-1);
2500 } else if (len == 2 && (target.remaining() >= 2)) {
2501 target.put(buffer[0]);
2502 target.put(buffer[1]);
2503 if (offsets != null) {
// NOTE(review): this branch records source.position(), while the one-byte
// branch above and the fromUWriteBytes call below both use
// source.position()-1; confirm which index is intended.
2504 int sourceIndex = source.position();
2505 offsets.put(sourceIndex);
2506 offsets.put(sourceIndex);
2509 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, len, target, offsets, source.position()-1);
2510 if (err.isError()) {
2515 err = CoderResult.OVERFLOW;
2518 } /* end while (source.hasRemaining() */
2521 * the end of the input stream and detection of truncated input
2522 * are handled by the framework, but for ISO-2022-CN conversion
2523 * we need to be in ASCII mode at the very end
2528 * end of input and no truncated input
2530 if (!err.isError() && myConverterData.fromU2022State.g != 0 && flush && !source.hasRemaining() && fromUChar32 == 0) {
2533 /* we are switching to ASCII */
2534 myConverterData.fromU2022State.g = 0;
2536 /* get the source index of the last input character */
2537 sourceIndex = source.position();
2538 if (sourceIndex > 0) {
// NOTE(review): source.get(int) is an absolute read. With no input remaining,
// source.position() equals the limit, so sourceIndex must be decremented
// before this read; no such statement is visible here - confirm.
2540 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) &&
2541 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) {
2548 err = CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex);
2554 /******************************** ISO-2022-KR *****************************/
2556 * Rules for ISO-2022-KR encoding
2557 * i) The KSC5601 designator sequence should appear only once in a file,
2558 * at the beginning of a line before any KSC5601 characters. This usually
2559 * means that it appears by itself on the first line of the file
2560 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2561 * and SI to shift into single byte mode
2563 private class CharsetEncoderISO2022KR extends CharsetEncoderICU {
// Creates an ISO-2022-KR encoder for the given charset. The second argument
// passed to the superclass constructor is the fromUSubstitutionChar entry
// selected by myConverterData.version (presumably the default substitution
// bytes; confirm in CharsetEncoderICU).
2564 public CharsetEncoderISO2022KR(CharsetICU cs) {
2565 super(cs, fromUSubstitutionChar[myConverterData.version]);
// Resets this encoder: resets the converter data held by the enclosing charset
// and then re-applies the ISO-2022-KR initial encoder state through
// setInitialStateFromUnicodeKR(this).
2568 protected void implReset() {
2570 myConverterData.reset();
2571 setInitialStateFromUnicodeKR(this);
2574 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
// Writes the substitution (replacement) bytes for ISO-2022-KR.
// Version 0: the replacement is written by this method. For a one-byte
// replacement an SI is queued first when the encoder is in DBCS mode; for a
// two-byte replacement an SO is queued first when the encoder is in SBCS mode.
// encoder.fromUnicodeStatus is updated to the new mode and the bytes are
// passed to fromUWriteBytes with source.position() - 1 as the offset index.
// Other versions: the replacement is temporarily installed in the
// sub-encoder (myConverterData.currentEncoder), which writes it; fromUChar32
// is copied to and from the sub-encoder and, on overflow, the pending
// errorBuffer content of the sub-encoder is taken over by this encoder.
2575 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
2576 CharBuffer source, ByteBuffer target, IntBuffer offsets){
2577 CoderResult err = CoderResult.UNDERFLOW;
2578 byte[] buffer = new byte[8];
2582 subchar = encoder.replacement();
2583 length = subchar.length;
2585 if (myConverterData.version == 0) {
2587 if (encoder.fromUnicodeStatus != 0) {
2588 /* in DBCS mode: switch to SBCS */
2589 encoder.fromUnicodeStatus = 0;
2590 buffer[i++] = UConverterConstants.SI;
2592 buffer[i++] = subchar[0];
2593 } else { /* length == 2 */
2594 if (encoder.fromUnicodeStatus == 0) {
2595 /* in SBCS mode: switch to DBCS */
2596 encoder.fromUnicodeStatus = 1;
2597 buffer[i++] = UConverterConstants.SO;
2599 buffer[i++] = subchar[0];
2600 buffer[i++] = subchar[1];
2602 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
2604 /* save the subconverter's substitution string */
2605 byte[] currentSubChars = myConverterData.currentEncoder.replacement();
2607 /* set our substitution string into the subconverter */
2608 myConverterData.currentEncoder.replaceWith(subchar);
2609 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0];
2610 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
2611 myConverterData.currentEncoder.fromUChar32 = encoder.fromUChar32;
2612 err = myConverterData.currentEncoder.cbFromUWriteSub(myConverterData.currentEncoder, source, target, offsets);
2613 encoder.fromUChar32 = myConverterData.currentEncoder.fromUChar32;
2615 /* restore the subconverter's substitution string */
2616 myConverterData.currentEncoder.replaceWith(currentSubChars);
2618 if (err.isOverflow()) {
// move the bytes that did not fit from the sub-encoder to this encoder
2619 if (myConverterData.currentEncoder.errorBufferLength > 0) {
2620 encoder.errorBuffer = myConverterData.currentEncoder.errorBuffer.clone();
2622 encoder.errorBufferLength = myConverterData.currentEncoder.errorBufferLength;
2623 myConverterData.currentEncoder.errorBufferLength = 0;
// Encoding path that delegates the whole conversion to the MBCS sub-encoder
// (myConverterData.currentEncoder). fromUChar32 is handed to the sub-encoder
// before the call and read back afterwards. On overflow, the pending
// errorBuffer content of the sub-encoder is cloned into this encoder and the
// errorBufferLength of the sub-encoder is cleared.
2630 private CoderResult encodeLoopIBM(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
2631 CoderResult err = CoderResult.UNDERFLOW;
2633 myConverterData.currentEncoder.fromUChar32 = fromUChar32;
2634 err = myConverterData.currentEncoder.cnvMBCSFromUnicodeWithOffsets(source, target, offsets, flush);
2635 fromUChar32 = myConverterData.currentEncoder.fromUChar32;
2637 if (err.isOverflow()) {
2638 if (myConverterData.currentEncoder.errorBufferLength > 0) {
2639 errorBuffer = myConverterData.currentEncoder.errorBuffer.clone();
2641 errorBufferLength = myConverterData.currentEncoder.errorBufferLength;
2642 myConverterData.currentEncoder.errorBufferLength = 0;
// Encodes UTF-16 code units from source into ISO-2022-KR bytes in target.
// Visible flow:
//  - version 1 delegates to encodeLoopIBM;
//  - the SBCS/DBCS mode is loaded from fromUnicodeStatus at entry and stored
//    back at the end;
//  - the ISO 2022 control characters produce malformedForLength(1);
//  - each unit is looked up through the sub-encoder; results outside the
//    expected single-byte (up to 0x7f) or double-byte ranges are treated as
//    unassigned;
//  - SO or SI is written whenever the output switches between double-byte and
//    single-byte mode; double-byte values are written with 0x80 subtracted
//    from each byte;
//  - bytes that do not fit into target are stored in errorBuffer and the
//    result becomes OVERFLOW;
//  - unassigned input: a matched surrogate pair yields unmappableForLength(2),
//    an unpaired surrogate malformedForLength(1), any other unit
//    unmappableForLength(1);
//  - on flush with no pending input, SHIFT_IN_STR is written when the encoder
//    is still in double-byte mode.
2648 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
2649 CoderResult err = CoderResult.UNDERFLOW;
2650 int[] targetByteUnit = { 0x0000 };
2651 int sourceChar = 0x0000;
2652 boolean isTargetByteDBCS;
2653 boolean oldIsTargetByteDBCS;
2654 boolean usingFallback;
2656 boolean gotoGetTrail = false; // for goto getTrail label call
2659 * if the version is 1 then the user is requesting
2660 * conversion with ibm-25546 pass the argument to
2661 * MBCS converter and return
2663 if (myConverterData.version == 1) {
2664 return encodeLoopIBM(source, target, offsets, flush);
2667 usingFallback = useFallback;
2668 isTargetByteDBCS = fromUnicodeStatus == 0 ? false : true;
2669 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
2670 gotoGetTrail = true;
2673 while (source.hasRemaining() || gotoGetTrail) {
2674 targetByteUnit[0] = UConverterConstants.missingCharMarker;
2676 if (target.hasRemaining() || gotoGetTrail) {
2677 if (!gotoGetTrail) {
2678 sourceChar = source.get();
2680 /* do not convert SO/SI/ESC */
2681 if (IS_2022_CONTROL(sourceChar)) {
2682 /* callback(illegal) */
2683 err = CoderResult.malformedForLength(1);
2684 fromUChar32 = sourceChar;
2687 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
2688 length = myConverterData.currentEncoder.fromUChar32(sourceChar, targetByteUnit, usingFallback);
2689 //length = MBCSFromUChar32_ISO2022(myConverterData.currentConverter.sharedData, sourceChar, targetByteUnit, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
2691 length = -length; /* fallback */
2693 /* only DBCS or SBCS characters are expected */
2694 /* DB characters with high bit set to 1 are expected */
2695 if (length > 2 || length == 0 ||
2696 (length == 1 && targetByteUnit[0] > 0x7f) ||
2698 ((char)(targetByteUnit[0] - 0xa1a1) > (0xfefe - 0xa1a1) ||
2699 ((targetByteUnit[0] - 0xa1) & UConverterConstants.UNSIGNED_BYTE_MASK) > (0xfe - 0xa1)))) {
2700 targetByteUnit[0] = UConverterConstants.missingCharMarker;
2703 if (!gotoGetTrail && targetByteUnit[0] != UConverterConstants.missingCharMarker) {
2704 oldIsTargetByteDBCS = isTargetByteDBCS;
2705 isTargetByteDBCS = (targetByteUnit[0] > 0x00FF);
2706 /* append the shift sequence */
2707 if (oldIsTargetByteDBCS != isTargetByteDBCS) {
2708 if (isTargetByteDBCS) {
2709 target.put((byte)UConverterConstants.SO);
2711 target.put((byte)UConverterConstants.SI);
2713 if (offsets != null) {
2714 offsets.put(source.position()-1);
2717 /* write the targetUniChar to target */
2718 if (targetByteUnit[0] <= 0x00FF) {
2719 if (target.hasRemaining()) {
2720 target.put((byte)targetByteUnit[0]);
2721 if (offsets != null) {
2722 offsets.put(source.position()-1);
2725 errorBuffer[errorBufferLength++] = (byte)targetByteUnit[0];
2726 err = CoderResult.OVERFLOW;
2729 if (target.hasRemaining()) {
2730 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80)));
2731 if (offsets != null) {
2732 offsets.put(source.position()-1);
2734 if (target.hasRemaining()) {
2735 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80)));
2736 if (offsets != null) {
2737 offsets.put(source.position()-1);
2740 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0] - 0x80));
2741 err = CoderResult.OVERFLOW;
2745 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80));
2746 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80));
2747 err = CoderResult.OVERFLOW;
2751 /* oops.. the code point is unassigned
2752 * set the error and reason
2755 /* check if the char is a First surrogate */
2756 if (gotoGetTrail || UTF16.isSurrogate((char)sourceChar)) {
2757 if (gotoGetTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
2759 // reset gotoGetTrail flag
2760 gotoGetTrail = false;
2762 /* look ahead to find the trail surrogate */
2763 if (source.hasRemaining()) {
2764 /* test the following code unit */
// peek: read the next unit, then step the position back by one
2765 char trail = source.get();
2766 source.position(source.position()-1);
2767 if (UTF16.isTrailSurrogate(trail)) {
2769 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
2770 err = CoderResult.unmappableForLength(2);
2771 /* convert this surrogate code point */
2772 /* exit this condition tree */
2774 /* this is an unmatched lead code unit (1st surrogate) */
2775 /* callback(illegal) */
2776 err = CoderResult.malformedForLength(1);
2780 err = CoderResult.UNDERFLOW;
2783 /* this is an unmatched trail code unit (2nd surrogate ) */
2784 /* callback(illegal) */
2785 err = CoderResult.malformedForLength(1);
2788 /* callback(unassigned) for a BMP code point */
2789 err = CoderResult.unmappableForLength(1);
2792 fromUChar32 = sourceChar;
2796 err = CoderResult.OVERFLOW;
2801 * the end of the input stream and detection of truncated input
2802 * are handled by the framework, but for ISO-2022-KR conversion
2803 * we need to be inASCII mode at the very end
2808 * end of input and no truncated input
2810 if (!err.isError() && isTargetByteDBCS && flush && !source.hasRemaining() && fromUChar32 == 0) {
2813 /* we are switching to ASCII */
2814 isTargetByteDBCS = false;
2816 /* get the source index of the last input character */
2817 sourceIndex = source.position();
2818 if (sourceIndex > 0) {
// NOTE(review): source.get(int) is an absolute read. With no input remaining,
// source.position() equals the limit, so sourceIndex must be decremented
// before this read; no such statement is visible here - confirm.
2820 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) && UTF16.isLeadSurrogate(source.get(sourceIndex-1))) {
// NOTE(review): the CoderResult of this call is discarded, unlike the
// equivalent ISO-2022-CN code which assigns it to err; confirm that an
// overflow while writing the final SI cannot be lost.
2827 CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex);
2829 /*save the state and return */
2830 fromUnicodeStatus = isTargetByteDBCS ? 1 : 0;
// Creates a decoder for this charset: an ISO-2022-JP, -CN or -KR decoder.
// The KR path calls setInitialStateToUnicodeKR() before constructing the
// decoder.
// NOTE(review): the selecting switch and the result of the default branch are
// not shown here; presumably the switch is on the variant field - confirm.
2836 public CharsetDecoder newDecoder() {
2839 return new CharsetDecoderISO2022JP(this);
2842 return new CharsetDecoderISO2022CN(this);
2845 setInitialStateToUnicodeKR();
2846 return new CharsetDecoderISO2022KR(this);
2848 default: /* should not happen */
// Creates an encoder for this charset: an ISO-2022-JP, -CN or -KR encoder.
// The KR encoder is additionally passed to setInitialStateFromUnicodeKR()
// before it is handed out.
// NOTE(review): the selecting switch and the result of the default branch are
// not shown here; presumably the switch is on the variant field - confirm.
2853 public CharsetEncoder newEncoder() {
2854 CharsetEncoderICU cnv;
2858 return new CharsetEncoderISO2022JP(this);
2861 return new CharsetEncoderISO2022CN(this);
2864 cnv = new CharsetEncoderISO2022KR(this);
2865 setInitialStateFromUnicodeKR(cnv);
2868 default: /* should not happen */
// Initial decoder state for ISO-2022-KR. Only acts for version 1, where it
// zeroes toUnicodeStatus, mode and toULength of the sub-decoder
// (myConverterData.currentDecoder).
2873 private void setInitialStateToUnicodeKR() {
2874 if (myConverterData.version == 1) {
2875 myConverterData.currentDecoder.toUnicodeStatus = 0; /* offset */
2876 myConverterData.currentDecoder.mode = 0; /* state */
2877 myConverterData.currentDecoder.toULength = 0; /* byteIndex */
// Initial encoder state for ISO-2022-KR. When the errorBuffer of cnv is empty
// it is preloaded with the four bytes 0x1b 0x24 0x29 0x43 (ESC $ ) C), the
// designator sequence that appears once per file (presumably errorBuffer
// content is emitted ahead of regular output; confirm in CharsetEncoderICU).
// For version 1 the sub-encoder is also prepared: subChar1 of its charset is
// set to 0x1A, fromUChar32 is cleared and fromUnicodeStatus is set to 1.
2880 private void setInitialStateFromUnicodeKR(CharsetEncoderICU cnv) {
2881 /* ISO-2022-KR the designator sequence appears only once
2882 * in a file so we append it only once
2884 if (cnv.errorBufferLength == 0) {
2885 cnv.errorBufferLength = 4;
2886 cnv.errorBuffer[0] = 0x1b;
2887 cnv.errorBuffer[1] = 0x24;
2888 cnv.errorBuffer[2] = 0x29;
2889 cnv.errorBuffer[3] = 0x43;
2891 if (myConverterData.version == 1) {
2892 ((CharsetMBCS)myConverterData.currentEncoder.charset()).subChar1 = 0x1A;
2893 myConverterData.currentEncoder.fromUChar32 = 0;
2894 myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */
// Adds to setFillIn the code points this ISO 2022 converter handles; the
// which argument is compared against ROUNDTRIP_AND_FALLBACK_SET and passed on
// to the sub-converter calls.
// Visible steps:
//  - JP: add 0xa5 and 0x203e, then Latin-1 (0..0xff) or ASCII (0..0x7f)
//    depending on the charset mask of the version, and half-width Katakana for
//    versions 3 and 4 or when fallbacks are requested;
//  - CN: add ASCII;
//  - KR: delegate to the single sub-converter;
//  - for every non-null entry of myConverterArray, add its code points with a
//    filter chosen per variant and table;
//  - finally remove SO, SI and ESC (0x0e, 0x0f, 0x1b) and 0x80..0x9f.
// NOTE(review): the statements that select between the JP, CN and KR steps
// are not shown here; the grouping above follows the existing comments.
2898 void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
2900 /*open a set and initialize it with code points that are algorithmically round-tripped */
2904 /*include JIS X 0201 which is hardcoded */
2905 setFillIn.add(0xa5);
2906 setFillIn.add(0x203e);
2907 if((jpCharsetMasks[myConverterData.version]&CSM(ISO8859_1))!=0){
2908 /*include Latin-1 some variants of JP */
2909 setFillIn.add(0, 0xff);
2913 /* include ASCII for JP */
2914 setFillIn.add(0, 0x7f);
2916 if(myConverterData.version==3 || myConverterData.version==4 ||which == ROUNDTRIP_AND_FALLBACK_SET){
2918 * Do not test(jpCharsetMasks[myConverterData.version]&CSM(HWKANA_7BIT))!=0 because the bit
2919 * is on for all JP versions although version 3 & 4 (JIS7 and JIS8) use half-width Katakana.
2920 * This is because all ISO_2022_JP variant are lenient in that they accept (in toUnicode) half-width
2922 * However, we only emit (fromUnicode) half-width Katakana according to the
2923 * definition of each variant.
2925 * When including fallbacks,
2926 * we need to include half-width Katakana Unicode code points for all JP variants because
2927 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
2929 /* include half-width Katakana for JP */
2930 setFillIn.add(HWKANA_START, HWKANA_END);
2934 /* Include ASCII for CN */
2935 setFillIn.add(0, 0x7f);
2938 /* there is only one converter for KR */
2939 myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which);
2945 //TODO Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until
2946 for(i=0; i<UCNV_2022_MAX_CONVERTERS;i++){
2948 if(myConverterData.myConverterArray[i]!=null){
2949 if(variant==ISO_2022_CN && myConverterData.version==0 && i==CNS_11643){
2952 * version -specific for CN:
2953 * CN version 0 does not map CNS planes 3..7 although
2954 * they are all available in the CNS conversion table;
2955 * CN version 1 (-EXT) does map them all.
2956 * The two versions create different Unicode sets.
2958 filter=CharsetMBCS.UCNV_SET_FILTER_2022_CN;
2959 } else if(variant==ISO_2022_JP && i == JISX208){
2961 * Only add code points that map to Shift-JIS codes
2962 * corrosponding to JIS X 208
2964 filter=CharsetMBCS.UCNV_SET_FILTER_SJIS;
2965 } else if(i==KSC5601){
2967 * Some of the KSC 5601 tables (Convrtrs.txt has this aliases on multiple tables)
2968 * are broader than GR94.
2970 filter=CharsetMBCS.UCNV_SET_FILTER_GR94DBCS;
2972 filter=CharsetMBCS.UCNV_SET_FILTER_NONE;
2975 myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(myConverterData.myConverterArray[i],setFillIn, which, filter);
2979 * ISO Converter must not convert SO/SI/ESC despite what sub-converters do by themselves
2980 * Remove these characters from the set.
2982 setFillIn.remove(0x0e);
2983 setFillIn.remove(0x0f);
2984 setFillIn.remove(0x1b);
2986 /* ISO 2022 converter do not convert C1 controls either */
2987 setFillIn.remove(0x80, 0x9f);