2 *******************************************************************************
\r
3 * Copyright (C) 2008-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.charset;
\r
9 import java.nio.ByteBuffer;
\r
10 import java.nio.CharBuffer;
\r
11 import java.nio.IntBuffer;
\r
12 import java.nio.charset.CharsetDecoder;
\r
13 import java.nio.charset.CharsetEncoder;
\r
14 import java.nio.charset.CoderResult;
\r
15 import java.util.Arrays;
\r
17 import com.ibm.icu.charset.CharsetMBCS.CharsetDecoderMBCS;
\r
18 import com.ibm.icu.charset.CharsetMBCS.CharsetEncoderMBCS;
\r
19 import com.ibm.icu.lang.UCharacter;
\r
20 import com.ibm.icu.text.UTF16;
\r
21 import com.ibm.icu.text.UnicodeSet;
\r
23 class CharsetISO2022 extends CharsetICU {
\r
24 private UConverterDataISO2022 myConverterData;
\r
25 private int variant; // one of enum {ISO_2022_JP, ISO_2022_KR, or ISO_2022_CN}
\r
27 private static final byte[] SHIFT_IN_STR = { 0x0f };
\r
28 // private static final byte[] SHIFT_OUT_STR = { 0x0e };
\r
30 private static final byte CR = 0x0D;
\r
31 private static final byte LF = 0x0A;
\r
33 private static final byte H_TAB = 0x09;
\r
34 private static final byte SPACE = 0x20;
\r
36 private static final char HWKANA_START = 0xff61;
\r
37 private static final char HWKANA_END = 0xff9f;
\r
40 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
\r
41 * as bytes 21..7E. (Subtract 0x80.)
\r
42 * 96-character sets with native bit values A0..FF are encoded in ISO 2022
\r
43 * as bytes 20..7F. (Subtract 0x80.)
\r
44 * Do not encode C1 control codes with native bytes 80..9F
\r
45 * as bytes 00..1F (C0 control codes).
\r
48 private static final char GR94_START = 0xa1;
\r
49 private static final char GR94_END = 0xfe;
\r
51 private static final char GR96_START = 0xa0;
\r
52 private static final char GR96_END = 0xff;
\r
54 /* for ISO-2022-JP and -CN implementations */
\r
57 private static final byte INVALID_STATE = -1;
\r
58 private static final byte ASCII = 0;
\r
60 private static final byte SS2_STATE = 0x10;
\r
61 private static final byte SS3_STATE = 0x11;
\r
64 private static final byte ISO8859_1 = 1;
\r
65 private static final byte ISO8859_7 = 2;
\r
66 private static final byte JISX201 = 3;
\r
67 private static final byte JISX208 = 4;
\r
68 private static final byte JISX212 = 5;
\r
69 private static final byte GB2312 = 6;
\r
70 private static final byte KSC5601 = 7;
\r
71 private static final byte HWKANA_7BIT = 8; /* Halfwidth Katakana 7 bit */
\r
74 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
\r
75 private static final byte GB2312_1 = 1;
\r
76 private static final byte ISO_IR_165= 2;
\r
77 private static final byte CNS_11643 = 3;
\r
80 * these are used in StateEnum and ISO2022State variables,
\r
81 * but CNS_11643 must be used to index into myConverterArray[]
\r
83 private static final byte CNS_11643_0 = 0x20;
\r
84 private static final byte CNS_11643_1 = 0x21;
\r
85 private static final byte CNS_11643_2 = 0x22;
\r
86 private static final byte CNS_11643_3 = 0x23;
\r
87 private static final byte CNS_11643_4 = 0x24;
\r
88 private static final byte CNS_11643_5 = 0x25;
\r
89 private static final byte CNS_11643_6 = 0x26;
\r
90 private static final byte CNS_11643_7 = 0x27;
\r
94 public CharsetISO2022(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
\r
95 super(icuCanonicalName, javaCanonicalName, aliases);
\r
97 myConverterData = new UConverterDataISO2022();
\r
99 int versionIndex = icuCanonicalName.indexOf("version=");
\r
100 int version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
\r
102 myConverterData.version = version;
\r
104 if (icuCanonicalName.indexOf("locale=ja") > 0) {
\r
105 ISO2022InitJP(version);
\r
106 } else if (icuCanonicalName.indexOf("locale=zh") > 0) {
\r
107 ISO2022InitCN(version);
\r
108 } else /* if (icuCanonicalName.indexOf("locale=ko") > 0) */ {
\r
109 ISO2022InitKR(version);
\r
112 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder();
\r
113 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder();
\r
116 private void ISO2022InitJP(int version) {
\r
117 variant = ISO_2022_JP;
\r
119 maxBytesPerChar = 6;
\r
120 minBytesPerChar = 1;
\r
121 maxCharsPerByte = 1;
\r
122 // open the required converters and cache them
\r
123 if((jpCharsetMasks[version]&CSM(ISO8859_7)) != 0) {
\r
124 myConverterData.myConverterArray[ISO8859_7] = ((CharsetMBCS)CharsetICU.forNameICU("ISO8859_7")).sharedData;
\r
126 // myConverterData.myConverterArray[JISX201] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-201")).sharedData;
\r
127 myConverterData.myConverterArray[JISX208] = ((CharsetMBCS)CharsetICU.forNameICU("Shift-JIS")).sharedData;
\r
128 if ((jpCharsetMasks[version]&CSM(JISX212)) != 0) {
\r
129 myConverterData.myConverterArray[JISX212] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-212")).sharedData;
\r
131 if ((jpCharsetMasks[version]&CSM(GB2312)) != 0) {
\r
132 myConverterData.myConverterArray[GB2312] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData;
\r
134 if ((jpCharsetMasks[version]&CSM(KSC5601)) != 0) {
\r
135 myConverterData.myConverterArray[KSC5601] = ((CharsetMBCS)CharsetICU.forNameICU("ksc_5601")).sharedData;
\r
138 // create a generic CharsetMBCS object
\r
139 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
142 private void ISO2022InitCN(int version) {
\r
143 variant = ISO_2022_CN;
\r
145 maxBytesPerChar = 8;
\r
146 minBytesPerChar = 1;
\r
147 maxCharsPerByte = 1;
\r
148 // open the required coverters and cache them.
\r
149 myConverterData.myConverterArray[GB2312_1] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData;
\r
150 if (version == 1) {
\r
151 myConverterData.myConverterArray[ISO_IR_165] = ((CharsetMBCS)CharsetICU.forNameICU("iso-ir-165")).sharedData;
\r
153 myConverterData.myConverterArray[CNS_11643] = ((CharsetMBCS)CharsetICU.forNameICU("cns-11643-1992")).sharedData;
\r
155 // create a generic CharsetMBCS object
\r
156 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
159 private void ISO2022InitKR(int version) {
\r
160 variant = ISO_2022_KR;
\r
162 maxBytesPerChar = 3;
\r
163 minBytesPerChar = 1;
\r
164 maxCharsPerByte = 1;
\r
166 if (version == 1) {
\r
167 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
168 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0];
\r
170 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("ibm-949");
\r
173 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder();
\r
174 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder();
\r
178 * ISO 2022 control codes must not be converted from Unicode
\r
179 * because they would mess up the byte stream.
\r
180 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
\r
181 * corresponding to SO, SI, and ESC.
\r
183 private static boolean IS_2022_CONTROL(int c) {
\r
184 return (c<0x20) && (((1<<c) & 0x0800c000) != 0);
\r
188 * Check that the result is a 2-byte value with each byte in the range A1..FE
\r
189 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
\r
190 * to move it to the ISO 2022 range 21..7E.
\r
191 * return 0 if out of range.
\r
193 private static int _2022FromGR94DBCS(int value) {
\r
194 if ((value <= 0xfefe && value >= 0xa1a1) &&
\r
195 ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) {
\r
196 return (value - 0x8080); /* shift down to 21..7e byte range */
\r
198 return 0; /* not valid for ISO 2022 */
\r
/*
 * Commented out because Ticket 5691: Call sites now check for validity. They can just += 0x8080 after that.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 *
private static int _2022ToGR94DBCS(int value) {
    int returnValue = value + 0x8080;

    if ((returnValue <= 0xfefe && returnValue >= 0xa1a1) &&
            ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) {
        return returnValue;
    } else {
        return value;
    }
}*/
\r
220 /* is the StateEnum charset value for a DBCS charset? */
\r
221 private static boolean IS_JP_DBCS(byte cs) {
\r
222 return ((JISX208 <= cs) && (cs <= KSC5601));
\r
225 private static short CSM(short cs) {
\r
226 return (short)(1<<cs);
\r
229 /* This gets the valid index of the end of buffer when decoding. */
\r
230 private static int getEndOfBuffer_2022(ByteBuffer source) {
\r
231 int sourceIndex = source.position();
\r
233 mySource = source.get(sourceIndex);
\r
235 while (source.hasRemaining() && mySource != ESC_2022) {
\r
236 mySource = source.get();
\r
237 if (mySource == ESC_2022) {
\r
242 return sourceIndex;
\r
246 * This is a simple version of _MBCSGetNextUChar() calls the method in CharsetDecoderMBCS and returns
\r
250 * U+fffe unassigned
\r
252 * otherwise the Unicode code point
\r
254 private int MBCSSimpleGetNextUChar(UConverterSharedData sharedData,
\r
255 ByteBuffer source,
\r
256 boolean useFallback) {
\r
258 UConverterSharedData tempSharedData = myConverterData.currentConverter.sharedData;
\r
259 myConverterData.currentConverter.sharedData = sharedData;
\r
260 returnValue = myConverterData.currentDecoder.simpleGetNextUChar(source, useFallback);
\r
261 myConverterData.currentConverter.sharedData = tempSharedData;
\r
263 return returnValue;
\r
267 * @param is the the output byte
\r
268 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
\r
270 static int MBCSSingleFromUChar32(UConverterSharedData sharedData, int c, int[] retval, boolean useFallback) {
\r
273 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
\r
274 if (c >= 0x10000 && (sharedData.mbcs.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
\r
277 /* convert the Unicode code point in c into codepage bytes */
\r
278 table = sharedData.mbcs.fromUnicodeTable;
\r
279 /* get the byte for the output */
\r
280 value = CharsetMBCS.MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c);
\r
281 /* get the byte for the output */
\r
282 retval[0] = value & 0xff;
\r
283 if (value >= 0xf00) {
\r
284 return 1; /* roundtrip */
\r
285 } else if (useFallback ? value>=0x800 : value>=0xc00) {
\r
286 return -1; /* fallback taken */
\r
288 return 0; /* no mapping */
\r
293 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
\r
294 * to whether that charset is used in the corresponding version x of ISO_2022, locale=ja,version=x
\r
296 * Note: The converter uses some leniency:
\r
297 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
\r
298 * all versions, not just JIS7 and JIS8.
\r
299 * - ICU does not distinguish between different version so of JIS X 0208.
\r
301 private static final short jpCharsetMasks[] = {
\r
302 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)),
\r
303 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)),
\r
304 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)),
\r
305 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)),
\r
306 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7))
\r
311 private static final byte ASCII1 = 0;
\r
312 private static final byte LATIN1 = 1;
\r
313 private static final byte SBCS = 2;
\r
314 private static final byte DBCS = 3;
\r
315 private static final byte MBCS = 4;
\r
316 private static final byte HWKANA = 5;
\r
320 private class ISO2022State {
\r
321 private byte []cs; /* Charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
\r
322 private byte g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
\r
323 private byte prevG; /* g before single shift (SS2 or SS3) */
\r
330 Arrays.fill(cs, (byte)0);
\r
336 // private static final byte UCNV_OPTIONS_VERSION_MASK = 0xf;
\r
337 private static final byte UCNV_2022_MAX_CONVERTERS = 10;
\r
339 @SuppressWarnings("unused")
\r
340 private class UConverterDataISO2022 {
\r
341 UConverterSharedData []myConverterArray;
\r
342 CharsetEncoderMBCS currentEncoder;
\r
343 CharsetDecoderMBCS currentDecoder;
\r
344 CharsetMBCS currentConverter;
\r
345 int currentType; // Cnv2022Type;
\r
346 ISO2022State toU2022State;
\r
347 ISO2022State fromU2022State;
\r
350 boolean isEmptySegment;
\r
352 UConverterDataISO2022() {
\r
353 myConverterArray = new UConverterSharedData[UCNV_2022_MAX_CONVERTERS];
\r
354 toU2022State = new ISO2022State();
\r
355 fromU2022State = new ISO2022State();
\r
359 isEmptySegment = false;
\r
363 toU2022State.reset();
\r
364 fromU2022State.reset();
\r
365 isEmptySegment = false;
\r
369 private static final byte ESC_2022 = 0x1B; /* ESC */
\r
372 private static final byte INVALID_2022 = -1; /* Doesn't correspond to a valid iso 2022 escape sequence */
\r
373 private static final byte VALID_NON_TERMINAL_2022 = 0; /* so far corresponds to a valid iso 2022 escape sequence */
\r
374 private static final byte VALID_TERMINAL_2022 = 1; /* corresponds to a valid iso 2022 escape sequence */
\r
375 private static final byte VALID_MAYBE_TERMINAL_2022 = 2; /* so far matches one iso 2022 escape sequence, but by adding
\r
376 more characters might match another escape sequence */
\r
377 // } UCNV_TableStates_2022;
\r
380 * The way these state transition arrays work is:
\r
381 * ex : ESC$B is the sequence for JISX208
\r
382 * a) First Iteration: char is ESC
\r
383 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
\r
384 * int x = normalize_esq_chars_2022[27] which is equal to 1
\r
385 * ii) Search for this value in escSeqStateTable_Key_2022[]
\r
386 * value of x is stored at escSeqStateTable_Key_2022[0]
\r
387 * iii) Save this index as offset
\r
388 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
\r
389 * escSeqStateTable_value_2022[offset], which is VALID_NON_TERMINAL_2022
\r
390 * b) Switch on this state and continue to next char
\r
391 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
\r
392 * which is normalize_esq_chars_2022[36] == 4
\r
393 * ii) x is currently 1(from above)
\r
394 * x<<=5 -- x is now 32
\r
395 * x+=normalize_esq_chars_2022[36]
\r
397 * iii) Search for this value in escSeqStateTable_Key_2022[]
\r
398 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
\r
399 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
\r
400 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
\r
401 * c) Switch on this state and continue to next char
\r
402 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
\r
403 * ii) x is currently 36 (from above)
\r
404 * x<<=5 -- x is now 1152
\r
405 * x+= normalize_esq_chars_2022[66]
\r
407 * iii) Search for this value in escSeqStateTable_Key_2022[]
\r
408 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
\r
409 * iv) Get state of this sequence from escSeqStateTable_Value_2022[1]
\r
410 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
\r
411 * v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
\r
413 /* Below are the 3 arrays depicting a state transition table */
\r
414 private static final byte normalize_esq_chars_2022[] = {
\r
415 /* 0 1 2 3 4 5 6 7 8 9 */
\r
416 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
417 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
418 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
\r
419 0, 0, 0, 0, 0, 0, 4, 7, 29, 0,
\r
420 2, 24, 26, 27, 0, 3, 23, 6, 0, 0,
\r
421 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
422 0, 0, 0, 0, 5, 8, 9, 10, 11, 12,
\r
423 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
\r
424 0, 0, 21, 0, 0, 0, 0, 0, 0, 0,
\r
425 22, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
434 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
438 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
439 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
441 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
445 private static final short MAX_STATES_2022 = 74;
\r
446 private static final int escSeqStateTable_Key_2022[/* MAX_STATES_2022 */] = {
\r
447 /* 0 1 2 3 4 5 6 7 8 9 */
\r
448 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
\r
449 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
\r
450 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
\r
451 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
\r
452 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
\r
453 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
\r
454 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
\r
455 35947631, 35947635, 35947636, 35947638
\r
458 private static final byte escSeqStateTable_Value_2022[/* MAX_STATES_2022 */] = {
\r
460 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022,
\r
461 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
462 VALID_MAYBE_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
463 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
464 VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
465 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
\r
466 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
\r
467 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
468 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
469 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
470 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
471 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
472 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
\r
473 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
474 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022
\r
477 /* Type def for refactoring changeState_2022 code */
\r
479 private static final byte ISO_2022_JP = 1;
\r
480 private static final byte ISO_2022_KR = 2;
\r
481 private static final byte ISO_2022_CN = 3;
\r
484 /* const UConverterSharedData _ISO2022Data; */
\r
485 //private UConverterSharedData _ISO2022JPData;
\r
486 //private UConverterSharedData _ISO2022KRData;
\r
487 //private UConverterSharedData _ISO2022CNData;
\r
489 /******************** to unicode ********************/
\r
490 /****************************************************
\r
491 * Recognized escape sequences are
\r
493 * <ESC>.A ISO-8859-1
\r
494 * <ESC>.F ISO-8859-7
\r
499 * <ESC>$(D JISX-212
\r
503 private final static byte nextStateToUnicodeJP[/* MAX_STATES_2022 */] = {
\r
504 /* 0 1 2 3 4 5 6 7 8 9 */
\r
505 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
506 ASCII, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, JISX201, HWKANA_7BIT, JISX201, INVALID_STATE,
\r
507 INVALID_STATE, INVALID_STATE, JISX208, GB2312, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
508 ISO8859_1, ISO8859_7, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, KSC5601, JISX212, INVALID_STATE,
\r
509 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
510 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
511 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
512 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
\r
515 private final static byte nextStateToUnicodeCN[/* MAX_STATES_2022 */] = {
\r
516 /* 0 1 2 3 4 5 6 7 8 9 */
\r
517 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, SS3_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
518 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
519 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
520 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
521 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, GB2312_1, INVALID_STATE, ISO_IR_165,
\r
522 CNS_11643_1, CNS_11643_2, CNS_11643_3, CNS_11643_4, CNS_11643_5, CNS_11643_6, CNS_11643_7, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
523 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
524 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
\r
527 /* runs through a state machine to determine the escape sequence - codepage correspondence */
\r
528 @SuppressWarnings("fallthrough")
\r
529 private CoderResult changeState_2022(CharsetDecoderICU decoder, ByteBuffer source, int var) {
\r
530 CoderResult err = CoderResult.UNDERFLOW;
\r
531 boolean DONE = false;
\r
533 int key[] = {myConverterData.key};
\r
534 int offset[] = {0};
\r
535 int initialToULength = decoder.toULength;
\r
537 int malformLength = 0;
\r
539 value = VALID_NON_TERMINAL_2022;
\r
540 while (source.hasRemaining()) {
\r
543 decoder.toUBytesArray[decoder.toULength++] = c;
\r
544 value = getKey_2022(c, key, offset);
\r
548 case VALID_NON_TERMINAL_2022:
\r
549 /* continue with the loop */
\r
552 case VALID_TERMINAL_2022:
\r
561 case VALID_MAYBE_TERMINAL_2022:
\r
562 /* not ISO_2022 itself, finish here */
\r
563 value = VALID_TERMINAL_2022;
\r
573 myConverterData.key = key[0];
\r
575 if (value == VALID_NON_TERMINAL_2022) {
\r
576 /* indicate that the escape sequence is incomplete: key !=0 */
\r
578 } else if (value == INVALID_2022) {
\r
579 err = CoderResult.malformedForLength(malformLength);
\r
580 } else /* value == VALID_TERMINAL_2022 */ {
\r
582 case ISO_2022_JP: {
\r
583 byte tempState = nextStateToUnicodeJP[offset[0]];
\r
584 switch (tempState) {
\r
585 case INVALID_STATE:
\r
586 err = CoderResult.malformedForLength(malformLength);
\r
589 if (myConverterData.toU2022State.cs[2] != 0) {
\r
590 if (myConverterData.toU2022State.g < 2) {
\r
591 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
\r
593 myConverterData.toU2022State.g = 2;
\r
595 /* illegal to have SS2 before a matching designator */
\r
596 err = CoderResult.malformedForLength(malformLength);
\r
599 /* case SS3_STATE: not used in ISO-2022-JP-x */
\r
602 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) {
\r
603 err = CoderResult.unmappableForLength(malformLength);
\r
605 /* G2 charset for SS2 */
\r
606 myConverterData.toU2022State.cs[2] = tempState;
\r
610 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) {
\r
611 err = CoderResult.unmappableForLength(source.position() - 1);
\r
614 myConverterData.toU2022State.cs[0] = tempState;
\r
620 case ISO_2022_CN: {
\r
621 byte tempState = nextStateToUnicodeCN[offset[0]];
\r
622 switch (tempState) {
\r
623 case INVALID_STATE:
\r
624 err = CoderResult.unmappableForLength(malformLength);
\r
627 if (myConverterData.toU2022State.cs[2] != 0) {
\r
628 if (myConverterData.toU2022State.g < 2) {
\r
629 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
\r
631 myConverterData.toU2022State.g = 2;
\r
633 /* illegal to have SS2 before a matching designator */
\r
634 err = CoderResult.malformedForLength(malformLength);
\r
638 if (myConverterData.toU2022State.cs[3] != 0) {
\r
639 if (myConverterData.toU2022State.g < 2) {
\r
640 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
\r
642 myConverterData.toU2022State.g = 3;
\r
644 /* illegal to have SS3 before a matching designator */
\r
645 err = CoderResult.malformedForLength(malformLength);
\r
649 if (myConverterData.version == 0) {
\r
650 err = CoderResult.unmappableForLength(malformLength);
\r
657 myConverterData.toU2022State.cs[1] = tempState;
\r
660 myConverterData.toU2022State.cs[2] = tempState;
\r
663 /* other CNS 11643 planes */
\r
664 if (myConverterData.version == 0) {
\r
665 err = CoderResult.unmappableForLength(source.position() - 1);
\r
667 myConverterData.toU2022State.cs[3] = tempState;
\r
674 if (offset[0] == 0x30) {
\r
675 /* nothing to be done, just accept this one escape sequence */
\r
677 err = CoderResult.unmappableForLength(malformLength);
\r
681 err = CoderResult.malformedForLength(malformLength);
\r
685 if (!err.isError()) {
\r
686 decoder.toULength = 0;
\r
687 } else if (err.isMalformed()) {
\r
688 if (decoder.toULength > 1) {
\r
690 * Ticket 5691: consistent illegal sequences:
\r
691 * - We include at least the first byte (ESC) in the illegal sequence.
\r
692 * - If any of the non-initial bytes could be the start of a character,
\r
693 * we stop the illegal sequece before the first one of those.
\r
694 * In escape sequences, all following bytes are "printable", that is,
\r
695 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
\r
696 * they are valid single/lead bytes.
\r
697 * For simplicity, we always only report the initial ESC byte as the
\r
698 * illegal sequence and back out all other bytes we looked at.
\r
700 /* Back out some bytes. */
\r
701 int backOutDistance = decoder.toULength - 1;
\r
702 int bytesFromThisBuffer = decoder.toULength - initialToULength;
\r
703 if (backOutDistance <= bytesFromThisBuffer) {
\r
704 /* same as initialToULength<=1 */
\r
705 source.position(source.position() - backOutDistance);
\r
707 /* Back out bytes from the previous buffer: Need to replay them. */
\r
708 decoder.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
\r
709 /* same as -(initalToULength-1) */
\r
710 /* preToULength is negative! */
\r
711 for (int i = 0; i < -(decoder.preToULength); i++) {
\r
712 decoder.preToUArray[i] = decoder.toUBytesArray[i+1];
\r
714 source.position(source.position() - bytesFromThisBuffer);
\r
716 decoder.toULength = 1;
\r
723 private static byte getKey_2022(byte c, int[]key, int[]offset) {
\r
726 int hi = MAX_STATES_2022;
\r
729 togo = normalize_esq_chars_2022[(short)c&UConverterConstants.UNSIGNED_BYTE_MASK];
\r
732 /* not a valid character anywhere in an escape sequence */
\r
735 return INVALID_2022;
\r
737 togo = (key[0] << 5) + togo;
\r
739 while (hi != low) { /* binary search */
\r
740 int mid = (hi+low) >> 1; /* Finds median */
\r
742 if (mid == oldmid) {
\r
746 if (escSeqStateTable_Key_2022[mid] > togo) {
\r
748 } else if (escSeqStateTable_Key_2022[mid] < togo) {
\r
750 } else /* we found it */ {
\r
753 return escSeqStateTable_Value_2022[mid];
\r
757 return INVALID_2022;
\r
761 * To Unicode Callback helper function
\r
763 private static CoderResult toUnicodeCallback(CharsetDecoderICU cnv, int sourceChar, int targetUniChar) {
\r
764 CoderResult err = CoderResult.UNDERFLOW;
\r
765 if (sourceChar > 0xff) {
\r
766 cnv.toUBytesArray[0] = (byte)(sourceChar>>8);
\r
767 cnv.toUBytesArray[1] = (byte)sourceChar;
\r
770 cnv.toUBytesArray[0] = (byte)sourceChar;
\r
774 if (targetUniChar == (UConverterConstants.missingCharMarker-1/* 0xfffe */)) {
\r
775 err = CoderResult.unmappableForLength(1);
\r
777 err = CoderResult.malformedForLength(1);
\r
783 /****************************ISO-2022-JP************************************/
\r
784 private class CharsetDecoderISO2022JP extends CharsetDecoderICU {
\r
/**
 * Creates a decoder for the given ISO-2022-JP charset.
 *
 * @param cs the charset this decoder belongs to
 */
public CharsetDecoderISO2022JP(CharsetICU cs) {
    super(cs);
}
\r
789 protected void implReset() {
\r
791 myConverterData.reset();
\r
794 * Map 00..7F to Unicode according to JIS X 0201.
\r
796 private int jisx201ToU(int value) {
\r
797 if (value < 0x5c) {
\r
799 } else if (value == 0x5c) {
\r
801 } else if (value == 0x7e) {
\r
803 } else { /* value <= 0x7f */
\r
808 * Convert a pair of JIS X 208 21..7E bytes to Shift-JIS.
\r
809 * If either byte is outside 21..7E make sure that the result is not valid
\r
810 * for Shift-JIS so that the converter catches it.
\r
811 * Some invalid byte values already turn into equally invalid Shift-JIS
\r
812 * byte values and need not be tested explicitly.
\r
814 private void _2022ToSJIS(char c1, char c2, byte []bytes) {
\r
819 } else if (c2 <= 0x7e) {
\r
822 c2 = 0; /* invalid */
\r
825 if ((c2 >= 0x21) && (c2 <= 0x7e)) {
\r
828 c2 = 0; /* invalid */
\r
835 } else if (c1 <= 0x3f) {
\r
838 c1 = 0; /* invalid */
\r
840 bytes[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c1);
\r
841 bytes[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c2);
\r
844 @SuppressWarnings("fallthrough")
\r
845 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
846 boolean gotoGetTrail = false;
\r
847 boolean gotoEscape = false;
\r
848 CoderResult err = CoderResult.UNDERFLOW;
\r
849 byte []tempBuf = new byte[2];
\r
850 int targetUniChar = 0x0000;
\r
851 int mySourceChar = 0x0000;
\r
852 int mySourceCharTemp = 0x0000; // use for getTrail label call.
\r
853 byte cs; /* StateEnum */
\r
854 byte csTemp= 0; // use for getTrail label call.
\r
856 if (myConverterData.key != 0) {
\r
857 /* continue with a partial escape sequence */
\r
860 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
\r
861 /* continue with a partial double-byte character */
\r
862 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
864 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
\r
865 // goto getTrailByte;
\r
866 mySourceCharTemp = 0x99;
\r
867 gotoGetTrail = true;
\r
870 while (source.hasRemaining() || gotoEscape || gotoGetTrail) {
\r
871 // This code is here for the goto escape label call above.
\r
873 mySourceCharTemp = ESC_2022;
\r
876 targetUniChar = UConverterConstants.missingCharMarker;
\r
878 if (gotoEscape || gotoGetTrail || target.hasRemaining()) {
\r
879 if (!gotoEscape && !gotoGetTrail) {
\r
880 mySourceChar = source.get() & UConverterConstants.UNSIGNED_BYTE_MASK;
\r
881 mySourceCharTemp = mySourceChar;
\r
884 switch (mySourceCharTemp) {
\r
885 case UConverterConstants.SI:
\r
886 if (myConverterData.version == 3) {
\r
887 myConverterData.toU2022State.g = 0;
\r
890 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
\r
891 myConverterData.isEmptySegment = false;
\r
895 case UConverterConstants.SO:
\r
896 if (myConverterData.version == 3) {
\r
897 /* JIS7: switch to G1 half-width Katakana */
\r
898 myConverterData.toU2022State.cs[1] = HWKANA_7BIT;
\r
899 myConverterData.toU2022State.g = 1;
\r
902 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
\r
903 myConverterData.isEmptySegment = false; /* reset this, we have a different error */
\r
909 source.position(source.position() - 1);
\r
911 gotoEscape = false;
\r
915 int mySourceBefore = source.position();
\r
916 int toULengthBefore = this.toULength;
\r
918 err = changeState_2022(this, source, variant);
\r
920 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
\r
921 if(myConverterData.version == 0 && myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) {
\r
922 err = CoderResult.malformedForLength(source.position() - mySourceBefore);
\r
923 this.toULength = toULengthBefore + (source.position() - mySourceBefore);
\r
927 /* invalid or illegal escape sequence */
\r
929 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */
\r
932 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
\r
933 if(myConverterData.key == 0) {
\r
934 myConverterData.isEmptySegment = true;
\r
938 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
\r
940 /* falls through */
\r
942 /* automatically reset to single-byte mode */
\r
943 if (myConverterData.toU2022State.cs[0] != ASCII && myConverterData.toU2022State.cs[0] != JISX201) {
\r
944 myConverterData.toU2022State.cs[0] = ASCII;
\r
946 myConverterData.toU2022State.cs[2] = 0;
\r
947 myConverterData.toU2022State.g = 0;
\r
948 /* falls through */
\r
950 /* convert one or two bytes */
\r
951 myConverterData.isEmptySegment = false;
\r
952 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
\r
954 if (gotoGetTrail) {
\r
955 csTemp = (byte)0x99;
\r
957 if (!gotoGetTrail && ((mySourceChar >= 0xa1) && (mySourceChar <= 0xdf) && myConverterData.version == 4 && !IS_JP_DBCS(cs))) {
\r
958 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
\r
959 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
\r
961 /* return from a single-shift state to the previous one */
\r
962 if (myConverterData.toU2022State.g >= 2) {
\r
963 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
968 if (mySourceChar <= 0x7f) {
\r
969 targetUniChar = mySourceChar;
\r
973 if (mySourceChar <= 0x7f) {
\r
974 targetUniChar = mySourceChar + 0x80;
\r
976 /* return from a single-shift state to the previous one */
\r
977 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
980 if (mySourceChar <= 0x7f) {
\r
981 /* convert mySourceChar+0x80 to use a normal 8-bit table */
\r
982 targetUniChar = CharsetMBCS.MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myConverterData.myConverterArray[cs].mbcs,
\r
983 mySourceChar+0x80);
\r
985 /* return from a single-shift state to the previous one */
\r
986 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
989 if (mySourceChar <= 0x7f) {
\r
990 targetUniChar = jisx201ToU(mySourceChar);
\r
994 if ((mySourceChar >= 0x21) && (mySourceChar <= 0x5f)) {
\r
995 /* 7-bit halfwidth Katakana */
\r
996 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
\r
1001 if (gotoGetTrail || source.hasRemaining()) {
\r
1003 int tmpSourceChar;
\r
1004 gotoGetTrail = false;
\r
1006 boolean leadIsOk, trailIsOk;
\r
1008 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1010 * Ticket 5691: consistent illegal sequences:
\r
1011 * - We include at least the first byte in the illegal sequence.
\r
1012 * - If any of the non-initial bytes could be the start of a character,
\r
1013 * we stop the illegal sequence before the first one of those.
\r
1015 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
\r
1016 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
\r
1017 * Otherwise we convert or report the pair of bytes.
\r
1019 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
1020 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
\r
1021 if (leadIsOk && trailIsOk) {
\r
1023 tmpSourceChar = (mySourceChar << 8) | trailByte;
\r
1024 if (cs == JISX208) {
\r
1025 _2022ToSJIS((char)mySourceChar, (char)trailByte, tempBuf);
\r
1026 mySourceChar = tmpSourceChar;
\r
1028 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
\r
1029 mySourceChar = tmpSourceChar;
\r
1030 if (cs == KSC5601) {
\r
1031 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
\r
1033 tempBuf[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (tmpSourceChar >> 8));
\r
1034 tempBuf[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & tmpSourceChar);
\r
1036 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.myConverterArray[cs], ByteBuffer.wrap(tempBuf), false);
\r
1037 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
\r
1038 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
1040 /* add another bit so that the code below writes 2 bytes in case of error */
\r
1041 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
\r
1044 toUBytesArray[0] = (byte)mySourceChar;
\r
1049 } /* end of inner switch */
\r
1052 } /* end of outer switch */
\r
1054 if (targetUniChar < (UConverterConstants.missingCharMarker-1/*0xfffe*/)) {
\r
1055 if (offsets != null) {
\r
1056 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
\r
1058 target.put((char)targetUniChar);
\r
1059 } else if (targetUniChar > UConverterConstants.missingCharMarker) {
\r
1060 /* disassemble the surrogate pair and write to output */
\r
1061 targetUniChar -= 0x0010000;
\r
1062 target.put((char)(0xd800 + (char)(targetUniChar>>10)));
\r
1063 target.position(target.position()-1);
\r
1064 if (offsets != null) {
\r
1065 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
\r
1068 if (target.hasRemaining()) {
\r
1069 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff)));
\r
1070 target.position(target.position()-1);
\r
1071 if (offsets != null) {
\r
1072 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
\r
1076 charErrorBufferArray[charErrorBufferLength++] =
\r
1077 (char)(0xdc00+(char)(targetUniChar&0x3ff));
\r
1080 /* Call the callback function */
\r
1081 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
\r
1084 } else { /* goes with "if (target.hasRemaining())" way up near the top of the function */
\r
1085 err = CoderResult.OVERFLOW;
\r
1092 } // end of class CharsetDecoderISO2022JP
\r
1094 /****************************ISO-2022-CN************************************/
\r
1095 private class CharsetDecoderISO2022CN extends CharsetDecoderICU {
\r
1096 public CharsetDecoderISO2022CN(CharsetICU cs) {
\r
1100 protected void implReset() {
\r
1101 super.implReset();
\r
1102 myConverterData.reset();
\r
1105 @SuppressWarnings("fallthrough")
\r
1106 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1107 CoderResult err = CoderResult.UNDERFLOW;
\r
1108 byte[] tempBuf = new byte[3];
\r
1109 int targetUniChar = 0x0000;
\r
1110 int mySourceChar = 0x0000;
\r
1111 int mySourceCharTemp = 0x0000;
\r
1112 boolean gotoEscape = false;
\r
1113 boolean gotoGetTrailByte = false;
\r
1115 if (myConverterData.key != 0) {
\r
1116 /* continue with a partial escape sequence */
\r
1118 gotoEscape = true;
\r
1119 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
\r
1120 /* continue with a partial double-byte character */
\r
1121 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1123 targetUniChar = UConverterConstants.missingCharMarker;
\r
1124 // goto getTrailByte
\r
1125 gotoGetTrailByte = true;
\r
1128 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
\r
1129 targetUniChar = UConverterConstants.missingCharMarker;
\r
1131 if (target.hasRemaining() || gotoEscape) {
\r
1133 mySourceChar = ESC_2022; // goto escape label
\r
1134 mySourceCharTemp = mySourceChar;
\r
1135 } else if (gotoGetTrailByte) {
\r
1136 mySourceCharTemp = 0xff; // goto getTrailByte; set mySourceCharTemp to go to default
\r
1138 mySourceChar = UConverterConstants.UNSIGNED_BYTE_MASK & source.get();
\r
1139 mySourceCharTemp = mySourceChar;
\r
1142 switch (mySourceCharTemp) {
\r
1143 case UConverterConstants.SI:
\r
1144 myConverterData.toU2022State.g = 0;
\r
1145 if (myConverterData.isEmptySegment) {
\r
1146 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
\r
1147 err = CoderResult.malformedForLength(1);
\r
1148 this.toUBytesArray[0] = (byte)mySourceChar;
\r
1149 this.toULength = 1;
\r
1154 case UConverterConstants.SO:
\r
1155 if (myConverterData.toU2022State.cs[1] != 0) {
\r
1156 myConverterData.toU2022State.g = 1;
\r
1157 myConverterData.isEmptySegment = true; /* Begin a new segment, empty so far */
\r
1160 /* illegal to have SO before a matching designator */
\r
1161 myConverterData.isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
\r
1166 if (!gotoEscape) {
\r
1167 source.position(source.position()-1);
\r
1170 gotoEscape = false;
\r
1172 int mySourceBefore = source.position();
\r
1173 int toULengthBefore = this.toULength;
\r
1175 err = changeState_2022(this, source, ISO_2022_CN);
\r
1177 /* After SO there must be at least one character before a designator (designator error handled separately) */
\r
1178 if(myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) {
\r
1179 err = CoderResult.malformedForLength(source.position() - mySourceBefore);
\r
1180 this.toULength = toULengthBefore + (source.position() - mySourceBefore);
\r
1184 /* invalid or illegal escape sequence */
\r
1185 if(err.isError()){
\r
1186 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */
\r
1191 /*ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
\r
1193 /* falls through */
\r
1195 myConverterData.toU2022State.reset();
\r
1196 /* falls through */
\r
1198 /* convert one or two bytes */
\r
1199 myConverterData.isEmptySegment = false;
\r
1200 if (myConverterData.toU2022State.g != 0 || gotoGetTrailByte) {
\r
1201 if (source.hasRemaining() || gotoGetTrailByte) {
\r
1202 UConverterSharedData cnv;
\r
1205 boolean leadIsOk, trailIsOk;
\r
1207 // getTrailByte: label
\r
1208 gotoGetTrailByte = false; // reset gotoGetTrailByte
\r
1210 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1212 * Ticket 5691: consistent illegal sequences:
\r
1213 * - We include at least the first byte in the illegal sequence.
\r
1214 * - If any of the non-initial bytes could be the start of a character,
\r
1215 * we stop the illegal sequence before the first one of those.
\r
1217 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
\r
1218 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
\r
1219 * Otherwise we convert or report the pair of bytes.
\r
1221 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
1222 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
\r
1223 if (leadIsOk && trailIsOk) {
\r
1225 tempState = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
\r
1226 if (tempState > CNS_11643_0) {
\r
1227 cnv = myConverterData.myConverterArray[CNS_11643];
\r
1228 tempBuf[0] = (byte)(0x80 + (tempState - CNS_11643_0));
\r
1229 tempBuf[1] = (byte)mySourceChar;
\r
1230 tempBuf[2] = (byte)trailByte;
\r
1233 cnv = myConverterData.myConverterArray[tempState];
\r
1234 tempBuf[0] = (byte)mySourceChar;
\r
1235 tempBuf[1] = (byte)trailByte;
\r
1238 ByteBuffer tempBuffer = ByteBuffer.wrap(tempBuf);
\r
1239 tempBuffer.limit(tempBufLen);
\r
1240 targetUniChar = MBCSSimpleGetNextUChar(cnv, tempBuffer, false);
\r
1241 mySourceChar = (mySourceChar << 8) | trailByte;
\r
1243 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
\r
1244 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
1246 /* add another bit so that the code below writes 2 bytes in case of error */
\r
1247 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
\r
1249 if (myConverterData.toU2022State.g >= 2) {
\r
1250 /* return from a single-shift state to the previous one */
\r
1251 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
1254 toUBytesArray[0] = (byte)mySourceChar;
\r
1260 if (mySourceChar <= 0x7f) {
\r
1261 targetUniChar = (char)mySourceChar;
\r
1266 if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) < (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker-1))) {
\r
1267 if (offsets != null) {
\r
1268 offsets.array()[target.position()] = source.remaining() - (mySourceChar <= 0xff ? 1 : 2);
\r
1270 target.put((char)targetUniChar);
\r
1271 } else if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) > (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker))) {
\r
1272 /* disassemble the surrogate pair and write to output */
\r
1273 targetUniChar -= 0x0010000;
\r
1274 target.put((char)(0xd800+(char)(targetUniChar>>10)));
\r
1275 if (offsets != null) {
\r
1276 offsets.array()[target.position()-1] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
\r
1278 if (target.hasRemaining()) {
\r
1279 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff)));
\r
1280 if (offsets != null) {
\r
1281 offsets.array()[target.position()-1] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
\r
1284 charErrorBufferArray[charErrorBufferLength++] = (char)(0xdc00+(char)(targetUniChar&0x3ff));
\r
1287 /* Call the callback function */
\r
1288 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
\r
1293 err = CoderResult.OVERFLOW;
\r
1302 /************************ ISO-2022-KR ********************/
\r
1303 private class CharsetDecoderISO2022KR extends CharsetDecoderICU {
\r
1304 public CharsetDecoderISO2022KR(CharsetICU cs) {
\r
1308 protected void implReset() {
\r
1309 super.implReset();
\r
1310 setInitialStateToUnicodeKR();
\r
1311 myConverterData.reset();
\r
1314 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1315 CoderResult err = CoderResult.UNDERFLOW;
\r
1316 int mySourceChar = 0x0000;
\r
1317 int targetUniChar = 0x0000;
\r
1318 byte[] tempBuf = new byte[2];
\r
1319 boolean usingFallback;
\r
1320 boolean gotoGetTrailByte = false;
\r
1321 boolean gotoEscape = false;
\r
1323 if (myConverterData.version == 1) {
\r
1324 return decodeLoopIBM(myConverterData.currentDecoder, source, target, offsets, flush);
\r
1327 /* initialize state */
\r
1328 usingFallback = isFallbackUsed();
\r
1330 if (myConverterData.key != 0) {
\r
1331 /* continue with a partial escape sequence */
\r
1332 gotoEscape = true;
\r
1333 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
\r
1334 /* continue with a partial double-byte character */
\r
1335 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1337 gotoGetTrailByte = true;
\r
1340 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
\r
1341 if (target.hasRemaining() || gotoGetTrailByte || gotoEscape) {
\r
1342 if (!gotoGetTrailByte && !gotoEscape) {
\r
1343 mySourceChar = (char)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1346 if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SI) {
\r
1347 myConverterData.toU2022State.g = 0;
\r
1348 if (myConverterData.isEmptySegment) {
\r
1349 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
\r
1350 err = CoderResult.malformedForLength(1);
\r
1351 this.toUBytesArray[0] = (byte)mySourceChar;
\r
1352 this.toULength = 1;
\r
1355 /* consume the source */
\r
1357 } else if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SO) {
\r
1358 myConverterData.toU2022State.g = 1;
\r
1359 myConverterData.isEmptySegment = true;
\r
1360 /* consume the source */
\r
1362 } else if (!gotoGetTrailByte && (gotoEscape || mySourceChar == ESC_2022)) {
\r
1363 if (!gotoEscape) {
\r
1364 source.position(source.position()-1);
\r
1367 gotoEscape = false; // reset gotoEscape flag
\r
1368 myConverterData.isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
\r
1369 err = changeState_2022(this, source, ISO_2022_KR);
\r
1370 if (err.isError()) {
\r
1375 myConverterData.isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
\r
1376 if (myConverterData.toU2022State.g == 1 || gotoGetTrailByte) {
\r
1377 if (source.hasRemaining() || gotoGetTrailByte) {
\r
1378 boolean leadIsOk, trailIsOk;
\r
1380 // getTrailByte label
\r
1381 gotoGetTrailByte = false; // reset gotoGetTrailByte flag
\r
1383 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1384 targetUniChar = UConverterConstants.missingCharMarker;
\r
1386 * Ticket 5691: consistent illegal sequences:
\r
1387 * - We include at least the first byte in the illegal sequence.
\r
1388 * - If any of the non-initial bytes could be the start of a character,
\r
1389 * we stop the illegal sequence before the first one of those.
\r
1391 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
\r
1392 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
\r
1393 * Otherwise we convert or report the pair of bytes.
\r
1395 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
1396 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
\r
1397 if (leadIsOk && trailIsOk) {
\r
1399 tempBuf[0] = (byte)(mySourceChar + 0x80);
\r
1400 tempBuf[1] = (byte)(trailByte + 0x80);
\r
1401 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, ByteBuffer.wrap(tempBuf), usingFallback);
\r
1402 mySourceChar = (char)((mySourceChar << 8) | trailByte);
\r
1403 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
\r
1404 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
1406 /* add another bit so that the code below writes 2 bytes in case of error */
\r
1407 mySourceChar = (char)(0x10000 | (mySourceChar << 8) | trailByte);
\r
1410 toUBytesArray[0] = (byte)mySourceChar;
\r
1414 } else if (mySourceChar <= 0x7f) {
\r
1415 int savedSourceLimit = source.limit();
\r
1416 int savedSourcePosition = source.position();
\r
1417 source.limit(source.position());
\r
1418 source.position(source.position()-1);
\r
1419 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, source, usingFallback);
\r
1420 source.limit(savedSourceLimit);
\r
1421 source.position(savedSourcePosition);
\r
1423 targetUniChar = 0xffff;
\r
1425 if (targetUniChar < 0xfffe) {
\r
1426 target.put((char)targetUniChar);
\r
1427 if (offsets != null) {
\r
1428 offsets.array()[target.position()] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
\r
1431 /* Call the callback function */
\r
1432 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
\r
1436 err = CoderResult.OVERFLOW;
\r
1444 protected CoderResult decodeLoopIBM(CharsetDecoderMBCS cnv, ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1445 CoderResult err = CoderResult.UNDERFLOW;
\r
1450 boolean gotoEscape = false;
\r
1451 int oldSourceLimit;
\r
1453 /* remember the original start of the input for offsets */
\r
1454 sourceStart = argSource = source.position();
\r
1456 if (myConverterData.key != 0) {
\r
1457 /* continue with a partial escape sequence */
\r
1458 gotoEscape = true;
\r
1461 while (gotoEscape || (!err.isError() && source.hasRemaining())) {
\r
1462 if (!gotoEscape) {
\r
1463 /* Find the end of the buffer e.g : Next Escape Seq | end of Buffer */
\r
1464 int oldSourcePos = source.position();
\r
1465 sourceLimit = getEndOfBuffer_2022(source);
\r
1466 source.position(oldSourcePos);
\r
1467 if (source.position() != sourceLimit) {
\r
1469 * get the current partial byte sequence
\r
1471 * it needs to be moved between the public and the subconverter
\r
1472 * so that the conversion framework, which only sees the public
\r
1473 * converter, can handle truncated and illegal input etc.
\r
1475 if (toULength > 0) {
\r
1476 cnv.toUBytesArray = toUBytesArray.clone();
\r
1478 cnv.toULength = toULength;
\r
1481 * Convert up to the end of the input, or to before the next escape character.
\r
1482 * Does not handle conversion extensions because the preToU[] state etc.
\r
1485 argTarget = target.position();
\r
1486 oldSourceLimit = source.limit(); // save the old source limit change to new one
\r
1487 source.limit(sourceLimit);
\r
1488 err = myConverterData.currentDecoder.cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush);
\r
1489 source.limit(oldSourceLimit); // restore source limit;
\r
1490 if (offsets != null && sourceStart != argSource) {
\r
1491 /* update offsets to base them on the actual start of the input */
\r
1492 int delta = argSource - sourceStart;
\r
1493 while (argTarget < target.position()) {
\r
1494 int currentOffset = offsets.get();
\r
1495 offsets.position(offsets.position()-1);
\r
1496 if (currentOffset >= 0) {
\r
1497 offsets.put(currentOffset + delta);
\r
1498 offsets.position(offsets.position()-1);
\r
1504 argSource = source.position();
\r
1506 /* copy input/error/overflow buffers */
\r
1507 if (cnv.toULength > 0) {
\r
1508 toUBytesArray = cnv.toUBytesArray.clone();
\r
1510 toULength = cnv.toULength;
\r
1512 if (err.isOverflow()) {
\r
1513 if (cnv.charErrorBufferLength > 0) {
\r
1514 charErrorBufferArray = cnv.charErrorBufferArray.clone();
\r
1516 charErrorBufferLength = cnv.charErrorBufferLength;
\r
1517 cnv.charErrorBufferLength = 0;
\r
1521 if (err.isError() || err.isOverflow() || (source.position() == source.limit())) {
\r
1526 gotoEscape = false;
\r
1527 err = changeState_2022(this, source, ISO_2022_KR);
\r
1533 /******************** from unicode **********************/
\r
1534 /* preference order of JP charsets */
\r
1535 private final static byte []jpCharsetPref = {
\r
1547 * The escape sequences must be in order of the enum constants like JISX201 = 3,
\r
1548 * not in order of jpCharsetPref[]!
\r
1550 private final static byte [][]escSeqChars = {
\r
1551 { 0x1B, 0x28, 0x42}, /* <ESC>(B ASCII */
\r
1552 { 0x1B, 0x2E, 0x41}, /* <ESC>.A ISO-8859-1 */
\r
1553 { 0x1B, 0x2E, 0x46}, /* <ESC>.F ISO-8859-7 */
\r
1554 { 0x1B, 0x28, 0x4A}, /* <ESC>(J JISX-201 */
\r
1555 { 0x1B, 0x24, 0x42}, /* <ESC>$B JISX-208 */
\r
1556 { 0x1B, 0x24, 0x28, 0x44}, /* <ESC>$(D JISX-212 */
\r
1557 { 0x1B, 0x24, 0x41}, /* <ESC>$A GB2312 */
\r
1558 { 0x1B, 0x24, 0x28, 0x43}, /* <ESC>$(C KSC5601 */
\r
1559 { 0x1B, 0x28, 0x49} /* <ESC>(I HWKANA_7BIT */
\r
1562 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
\r
1564 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
\r
1565 * because Shift-JIS roundtrips half-width Katakana to single bytes.
\r
1566 * These were the only fallbacks in ICU's jisx-208.ucm file.
\r
1568 private final static char []hwkana_fb = {
\r
1569 0x2123, /* U+FF61 */
\r
1584 0x213C, /* U+FF70 */
\r
1600 0x253F, /* U+FF80 */
\r
1616 0x255F, /* U+FF90 */
\r
1631 0x212C /* U+FF9F */
\r
1634 protected byte [][]fromUSubstitutionChar = new byte[][]{ { (byte)0x1A }, { (byte)0x2F, (byte)0x7E} };
\r
1635 /****************************ISO-2022-JP************************************/
\r
1636 private class CharsetEncoderISO2022JP extends CharsetEncoderICU {
\r
1637 public CharsetEncoderISO2022JP(CharsetICU cs) {
\r
1638 super(cs, fromUSubstitutionChar[0]);
\r
1641 protected void implReset() {
\r
1642 super.implReset();
\r
1643 myConverterData.reset();
\r
1645 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
\r
1646 private int jisx201FromU(int value) {
\r
1647 if (value <= 0x7f) {
\r
1648 if (value != 0x5c && value != 0x7e) {
\r
1651 } else if (value == 0xa5) {
\r
1653 } else if (value == 0x203e) {
\r
1656 return (int)(UConverterConstants.UNSIGNED_INT_MASK & 0xfffe);
\r
1660 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
\r
1661 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
\r
1662 * Return 0 if the byte pair is out of range.
\r
1664 private int _2022FromSJIS(int value) {
\r
1667 if (value > 0xEFFC) {
\r
1668 return 0; /* beyond JIS X 0208 */
\r
1671 trail = (short)(value & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1673 value &= 0xff00; /* lead byte */
\r
1674 if (value <= 0x9f00) {
\r
1676 } else { /* 0xe000 <= value <= 0xef00 */
\r
1682 if (trail <= 0x9e) {
\r
1684 if (trail <= 0x7e) {
\r
1685 value |= ((trail - 0x1f) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1687 value |= ((trail - 0x20) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1689 } else { /* trail <= 0xfc */
\r
1690 value |= ((trail - 0x7e) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1695 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
\r
1696 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
\r
1697 CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
1698 CoderResult err = CoderResult.UNDERFLOW;
\r
1699 byte[] buffer = new byte[8];
\r
1702 subchar = encoder.replacement();
\r
1705 if (myConverterData.fromU2022State.g == 1) {
\r
1706 /* JIS7: switch from G1 to G0 */
\r
1707 myConverterData.fromU2022State.g = 0;
\r
1708 buffer[i++] = UConverterConstants.SI;
\r
1710 cs = myConverterData.fromU2022State.cs[0];
\r
1712 if (cs != ASCII && cs != JISX201) {
\r
1713 /* not in ASCII or JIS X 0201: switch to ASCII */
\r
1714 myConverterData.fromU2022State.cs[0] = ASCII;
\r
1715 buffer[i++] = 0x1B;
\r
1716 buffer[i++] = 0x28;
\r
1717 buffer[i++] = 0x42;
\r
1720 buffer[i++] = subchar[0];
\r
1722 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
\r
1727 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
1728 CoderResult err = CoderResult.UNDERFLOW;
\r
1733 byte[] choices = new byte[10];
\r
1734 int targetValue = 0;
\r
1735 boolean usingFallback;
\r
1736 byte[] buffer = new byte[8];
\r
1737 boolean getTrail = false; // use for getTrail label
\r
1738 int oldSourcePos; // for proper error handling
\r
1742 /* check if the last codepoint of previous buffer was a lead surrogate */
\r
1743 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
\r
1747 while (getTrail || source.hasRemaining()) {
\r
1748 if (getTrail || target.hasRemaining()) {
\r
1749 oldSourcePos = source.position();
\r
1750 if (!getTrail) { /* skip if going to getTrail label */
\r
1751 sourceChar = source.get();
\r
1753 /* check if the char is a First surrogate */
\r
1754 if (getTrail || UTF16.isSurrogate((char)sourceChar)) {
\r
1755 if (getTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
\r
1760 /* look ahead to find the trail surrogate */
\r
1761 if (source.hasRemaining()) {
\r
1762 /* test the following code unit */
\r
1763 char trail = source.get();
\r
1764 /* go back to the previous position */
\r
1765 source.position(source.position()-1);
\r
1766 if (UTF16.isTrailSurrogate(trail)) {
\r
1768 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
\r
1769 fromUChar32 = 0x00;
\r
1770 /* convert this supplementary code point */
\r
1771 /* exit this condition tree */
\r
1773 /* this is an unmatched lead code unit (1st surrogate) */
\r
1774 /* callback(illegal) */
\r
1775 err = CoderResult.malformedForLength(1);
\r
1776 fromUChar32 = sourceChar;
\r
1780 /* no more input */
\r
1781 fromUChar32 = sourceChar;
\r
1785 /* this is an unmatched trail code unit (2nd surrogate) */
\r
1786 /* callback(illegal) */
\r
1787 err = CoderResult.malformedForLength(1);
\r
1788 fromUChar32 = sourceChar;
\r
1793 /* do not convert SO/SI/ESC */
\r
1794 if (IS_2022_CONTROL(sourceChar)) {
\r
1795 /* callback(illegal) */
\r
1796 err = CoderResult.malformedForLength(1);
\r
1797 fromUChar32 = sourceChar;
\r
1801 /* do the conversion */
\r
1803 if (choiceCount == 0) {
\r
1806 * The csm variable keeps track of which charsets are allowed
\r
1807 * and not used yet while building the choices[].
\r
1809 csm = (char)jpCharsetMasks[myConverterData.version];
\r
1812 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
\r
1813 if (myConverterData.version == 3 || myConverterData.version == 4) {
\r
1814 choices[choiceCount++] = HWKANA_7BIT;
\r
1816 /* Do not try single-byte half-width Katakana for other versions. */
\r
1817 csm &= ~CSM(HWKANA_7BIT);
\r
1819 /* try the current G0 charset */
\r
1820 choices[choiceCount++] = cs = myConverterData.fromU2022State.cs[0];
\r
1823 /* try the current G2 charset */
\r
1824 if ((cs = myConverterData.fromU2022State.cs[2]) != 0) {
\r
1825 choices[choiceCount++] = cs;
\r
1829 /* try all the other charsets */
\r
1830 for (int i = 0; i < jpCharsetPref.length; i++) {
\r
1831 cs = jpCharsetPref[i];
\r
1832 if ((CSM(cs) & csm) != 0) {
\r
1833 choices[choiceCount++] = cs;
\r
1841 * len==0: no mapping found yet
\r
1842 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
\r
1843 * len>0: found a roundtrip result, done
\r
1847 * We will turn off usingFallBack after finding a fallback,
\r
1848 * but we still get fallbacks from PUA code points as usual.
\r
1849 * Therefore, we will also need to check that we don't overwrite
\r
1850 * an early fallback with a later one.
\r
1852 usingFallback = useFallback;
\r
1854 for (int i = 0; i < choiceCount && len <= 0; i++) {
\r
1855 int[] value = new int[1];
\r
1857 byte cs0 = choices[i];
\r
1860 if (sourceChar <= 0x7f) {
\r
1861 targetValue = sourceChar;
\r
1868 if (GR96_START <= sourceChar && sourceChar <= GR96_END) {
\r
1869 targetValue = sourceChar - 0x80;
\r
1876 if (sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) {
\r
1877 if (myConverterData.version == 3) {
\r
1878 /* JIS7: use G1 (SO) */
\r
1879 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
\r
1880 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0x21)));
\r
1882 myConverterData.fromU2022State.cs[1] = cs = cs0; /* do not output an escape sequence */
\r
1884 } else if (myConverterData.version == 4) {
\r
1885 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
\r
1886 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
\r
1887 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0xa1)));
\r
1890 cs = myConverterData.fromU2022State.cs[0];
\r
1891 if (IS_JP_DBCS(cs)) {
\r
1892 /* switch from a DBCS charset to JISX201 */
\r
1895 /* else stay in the current G0 charset */
\r
1898 /* else do not use HWKANA_7BIT with other versions */
\r
1903 value[0] = jisx201FromU(sourceChar);
\r
1904 if (value[0] <= 0x7f) {
\r
1905 targetValue = value[0];
\r
1909 usingFallback = false;
\r
1913 /* G0 DBCS from JIS table */
\r
1914 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
\r
1915 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
1916 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
1917 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
1918 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len) == 2 */
\r
1919 value[0] = _2022FromSJIS(value[0]);
\r
1920 if (value[0] != 0) {
\r
1921 targetValue = value[0];
\r
1925 usingFallback = false;
\r
1927 } else if (len == 0 && usingFallback && sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) {
\r
1928 targetValue = hwkana_fb[sourceChar - HWKANA_START];
\r
1932 usingFallback = false;
\r
1936 /* G0 SBCS forced to 7-bit output */
\r
1937 len2 = MBCSSingleFromUChar32(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback);
\r
1938 if (len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value[0] && value[0] <= GR96_END) {
\r
1939 targetValue = value[0] - 0x80;
\r
1943 usingFallback = false;
\r
1948 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
\r
1949 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
1950 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
1951 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
1952 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
\r
1953 if (cs0 == KSC5601) {
\r
1955 * Check for valid bytes for the encoding scheme.
\r
1956 * This is necessary because the sub-converter (windows-949)
\r
1957 * has a broader encoding scheme than is valid for 2022.
\r
1959 value[0] = _2022FromGR94DBCS(value[0]);
\r
1960 if (value[0] == 0) {
\r
1964 targetValue = value[0];
\r
1968 usingFallback = false;
\r
1976 len = -len; /* fallback */
\r
1980 /* write SI if necessary (only for JIS7) */
\r
1981 if (myConverterData.fromU2022State.g == 1 && g == 0) {
\r
1982 buffer[outLen++] = UConverterConstants.SI;
\r
1983 myConverterData.fromU2022State.g = 0;
\r
1986 /* write the designation sequence if necessary */
\r
1987 if (cs != myConverterData.fromU2022State.cs[g]) {
\r
1988 for (int i = 0; i < escSeqChars[cs].length; i++) {
\r
1989 buffer[outLen++] = escSeqChars[cs][i];
\r
1991 myConverterData.fromU2022State.cs[g] = cs;
\r
1993 /* invalidate the choices[] */
\r
1997 /* write the shift sequence if necessary */
\r
1998 if (g != myConverterData.fromU2022State.g) {
\r
2000 /* case 0 handled before writing escapes */
\r
2002 buffer[outLen++] = UConverterConstants.SO;
\r
2003 myConverterData.fromU2022State.g = 1;
\r
2005 default : /* case 2 */
\r
2006 buffer[outLen++] = 0x1b;
\r
2007 buffer[outLen++] = 0x4e;
\r
2009 /* case 3: no SS3 in ISO-2022-JP-x */
\r
2013 /* write the output bytes */
\r
2015 buffer[outLen++] = (byte)targetValue;
\r
2016 } else { /* len == 2 */
\r
2017 buffer[outLen++] = (byte)(targetValue >> 8);
\r
2018 buffer[outLen++] = (byte)targetValue;
\r
2022 * if we cannot find the character after checking all codepages
\r
2023 * then this is an error.
\r
2025 err = CoderResult.unmappableForLength(source.position()-oldSourcePos);
\r
2026 fromUChar32 = sourceChar;
\r
2030 if (sourceChar == CR || sourceChar == LF) {
\r
2031 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
\r
2032 myConverterData.fromU2022State.cs[2] = 0;
\r
2036 /* output outLen>0 bytes in buffer[] */
\r
2037 if (outLen == 1) {
\r
2038 target.put(buffer[0]);
\r
2039 if (offsets != null) {
\r
2040 offsets.put(source.remaining() - 1); /* -1 known to be ASCII */
\r
2042 } else if (outLen == 2 && (target.position() + 2) <= target.limit()) {
\r
2043 target.put(buffer[0]);
\r
2044 target.put(buffer[1]);
\r
2045 if (offsets != null) {
\r
2046 int sourceIndex = source.position() - 1;
\r
2047 offsets.put(sourceIndex);
\r
2048 offsets.put(sourceIndex);
\r
2051 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, source.position()-1);
\r
2054 err = CoderResult.OVERFLOW;
\r
2060 * the end of the input stream and detection of truncated input
\r
2061 * are handled by the framework, but for ISO-2022-JP conversion
\r
2062 * we need to be in ASCII mode at the very end
\r
2066 * in SO mode or not in ASCII mode
\r
2067 * end of input and no truncated input
\r
2069 if (!err.isError() &&
\r
2070 (myConverterData.fromU2022State.g != 0 || myConverterData.fromU2022State.cs[0] != ASCII) &&
\r
2071 flush && !source.hasRemaining() && fromUChar32 == 0) {
\r
2076 if (myConverterData.fromU2022State.g != 0) {
\r
2077 buffer[outLen++] = UConverterConstants.SI;
\r
2078 myConverterData.fromU2022State.g = 0;
\r
2081 if (myConverterData.fromU2022State.cs[0] != ASCII) {
\r
2082 for (int i = 0; i < escSeqChars[ASCII].length; i++) {
\r
2083 buffer[outLen++] = escSeqChars[ASCII][i];
\r
2085 myConverterData.fromU2022State.cs[0] = ASCII;
\r
2088 /* get the source index of the last input character */
\r
2089 sourceIndex = source.position();
\r
2090 if (sourceIndex > 0) {
\r
2092 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) &&
\r
2093 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) {
\r
2100 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, sourceIndex);
\r
2105 /****************************ISO-2022-CN************************************/
\r
2107 * Rules for ISO-2022-CN Encoding:
\r
2108 * i) The designator sequence must appear once on a line before any instance
\r
2109 * of character set it designates.
\r
2110 * ii) If two lines contain characters from the same character set, both lines
\r
2111 * must include the designator sequence.
\r
2112 * iii) Once the designator sequence is known, a shifting sequence has to be found
\r
2113 * to invoke the shifting
\r
2114 * iv) All lines start in ASCII and end in ASCII.
\r
2115 * v) Four shifting sequences are employed for this purpose:
\r
2116 * Sequence ASCII Eq Charsets
\r
2117 * --------- --------- --------
\r
2118 * SI <SI> US-ASCII
\r
2119 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
\r
2120 * SS2 <ESC>N CNS-11643-1992 Plane 2
\r
2121 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
\r
2123 * SOdesignator : ESC "$" ")" finalchar_for_SO
\r
2124 * SS2designator : ESC "$" "*" finalchar_for_SS2
\r
2125 * SS3designator : ESC "$" "+" finalchar_for_SS3
\r
2127 * ESC $ ) A Indicates the bytes following SO are Chinese
\r
2128 * characters as defined in GB 2312-80, until
\r
2129 * another SOdesignation appears
\r
2131 * ESC $ ) E Indicates the bytes following SO are as defined
\r
2132 * in ISO-IR-165 (for details, see section 2.1),
\r
2133 * until another SOdesignation appears
\r
2135 * ESC $ ) G Indicates the bytes following SO are as defined
\r
2136 * in CNS 11643-plane-1, until another SOdesignation appears
\r
2138 * ESC $ * H Indicates the two bytes immediately following
\r
2139 * SS2 is a Chinese character as defined in CNS
\r
2140 * 11643-plane-2, until another SS2designation
\r
2142 * (Meaning <ESC>N must precede every 2 byte sequence.)
\r
2144 * ESC $ + I Indicates the immediate two bytes following SS3
\r
2145 * is a Chinese character as defined in CNS
\r
2146 * 11643-plane-3, until another SS3designation
\r
2148 * (Meaning <ESC>O must precede every 2 byte sequence.)
\r
2150 * ESC $ + J Indicates the immediate two bytes following SS3
\r
2151 * is a Chinese character as defined in CNS
\r
2152 * 11643-plane-4, until another SS3designation
\r
2154 * (In English: <ESC>O must precede every 2 byte sequence.)
\r
2156 * ESC $ + K Indicates the immediate two bytes following SS3
\r
2157 * is a Chinese character as defined in CNS
\r
2158 * 11643-plane-5, until another SS3designation
\r
2161 * ESC $ + L Indicates the immediate two bytes following SS3
\r
2162 * is a Chinese character as defined in CNS
\r
2163 * 11643-plane-6, until another SS3designation
\r
2166 * ESC $ + M Indicates the immediate two bytes following SS3
\r
2167 * is a Chinese character as defined in CNS
\r
2168 * 11643-plane-7, until another SS3designation
\r
2171 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
\r
2172 * has its own designation information before any Chinese characters
\r
2176 /* The following are defined this way to make strings truly read-only */
\r
2177 private final static byte[] GB_2312_80_STR = { 0x1B, 0x24, 0x29, 0x41 };
\r
2178 private final static byte[] ISO_IR_165_STR = { 0x1B, 0x24, 0x29, 0x45 };
\r
2179 private final static byte[] CNS_11643_1992_Plane_1_STR = { 0x1B, 0x24, 0x29, 0x47 };
\r
2180 private final static byte[] CNS_11643_1992_Plane_2_STR = { 0x1B, 0x24, 0x2A, 0x48 };
\r
2181 private final static byte[] CNS_11643_1992_Plane_3_STR = { 0x1B, 0x24, 0x2B, 0x49 };
\r
2182 private final static byte[] CNS_11643_1992_Plane_4_STR = { 0x1B, 0x24, 0x2B, 0x4A };
\r
2183 private final static byte[] CNS_11643_1992_Plane_5_STR = { 0x1B, 0x24, 0x2B, 0x4B };
\r
2184 private final static byte[] CNS_11643_1992_Plane_6_STR = { 0x1B, 0x24, 0x2B, 0x4C };
\r
2185 private final static byte[] CNS_11643_1992_Plane_7_STR = { 0x1B, 0x24, 0x2B, 0x4D };
\r
2187 /************************ ISO2022-CN Data *****************************/
\r
2188 private final static byte[][] escSeqCharsCN = {
\r
2192 CNS_11643_1992_Plane_1_STR,
\r
2193 CNS_11643_1992_Plane_2_STR,
\r
2194 CNS_11643_1992_Plane_3_STR,
\r
2195 CNS_11643_1992_Plane_4_STR,
\r
2196 CNS_11643_1992_Plane_5_STR,
\r
2197 CNS_11643_1992_Plane_6_STR,
\r
2198 CNS_11643_1992_Plane_7_STR,
\r
2201 private class CharsetEncoderISO2022CN extends CharsetEncoderICU {
\r
2202 public CharsetEncoderISO2022CN(CharsetICU cs) {
\r
2203 super(cs, fromUSubstitutionChar[0]);
\r
2206 protected void implReset() {
\r
2207 super.implReset();
\r
2208 myConverterData.reset();
\r
2211 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
\r
2212 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
\r
2213 CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
2214 CoderResult err = CoderResult.UNDERFLOW;
\r
2215 byte[] buffer = new byte[8];
\r
2218 subchar = encoder.replacement();
\r
2220 if (myConverterData.fromU2022State.g != 0) {
\r
2221 /* not in ASCII mode: switch to ASCII */
\r
2222 myConverterData.fromU2022State.g = 0;
\r
2223 buffer[i++] = UConverterConstants.SI;
\r
2225 buffer[i++] = subchar[0];
\r
2227 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
\r
2232 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2233 CoderResult err = CoderResult.UNDERFLOW;
\r
2235 byte[] buffer = new byte[8];
\r
2237 byte[] choices = new byte[3];
\r
2239 int targetValue = 0;
\r
2240 boolean usingFallback;
\r
2241 boolean gotoGetTrail = false;
\r
2242 int oldSourcePos; // For proper error handling
\r
2246 /* check if the last codepoint of previous buffer was a lead surrogate */
\r
2247 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
\r
2248 // goto getTrail label
\r
2249 gotoGetTrail = true;
\r
2252 while (source.hasRemaining() || gotoGetTrail) {
\r
2253 if (target.hasRemaining() || gotoGetTrail) {
\r
2254 oldSourcePos = source.position();
\r
2255 if (!gotoGetTrail) {
\r
2256 sourceChar = source.get();
\r
2258 /* check if the char is a First surrogate */
\r
2259 if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) {
\r
2260 if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) {
\r
2262 /* reset gotoGetTrail flag*/
\r
2263 gotoGetTrail = false;
\r
2265 /* look ahead to find the trail surrogate */
\r
2266 if (source.hasRemaining()) {
\r
2267 /* test the following code unit */
\r
2268 char trail = source.get();
\r
2269 source.position(source.position()-1);
\r
2270 if (UTF16.isTrailSurrogate(trail)) {
\r
2272 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
\r
2273 fromUChar32 = 0x00;
\r
2274 /* convert this supplementary code point */
\r
2275 /* exit this condition tree */
\r
2277 /* this is an unmatched lead code unit (1st surrogate) */
\r
2278 /* callback(illegal) */
\r
2279 err = CoderResult.malformedForLength(1);
\r
2280 fromUChar32 = sourceChar;
\r
2284 /* no more input */
\r
2285 fromUChar32 = sourceChar;
\r
2289 /* this is an unmatched trail code unit (2nd surrogate) */
\r
2290 /* callback(illegal) */
\r
2291 err = CoderResult.malformedForLength(1);
\r
2292 fromUChar32 = sourceChar;
\r
2297 /* do the conversion */
\r
2298 if (sourceChar <= 0x007f) {
\r
2299 /* do not converter SO/SI/ESC */
\r
2300 if (IS_2022_CONTROL(sourceChar)) {
\r
2301 /* callback(illegal) */
\r
2302 err = CoderResult.malformedForLength(1);
\r
2303 fromUChar32 = sourceChar;
\r
2308 if (myConverterData.fromU2022State.g == 0) {
\r
2309 buffer[0] = (byte)sourceChar;
\r
2312 buffer[0] = UConverterConstants.SI;
\r
2313 buffer[1] = (byte)sourceChar;
\r
2315 myConverterData.fromU2022State.g = 0;
\r
2319 if (sourceChar == CR || sourceChar == LF) {
\r
2320 /* reset the state at the end of a line */
\r
2321 myConverterData.fromU2022State.reset();
\r
2325 /* convert U+0080..U+10ffff */
\r
2329 if (choiceCount == 0) {
\r
2330 /* try the current SO/G1 converter first */
\r
2331 choices[0] = myConverterData.fromU2022State.cs[1];
\r
2333 /* default to GB2312_1 if none is designated yet */
\r
2334 if (choices[0] == 0) {
\r
2335 choices[0] = GB2312_1;
\r
2337 if (myConverterData.version == 0) {
\r
2339 /* try other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
\r
2340 if (choices[0] == GB2312_1) {
\r
2341 choices[1] = CNS_11643_1;
\r
2343 choices[1] = GB2312_1;
\r
2348 /* ISO-2022-CN-EXT */
\r
2350 /* try one of the other converters */
\r
2351 switch (choices[0]) {
\r
2353 choices[1] = CNS_11643_1;
\r
2354 choices[2] = ISO_IR_165;
\r
2357 choices[1] = GB2312_1;
\r
2358 choices[2] = CNS_11643_1;
\r
2361 choices[1] = GB2312_1;
\r
2362 choices[2] = ISO_IR_165;
\r
2372 * len==0: no mapping found yet
\r
2373 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
\r
2374 * len>0: found a roundtrip result, done
\r
2378 * We will turn off usingFallback after finding a fallback,
\r
2379 * but we still get fallbacks from PUA code points as usual.
\r
2380 * Therefore, we will also need to check that we don't overwrite
\r
2381 * an early fallback with a later one.
\r
2383 usingFallback = useFallback;
\r
2385 for (i = 0; i < choiceCount && len <= 0; ++i) {
\r
2386 byte cs0 = choices[i];
\r
2388 int[] value = new int[1];
\r
2390 if (cs0 > CNS_11643_0) {
\r
2391 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[CNS_11643];
\r
2392 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_3;
\r
2393 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
2394 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[CNS_11643],
\r
2395 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_3);
\r
2396 if (len2 == 3 || (len2 == -3 && len == 0)) {
\r
2397 targetValue = value[0];
\r
2398 cs = (byte)(CNS_11643_0 + (value[0] >> 16) - 0x80);
\r
2403 usingFallback = false;
\r
2405 if (cs == CNS_11643_1) {
\r
2407 } else if (cs == CNS_11643_2) {
\r
2409 } else if (myConverterData.version == 1) { /* plane 3..7 */
\r
2412 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
\r
2417 /* GB2312_1 or ISO-IR-165 */
\r
2418 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
\r
2419 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
2420 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
2421 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0],
\r
2422 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
2423 if (len2 == 2 || (len2 == -2 && len == 0)) {
\r
2424 targetValue = value[0];
\r
2428 usingFallback = false;
\r
2435 len = 0; /* count output bytes; it must have ben abs(len) == 2 */
\r
2437 /* write the designation sequence if necessary */
\r
2438 if (cs != myConverterData.fromU2022State.cs[g]) {
\r
2439 if (cs < CNS_11643) {
\r
2440 for (int n = 0; n < escSeqCharsCN[cs].length; n++) {
\r
2441 buffer[n] = escSeqCharsCN[cs][n];
\r
2444 for (int n = 0; n < escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)].length; n++) {
\r
2445 buffer[n] = escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)][n];
\r
2449 myConverterData.fromU2022State.cs[g] = cs;
\r
2451 /* changing the SO/G1 charset invalidates the choices[] */
\r
2456 /* write the shift sequence if necessary */
\r
2457 if (g != myConverterData.fromU2022State.g) {
\r
2460 buffer[len++] = UConverterConstants.SO;
\r
2462 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
\r
2463 myConverterData.fromU2022State.g = 1;
\r
2466 buffer[len++] = 0x1b;
\r
2467 buffer[len++] = 0x4e;
\r
2469 default: /* case 3 */
\r
2470 buffer[len++] = 0x1b;
\r
2471 buffer[len++] = 0x4f;
\r
2476 /* write the two output bytes */
\r
2477 buffer[len++] = (byte)(targetValue >> 8);
\r
2478 buffer[len++] = (byte)targetValue;
\r
2480 /* if we cannot find the character after checking all codepages
\r
2481 * then this is an error
\r
2483 err = CoderResult.unmappableForLength(source.position()-oldSourcePos);
\r
2484 fromUChar32 = sourceChar;
\r
2488 /* output len>0 bytes in buffer[] */
\r
2490 target.put(buffer[0]);
\r
2491 if (offsets != null) {
\r
2492 offsets.put(source.position()-1);
\r
2494 } else if (len == 2 && (target.remaining() >= 2)) {
\r
2495 target.put(buffer[0]);
\r
2496 target.put(buffer[1]);
\r
2497 if (offsets != null) {
\r
2498 int sourceIndex = source.position();
\r
2499 offsets.put(sourceIndex);
\r
2500 offsets.put(sourceIndex);
\r
2503 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, len, target, offsets, source.position()-1);
\r
2504 if (err.isError()) {
\r
2509 err = CoderResult.OVERFLOW;
\r
2512 } /* end while (source.hasRemaining() */
\r
2515 * the end of the input stream and detection of truncated input
\r
2516 * are handled by the framework, but for ISO-2022-CN conversion
\r
2517 * we need to be in ASCII mode at the very end
\r
2521 * not in ASCII mode
\r
2522 * end of input and no truncated input
\r
2524 if (!err.isError() && myConverterData.fromU2022State.g != 0 && flush && !source.hasRemaining() && fromUChar32 == 0) {
\r
2527 /* we are switching to ASCII */
\r
2528 myConverterData.fromU2022State.g = 0;
\r
2530 /* get the source index of the last input character */
\r
2531 sourceIndex = source.position();
\r
2532 if (sourceIndex > 0) {
\r
2534 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) &&
\r
2535 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) {
\r
2542 err = CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex);
\r
2548 /******************************** ISO-2022-KR *****************************/
\r
2550 * Rules for ISO-2022-KR encoding
\r
2551 * i) The KSC5601 designator sequence should appear only once in a file,
\r
2552 * at the beginning of a line before any KSC5601 characters. This usually
\r
2553 * means that it appears by itself on the first line of the file
\r
2554 * ii) There are only 2 shifting sequences SO to shift into double byte mode
\r
2555 * and SI to shift into single byte mode
\r
2557 private class CharsetEncoderISO2022KR extends CharsetEncoderICU {
\r
2558 public CharsetEncoderISO2022KR(CharsetICU cs) {
\r
2559 super(cs, fromUSubstitutionChar[myConverterData.version]);
\r
2562 protected void implReset() {
\r
2563 super.implReset();
\r
2564 myConverterData.reset();
\r
2565 setInitialStateFromUnicodeKR(this);
\r
2568 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
\r
2569 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
\r
2570 CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
2571 CoderResult err = CoderResult.UNDERFLOW;
\r
2572 byte[] buffer = new byte[8];
\r
2573 int length, i = 0;
\r
2576 subchar = encoder.replacement();
\r
2577 length = subchar.length;
\r
2579 if (myConverterData.version == 0) {
\r
2580 if (length == 1) {
\r
2581 if (encoder.fromUnicodeStatus != 0) {
\r
2582 /* in DBCS mode: switch to SBCS */
\r
2583 encoder.fromUnicodeStatus = 0;
\r
2584 buffer[i++] = UConverterConstants.SI;
\r
2586 buffer[i++] = subchar[0];
\r
2587 } else { /* length == 2 */
\r
2588 if (encoder.fromUnicodeStatus == 0) {
\r
2589 /* in SBCS mode: switch to DBCS */
\r
2590 encoder.fromUnicodeStatus = 1;
\r
2591 buffer[i++] = UConverterConstants.SO;
\r
2593 buffer[i++] = subchar[0];
\r
2594 buffer[i++] = subchar[1];
\r
2596 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
\r
2598 /* save the subvonverter's substitution string */
\r
2599 byte[] currentSubChars = myConverterData.currentEncoder.replacement();
\r
2601 /* set our substitution string into the subconverter */
\r
2602 myConverterData.currentEncoder.replaceWith(subchar);
\r
2603 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0];
\r
2604 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
\r
2605 myConverterData.currentEncoder.fromUChar32 = encoder.fromUChar32;
\r
2606 err = myConverterData.currentEncoder.cbFromUWriteSub(myConverterData.currentEncoder, source, target, offsets);
\r
2607 encoder.fromUChar32 = myConverterData.currentEncoder.fromUChar32;
\r
2609 /* restore the subconverter's substitution string */
\r
2610 myConverterData.currentEncoder.replaceWith(currentSubChars);
\r
2612 if (err.isOverflow()) {
\r
2613 if (myConverterData.currentEncoder.errorBufferLength > 0) {
\r
2614 encoder.errorBuffer = myConverterData.currentEncoder.errorBuffer.clone();
\r
2616 encoder.errorBufferLength = myConverterData.currentEncoder.errorBufferLength;
\r
2617 myConverterData.currentEncoder.errorBufferLength = 0;
\r
2624 private CoderResult encodeLoopIBM(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2625 CoderResult err = CoderResult.UNDERFLOW;
\r
2627 myConverterData.currentEncoder.fromUChar32 = fromUChar32;
\r
2628 err = myConverterData.currentEncoder.cnvMBCSFromUnicodeWithOffsets(source, target, offsets, flush);
\r
2629 fromUChar32 = myConverterData.currentEncoder.fromUChar32;
\r
2631 if (err.isOverflow()) {
\r
2632 if (myConverterData.currentEncoder.errorBufferLength > 0) {
\r
2633 errorBuffer = myConverterData.currentEncoder.errorBuffer.clone();
\r
2635 errorBufferLength = myConverterData.currentEncoder.errorBufferLength;
\r
2636 myConverterData.currentEncoder.errorBufferLength = 0;
\r
2642 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2643 CoderResult err = CoderResult.UNDERFLOW;
\r
2644 int[] targetByteUnit = { 0x0000 };
\r
2645 int sourceChar = 0x0000;
\r
2646 boolean isTargetByteDBCS;
\r
2647 boolean oldIsTargetByteDBCS;
\r
2648 boolean usingFallback;
\r
2650 boolean gotoGetTrail = false; // for goto getTrail label call
\r
2653 * if the version is 1 then the user is requesting
\r
2654 * conversion with ibm-25546 pass the argument to
\r
2655 * MBCS converter and return
\r
2657 if (myConverterData.version == 1) {
\r
2658 return encodeLoopIBM(source, target, offsets, flush);
\r
2661 usingFallback = useFallback;
\r
2662 isTargetByteDBCS = fromUnicodeStatus == 0 ? false : true;
\r
2663 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
\r
2664 gotoGetTrail = true;
\r
2667 while (source.hasRemaining() || gotoGetTrail) {
\r
2668 targetByteUnit[0] = UConverterConstants.missingCharMarker;
\r
2670 if (target.hasRemaining() || gotoGetTrail) {
\r
2671 if (!gotoGetTrail) {
\r
2672 sourceChar = source.get();
\r
2674 /* do not convert SO/SI/ESC */
\r
2675 if (IS_2022_CONTROL(sourceChar)) {
\r
2676 /* callback(illegal) */
\r
2677 err = CoderResult.malformedForLength(1);
\r
2678 fromUChar32 = sourceChar;
\r
2681 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
2682 length = myConverterData.currentEncoder.fromUChar32(sourceChar, targetByteUnit, usingFallback);
\r
2683 //length = MBCSFromUChar32_ISO2022(myConverterData.currentConverter.sharedData, sourceChar, targetByteUnit, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
2685 length = -length; /* fallback */
\r
2687 /* only DBCS or SBCS characters are expected */
\r
2688 /* DB characters with high bit set to 1 are expected */
\r
2689 if (length > 2 || length == 0 ||
\r
2690 (length == 1 && targetByteUnit[0] > 0x7f) ||
\r
2692 ((char)(targetByteUnit[0] - 0xa1a1) > (0xfefe - 0xa1a1) ||
\r
2693 ((targetByteUnit[0] - 0xa1) & UConverterConstants.UNSIGNED_BYTE_MASK) > (0xfe - 0xa1)))) {
\r
2694 targetByteUnit[0] = UConverterConstants.missingCharMarker;
\r
2697 if (!gotoGetTrail && targetByteUnit[0] != UConverterConstants.missingCharMarker) {
\r
2698 oldIsTargetByteDBCS = isTargetByteDBCS;
\r
2699 isTargetByteDBCS = (targetByteUnit[0] > 0x00FF);
\r
2700 /* append the shift sequence */
\r
2701 if (oldIsTargetByteDBCS != isTargetByteDBCS) {
\r
2702 if (isTargetByteDBCS) {
\r
2703 target.put((byte)UConverterConstants.SO);
\r
2705 target.put((byte)UConverterConstants.SI);
\r
2707 if (offsets != null) {
\r
2708 offsets.put(source.position()-1);
\r
2711 /* write the targetUniChar to target */
\r
2712 if (targetByteUnit[0] <= 0x00FF) {
\r
2713 if (target.hasRemaining()) {
\r
2714 target.put((byte)targetByteUnit[0]);
\r
2715 if (offsets != null) {
\r
2716 offsets.put(source.position()-1);
\r
2719 errorBuffer[errorBufferLength++] = (byte)targetByteUnit[0];
\r
2720 err = CoderResult.OVERFLOW;
\r
2723 if (target.hasRemaining()) {
\r
2724 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80)));
\r
2725 if (offsets != null) {
\r
2726 offsets.put(source.position()-1);
\r
2728 if (target.hasRemaining()) {
\r
2729 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80)));
\r
2730 if (offsets != null) {
\r
2731 offsets.put(source.position()-1);
\r
2734 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0] - 0x80));
\r
2735 err = CoderResult.OVERFLOW;
\r
2739 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80));
\r
2740 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80));
\r
2741 err = CoderResult.OVERFLOW;
\r
2745 /* oops.. the code point is unassigned
\r
2746 * set the error and reason
\r
2749 /* check if the char is a First surrogate */
\r
2750 if (gotoGetTrail || UTF16.isSurrogate((char)sourceChar)) {
\r
2751 if (gotoGetTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
\r
2753 // reset gotoGetTrail flag
\r
2754 gotoGetTrail = false;
\r
2756 /* look ahead to find the trail surrogate */
\r
2757 if (source.hasRemaining()) {
\r
2758 /* test the following code unit */
\r
2759 char trail = source.get();
\r
2760 source.position(source.position()-1);
\r
2761 if (UTF16.isTrailSurrogate(trail)) {
\r
2763 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
\r
2764 err = CoderResult.unmappableForLength(2);
\r
2765 /* convert this surrogate code point */
\r
2766 /* exit this condition tree */
\r
2768 /* this is an unmatched lead code unit (1st surrogate) */
\r
2769 /* callback(illegal) */
\r
2770 err = CoderResult.malformedForLength(1);
\r
2773 /* no more input */
\r
2774 err = CoderResult.UNDERFLOW;
\r
2777 /* this is an unmatched trail code unit (2nd surrogate ) */
\r
2778 /* callback(illegal) */
\r
2779 err = CoderResult.malformedForLength(1);
\r
2782 /* callback(unassigned) for a BMP code point */
\r
2783 err = CoderResult.unmappableForLength(1);
\r
2786 fromUChar32 = sourceChar;
\r
2790 err = CoderResult.OVERFLOW;
\r
2795 * the end of the input stream and detection of truncated input
\r
2796 * are handled by the framework, but for ISO-2022-KR conversion
\r
2797 * we need to be inASCII mode at the very end
\r
2801 * not in ASCII mode
\r
2802 * end of input and no truncated input
\r
2804 if (!err.isError() && isTargetByteDBCS && flush && !source.hasRemaining() && fromUChar32 == 0) {
\r
2807 /* we are switching to ASCII */
\r
2808 isTargetByteDBCS = false;
\r
2810 /* get the source index of the last input character */
\r
2811 sourceIndex = source.position();
\r
2812 if (sourceIndex > 0) {
\r
2814 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) && UTF16.isLeadSurrogate(source.get(sourceIndex-1))) {
\r
2821 CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex);
\r
2823 /*save the state and return */
\r
2824 fromUnicodeStatus = isTargetByteDBCS ? 1 : 0;
\r
2830 public CharsetDecoder newDecoder() {
\r
2831 switch (variant) {
\r
2833 return new CharsetDecoderISO2022JP(this);
\r
2836 return new CharsetDecoderISO2022CN(this);
\r
2839 setInitialStateToUnicodeKR();
\r
2840 return new CharsetDecoderISO2022KR(this);
\r
2842 default: /* should not happen */
\r
2847 public CharsetEncoder newEncoder() {
\r
2848 CharsetEncoderICU cnv;
\r
2850 switch (variant) {
\r
2852 return new CharsetEncoderISO2022JP(this);
\r
2855 return new CharsetEncoderISO2022CN(this);
\r
2858 cnv = new CharsetEncoderISO2022KR(this);
\r
2859 setInitialStateFromUnicodeKR(cnv);
\r
2862 default: /* should not happen */
\r
2867 private void setInitialStateToUnicodeKR() {
\r
2868 if (myConverterData.version == 1) {
\r
2869 myConverterData.currentDecoder.toUnicodeStatus = 0; /* offset */
\r
2870 myConverterData.currentDecoder.mode = 0; /* state */
\r
2871 myConverterData.currentDecoder.toULength = 0; /* byteIndex */
\r
2874 private void setInitialStateFromUnicodeKR(CharsetEncoderICU cnv) {
\r
2875 /* ISO-2022-KR the designator sequence appears only once
\r
2876 * in a file so we append it only once
\r
2878 if (cnv.errorBufferLength == 0) {
\r
2879 cnv.errorBufferLength = 4;
\r
2880 cnv.errorBuffer[0] = 0x1b;
\r
2881 cnv.errorBuffer[1] = 0x24;
\r
2882 cnv.errorBuffer[2] = 0x29;
\r
2883 cnv.errorBuffer[3] = 0x43;
\r
2885 if (myConverterData.version == 1) {
\r
2886 ((CharsetMBCS)myConverterData.currentEncoder.charset()).subChar1 = 0x1A;
\r
2887 myConverterData.currentEncoder.fromUChar32 = 0;
\r
2888 myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */
\r
2892 void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
\r
2894 /*open a set and initialize it with code points that are algorithmically round-tripped */
\r
2898 /*include JIS X 0201 which is hardcoded */
\r
2899 setFillIn.add(0xa5);
\r
2900 setFillIn.add(0x203e);
\r
2901 if((jpCharsetMasks[myConverterData.version]&CSM(ISO8859_1))!=0){
\r
2902 /*include Latin-1 some variants of JP */
\r
2903 setFillIn.add(0, 0xff);
\r
2907 /* include ASCII for JP */
\r
2908 setFillIn.add(0, 0x7f);
\r
2910 if(myConverterData.version==3 || myConverterData.version==4 ||which == ROUNDTRIP_AND_FALLBACK_SET){
\r
2912 * Do not test(jpCharsetMasks[myConverterData.version]&CSM(HWKANA_7BIT))!=0 because the bit
\r
2913 * is on for all JP versions although version 3 & 4 (JIS7 and JIS8) use half-width Katakana.
\r
2914 * This is because all ISO_2022_JP variant are lenient in that they accept (in toUnicode) half-width
\r
2915 * Katakana via ESC.
\r
2916 * However, we only emit (fromUnicode) half-width Katakana according to the
\r
2917 * definition of each variant.
\r
2919 * When including fallbacks,
\r
2920 * we need to include half-width Katakana Unicode code points for all JP variants because
\r
2921 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
\r
2923 /* include half-width Katakana for JP */
\r
2924 setFillIn.add(HWKANA_START, HWKANA_END);
\r
2928 /* Include ASCII for CN */
\r
2929 setFillIn.add(0, 0x7f);
\r
2932 /* there is only one converter for KR */
\r
2933 myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which);
\r
2939 //TODO Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until
\r
2940 for(i=0; i<UCNV_2022_MAX_CONVERTERS;i++){
\r
2942 if(myConverterData.myConverterArray[i]!=null){
\r
2943 if(variant==ISO_2022_CN && myConverterData.version==0 && i==CNS_11643){
\r
2946 * version -specific for CN:
\r
2947 * CN version 0 does not map CNS planes 3..7 although
\r
2948 * they are all available in the CNS conversion table;
\r
2949 * CN version 1 (-EXT) does map them all.
\r
2950 * The two versions create different Unicode sets.
\r
2952 filter=CharsetMBCS.UCNV_SET_FILTER_2022_CN;
\r
2953 } else if(variant==ISO_2022_JP && i == JISX208){
\r
2955 * Only add code points that map to Shift-JIS codes
\r
2956 * corrosponding to JIS X 208
\r
2958 filter=CharsetMBCS.UCNV_SET_FILTER_SJIS;
\r
2959 } else if(i==KSC5601){
\r
2961 * Some of the KSC 5601 tables (Convrtrs.txt has this aliases on multiple tables)
\r
2962 * are broader than GR94.
\r
2964 filter=CharsetMBCS.UCNV_SET_FILTER_GR94DBCS;
\r
2966 filter=CharsetMBCS.UCNV_SET_FILTER_NONE;
\r
2969 myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(myConverterData.myConverterArray[i],setFillIn, which, filter);
\r
2973 * ISO Converter must not convert SO/SI/ESC despite what sub-converters do by themselves
\r
2974 * Remove these characters from the set.
\r
2976 setFillIn.remove(0x0e);
\r
2977 setFillIn.remove(0x0f);
\r
2978 setFillIn.remove(0x1b);
\r
2980 /* ISO 2022 converter do not convert C! controls either */
\r
2981 setFillIn.remove(0x80, 0x9f);
\r