2 *******************************************************************************
\r
3 * Copyright (C) 2008-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.charset;
\r
9 import java.nio.ByteBuffer;
\r
10 import java.nio.CharBuffer;
\r
11 import java.nio.IntBuffer;
\r
12 import java.nio.charset.CharsetDecoder;
\r
13 import java.nio.charset.CharsetEncoder;
\r
14 import java.nio.charset.CoderResult;
\r
15 import java.util.Arrays;
\r
17 import com.ibm.icu.charset.CharsetMBCS.CharsetDecoderMBCS;
\r
18 import com.ibm.icu.charset.CharsetMBCS.CharsetEncoderMBCS;
\r
19 import com.ibm.icu.lang.UCharacter;
\r
20 import com.ibm.icu.text.UTF16;
\r
21 import com.ibm.icu.text.UnicodeSet;
\r
23 class CharsetISO2022 extends CharsetICU {
\r
24 private UConverterDataISO2022 myConverterData;
\r
25 private int variant; // one of enum {ISO_2022_JP, ISO_2022_KR, or ISO_2022_CN}
\r
27 private static final byte[] SHIFT_IN_STR = { 0x0f };
\r
28 // private static final byte[] SHIFT_OUT_STR = { 0x0e };
\r
30 private static final byte CR = 0x0D;
\r
31 private static final byte LF = 0x0A;
\r
33 private static final byte H_TAB = 0x09;
\r
34 private static final byte SPACE = 0x20;
\r
36 private static final char HWKANA_START = 0xff61;
\r
37 private static final char HWKANA_END = 0xff9f;
\r
40 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
\r
41 * as bytes 21..7E. (Subtract 0x80.)
\r
42 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
\r
43 * as bytes 20..7F. (Subtract 0x80.)
\r
44 * Do not encode C1 control codes with native bytes 80..9F
\r
45 * as bytes 00..1F (C0 control codes).
\r
48 private static final char GR94_START = 0xa1;
\r
49 private static final char GR94_END = 0xfe;
\r
51 private static final char GR96_START = 0xa0;
\r
52 private static final char GR96_END = 0xff;
\r
54 /* for ISO-2022-JP and -CN implementations */
\r
57 private static final byte INVALID_STATE = -1;
\r
58 private static final byte ASCII = 0;
\r
60 private static final byte SS2_STATE = 0x10;
\r
61 private static final byte SS3_STATE = 0x11;
\r
64 private static final byte ISO8859_1 = 1;
\r
65 private static final byte ISO8859_7 = 2;
\r
66 private static final byte JISX201 = 3;
\r
67 private static final byte JISX208 = 4;
\r
68 private static final byte JISX212 = 5;
\r
69 private static final byte GB2312 = 6;
\r
70 private static final byte KSC5601 = 7;
\r
71 private static final byte HWKANA_7BIT = 8; /* Halfwidth Katakana 7 bit */
\r
74 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
\r
75 private static final byte GB2312_1 = 1;
\r
76 private static final byte ISO_IR_165= 2;
\r
77 private static final byte CNS_11643 = 3;
\r
80 * these are used in StateEnum and ISO2022State variables,
\r
81 * but CNS_11643 must be used to index into myConverterArray[]
\r
83 private static final byte CNS_11643_0 = 0x20;
\r
84 private static final byte CNS_11643_1 = 0x21;
\r
85 private static final byte CNS_11643_2 = 0x22;
\r
86 private static final byte CNS_11643_3 = 0x23;
\r
87 private static final byte CNS_11643_4 = 0x24;
\r
88 private static final byte CNS_11643_5 = 0x25;
\r
89 private static final byte CNS_11643_6 = 0x26;
\r
90 private static final byte CNS_11643_7 = 0x27;
\r
94 public CharsetISO2022(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
\r
95 super(icuCanonicalName, javaCanonicalName, aliases);
\r
97 myConverterData = new UConverterDataISO2022();
\r
99 int versionIndex = icuCanonicalName.indexOf("version=");
\r
100 int version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
\r
102 myConverterData.version = version;
\r
104 if (icuCanonicalName.indexOf("locale=ja") > 0) {
\r
105 ISO2022InitJP(version);
\r
106 } else if (icuCanonicalName.indexOf("locale=zh") > 0) {
\r
107 ISO2022InitCN(version);
\r
108 } else /* if (icuCanonicalName.indexOf("locale=ko") > 0) */ {
\r
109 ISO2022InitKR(version);
\r
112 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder();
\r
113 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder();
\r
116 private void ISO2022InitJP(int version) {
\r
117 variant = ISO_2022_JP;
\r
119 maxBytesPerChar = 6;
\r
120 minBytesPerChar = 1;
\r
121 maxCharsPerByte = 1;
\r
122 // open the required converters and cache them
\r
123 if((jpCharsetMasks[version]&CSM(ISO8859_7)) != 0) {
\r
124 myConverterData.myConverterArray[ISO8859_7] = ((CharsetMBCS)CharsetICU.forNameICU("ISO8859_7")).sharedData;
\r
126 // myConverterData.myConverterArray[JISX201] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-201")).sharedData;
\r
127 myConverterData.myConverterArray[JISX208] = ((CharsetMBCS)CharsetICU.forNameICU("Shift-JIS")).sharedData;
\r
128 if ((jpCharsetMasks[version]&CSM(JISX212)) != 0) {
\r
129 myConverterData.myConverterArray[JISX212] = ((CharsetMBCS)CharsetICU.forNameICU("jisx-212")).sharedData;
\r
131 if ((jpCharsetMasks[version]&CSM(GB2312)) != 0) {
\r
132 myConverterData.myConverterArray[GB2312] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData;
\r
134 if ((jpCharsetMasks[version]&CSM(KSC5601)) != 0) {
\r
135 myConverterData.myConverterArray[KSC5601] = ((CharsetMBCS)CharsetICU.forNameICU("ksc_5601")).sharedData;
\r
138 // create a generic CharsetMBCS object
\r
139 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
142 private void ISO2022InitCN(int version) {
\r
143 variant = ISO_2022_CN;
\r
145 maxBytesPerChar = 8;
\r
146 minBytesPerChar = 1;
\r
147 maxCharsPerByte = 1;
\r
148 // open the required coverters and cache them.
\r
149 myConverterData.myConverterArray[GB2312_1] = ((CharsetMBCS)CharsetICU.forNameICU("ibm-5478")).sharedData;
\r
150 if (version == 1) {
\r
151 myConverterData.myConverterArray[ISO_IR_165] = ((CharsetMBCS)CharsetICU.forNameICU("iso-ir-165")).sharedData;
\r
153 myConverterData.myConverterArray[CNS_11643] = ((CharsetMBCS)CharsetICU.forNameICU("cns-11643-1992")).sharedData;
\r
155 // create a generic CharsetMBCS object
\r
156 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
159 private void ISO2022InitKR(int version) {
\r
160 variant = ISO_2022_KR;
\r
162 maxBytesPerChar = 3;
\r
163 minBytesPerChar = 1;
\r
164 maxCharsPerByte = 1;
\r
166 if (version == 1) {
\r
167 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
\r
168 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0];
\r
170 myConverterData.currentConverter = (CharsetMBCS)CharsetICU.forNameICU("ibm-949");
\r
173 myConverterData.currentEncoder = (CharsetEncoderMBCS)myConverterData.currentConverter.newEncoder();
\r
174 myConverterData.currentDecoder = (CharsetDecoderMBCS)myConverterData.currentConverter.newDecoder();
\r
178 * ISO 2022 control codes must not be converted from Unicode
\r
179 * because they would mess up the byte stream.
\r
180 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
\r
181 * corresponding to SO, SI, and ESC.
\r
183 private static boolean IS_2022_CONTROL(int c) {
\r
184 return (((c)<0x20) && ((((int)1<<c) & 0x0800c000) != 0));
\r
188 * Check that the result is a 2-byte value with each byte in the range A1..FE
\r
189 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
\r
190 * to move it to the ISO 2022 range 21..7E.
\r
191 * return 0 if out of range.
\r
193 private static int _2022FromGR94DBCS(int value) {
\r
194 if ((value <= 0xfefe && value >= 0xa1a1) &&
\r
195 ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(value&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) {
\r
196 return (value - 0x8080); /* shift down to 21..7e byte range */
\r
198 return 0; /* not valid for ISO 2022 */
\r
203 * Commented out because Ticket 5691: Call sites now check for validity. They can just += 0x8080 after that.
\r
205 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
\r
206 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
\r
209 private static int _2022ToGR94DBCS(int value) {
\r
210 int returnValue = value + 0x8080;
\r
212 if ((returnValue <= 0xfefe && returnValue >= 0xa1a1) &&
\r
213 ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0xfe && ((short)(returnValue&UConverterConstants.UNSIGNED_BYTE_MASK) >= 0xa1))) {
\r
214 return returnValue;
\r
220 /* is the StateEnum charset value for a DBCS charset? */
\r
221 private static boolean IS_JP_DBCS(byte cs) {
\r
222 return ((JISX208 <= cs) && (cs <= KSC5601));
\r
225 private static short CSM(short cs) {
\r
226 return (short)(1<<cs);
\r
229 /* This gets the valid index of the end of buffer when decoding. */
\r
230 private static int getEndOfBuffer_2022(ByteBuffer source) {
\r
231 int sourceIndex = source.position();
\r
233 mySource = source.get(sourceIndex);
\r
235 while (source.hasRemaining() && mySource != ESC_2022) {
\r
236 mySource = source.get();
\r
237 if (mySource == ESC_2022) {
\r
242 return sourceIndex;
\r
246 * This is a simple version of _MBCSGetNextUChar() that calls the method in CharsetDecoderMBCS and returns
\r
250 * U+fffe unassigned
\r
252 * otherwise the Unicode code point
\r
254 private int MBCSSimpleGetNextUChar(UConverterSharedData sharedData,
\r
255 ByteBuffer source,
\r
256 boolean useFallback) {
\r
258 UConverterSharedData tempSharedData = myConverterData.currentConverter.sharedData;
\r
259 myConverterData.currentConverter.sharedData = sharedData;
\r
260 returnValue = ((CharsetDecoderMBCS)myConverterData.currentDecoder).simpleGetNextUChar(source, useFallback);
\r
261 myConverterData.currentConverter.sharedData = tempSharedData;
\r
263 return returnValue;
\r
267 * @param retval single-element array; retval[0] receives the output byte
\r
268 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
\r
270 static int MBCSSingleFromUChar32(UConverterSharedData sharedData, int c, int[] retval, boolean useFallback) {
\r
273 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
\r
274 if (c >= 0x10000 && (sharedData.mbcs.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
\r
277 /* convert the Unicode code point in c into codepage bytes */
\r
278 table = sharedData.mbcs.fromUnicodeTable;
\r
279 /* get the byte for the output */
\r
280 value = CharsetMBCS.MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c);
\r
281 /* get the byte for the output */
\r
282 retval[0] = value & 0xff;
\r
283 if (value >= 0xf00) {
\r
284 return 1; /* roundtrip */
\r
285 } else if (useFallback ? value>=0x800 : value>=0xc00) {
\r
286 return -1; /* fallback taken */
\r
288 return 0; /* no mapping */
\r
293 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
\r
294 * to whether that charset is used in the corresponding version x of ISO_2022, locale=ja,version=x
\r
296 * Note: The converter uses some leniency:
\r
297 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
\r
298 * all versions, not just JIS7 and JIS8.
\r
299 * - ICU does not distinguish between different versions of JIS X 0208.
\r
301 private static final short jpCharsetMasks[] = {
\r
302 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)),
\r
303 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)),
\r
304 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)),
\r
305 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)),
\r
306 (short)(CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7))
\r
311 private static final byte ASCII1 = 0;
\r
312 private static final byte LATIN1 = 1;
\r
313 private static final byte SBCS = 2;
\r
314 private static final byte DBCS = 3;
\r
315 private static final byte MBCS = 4;
\r
316 private static final byte HWKANA = 5;
\r
320 private class ISO2022State {
\r
321 private byte []cs; /* Charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
\r
322 private byte g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
\r
323 private byte prevG; /* g before single shift (SS2 or SS3) */
\r
330 Arrays.fill(cs, (byte)0);
\r
336 // private static final byte UCNV_OPTIONS_VERSION_MASK = 0xf;
\r
337 private static final byte UCNV_2022_MAX_CONVERTERS = 10;
\r
339 private class UConverterDataISO2022 {
\r
340 UConverterSharedData []myConverterArray;
\r
341 CharsetEncoderMBCS currentEncoder;
\r
342 CharsetDecoderMBCS currentDecoder;
\r
343 CharsetMBCS currentConverter;
\r
344 int currentType; // Cnv2022Type;
\r
345 ISO2022State toU2022State;
\r
346 ISO2022State fromU2022State;
\r
349 boolean isEmptySegment;
\r
351 UConverterDataISO2022() {
\r
352 myConverterArray = new UConverterSharedData[UCNV_2022_MAX_CONVERTERS];
\r
353 toU2022State = new ISO2022State();
\r
354 fromU2022State = new ISO2022State();
\r
358 isEmptySegment = false;
\r
362 toU2022State.reset();
\r
363 fromU2022State.reset();
\r
364 isEmptySegment = false;
\r
368 private static final byte ESC_2022 = 0x1B; /* ESC */
\r
371 private static final byte INVALID_2022 = -1; /* Doesn't correspond to a valid iso 2022 escape sequence */
\r
372 private static final byte VALID_NON_TERMINAL_2022 = 0; /* so far corresponds to a valid iso 2022 escape sequence */
\r
373 private static final byte VALID_TERMINAL_2022 = 1; /* corresponds to a valid iso 2022 escape sequence */
\r
374 private static final byte VALID_MAYBE_TERMINAL_2022 = 2; /* so far matches one iso 2022 escape sequence, but by adding
\r
375 more characters might match another escape sequence */
\r
376 // } UCNV_TableStates_2022;
\r
379 * The way these state transition arrays work is:
\r
380 * ex : ESC$B is the sequence for JISX208
\r
381 * a) First Iteration: char is ESC
\r
382 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
\r
383 * int x = normalize_esq_chars_2022[27] which is equal to 1
\r
384 * ii) Search for this value in escSeqStateTable_Key_2022[]
\r
385 * value of x is stored at escSeqStateTable_Key_2022[0]
\r
386 * iii) Save this index as offset
\r
387 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
\r
388 * escSeqStateTable_value_2022[offset], which is VALID_NON_TERMINAL_2022
\r
389 * b) Switch on this state and continue to next char
\r
390 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
\r
391 * which is normalize_esq_chars_2022[36] == 4
\r
392 * ii) x is currently 1(from above)
\r
393 * x<<=5 -- x is now 32
\r
394 * x+=normalize_esq_chars_2022[36]
\r
396 * iii) Search for this value in escSeqStateTable_Key_2022[]
\r
397 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
\r
398 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
\r
399 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
\r
400 * c) Switch on this state and continue to next char
\r
401 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
\r
402 * ii) x is currently 36 (from above)
\r
403 * x<<=5 -- x is now 1152
\r
404 * x+= normalize_esq_chars_2022[66]
\r
406 * iii) Search for this value in escSeqStateTable_Key_2022[]
\r
407 * value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
\r
408 * iv) Get state of this sequence from escSeqStateTable_Value_2022[1]
\r
409 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
\r
410 * v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
\r
412 /* Below are the 3 arrays depicting a state transition table */
\r
413 private static final byte normalize_esq_chars_2022[] = {
\r
414 /* 0 1 2 3 4 5 6 7 8 9 */
\r
415 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
416 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
417 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
\r
418 0, 0, 0, 0, 0, 0, 4, 7, 29, 0,
\r
419 2, 24, 26, 27, 0, 3, 23, 6, 0, 0,
\r
420 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
421 0, 0, 0, 0, 5, 8, 9, 10, 11, 12,
\r
422 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
\r
423 0, 0, 21, 0, 0, 0, 0, 0, 0, 0,
\r
424 22, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
425 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
427 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
429 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
432 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
434 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
438 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
439 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
444 private static final short MAX_STATES_2022 = 74;
\r
445 private static final int escSeqStateTable_Key_2022[/* MAX_STATES_2022 */] = {
\r
446 /* 0 1 2 3 4 5 6 7 8 9 */
\r
447 1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
\r
448 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
\r
449 1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
\r
450 1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
\r
451 36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
\r
452 37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
\r
453 40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
\r
454 35947631, 35947635, 35947636, 35947638
\r
457 private static final byte escSeqStateTable_Value_2022[/* MAX_STATES_2022 */] = {
\r
459 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022,
\r
460 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
461 VALID_MAYBE_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
462 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
463 VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
464 VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
\r
465 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
\r
466 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
467 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
468 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
469 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
470 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
471 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_NON_TERMINAL_2022, VALID_TERMINAL_2022,
\r
472 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022,
\r
473 VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022, VALID_TERMINAL_2022
\r
476 /* Type def for refactoring changeState_2022 code */
\r
478 private static final byte ISO_2022_JP = 1;
\r
479 private static final byte ISO_2022_KR = 2;
\r
480 private static final byte ISO_2022_CN = 3;
\r
483 /* const UConverterSharedData _ISO2022Data; */
\r
484 //private UConverterSharedData _ISO2022JPData;
\r
485 //private UConverterSharedData _ISO2022KRData;
\r
486 //private UConverterSharedData _ISO2022CNData;
\r
488 /******************** to unicode ********************/
\r
489 /****************************************************
\r
490 * Recognized escape sequences are
\r
492 * <ESC>.A ISO-8859-1
\r
493 * <ESC>.F ISO-8859-7
\r
498 * <ESC>$(D JISX-212
\r
502 private final static byte nextStateToUnicodeJP[/* MAX_STATES_2022 */] = {
\r
503 /* 0 1 2 3 4 5 6 7 8 9 */
\r
504 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
505 ASCII, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, JISX201, HWKANA_7BIT, JISX201, INVALID_STATE,
\r
506 INVALID_STATE, INVALID_STATE, JISX208, GB2312, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
507 ISO8859_1, ISO8859_7, JISX208, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, KSC5601, JISX212, INVALID_STATE,
\r
508 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
509 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
510 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
511 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
\r
514 private final static byte nextStateToUnicodeCN[/* MAX_STATES_2022 */] = {
\r
515 /* 0 1 2 3 4 5 6 7 8 9 */
\r
516 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE, SS3_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
517 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
518 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
519 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
520 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, GB2312_1, INVALID_STATE, ISO_IR_165,
\r
521 CNS_11643_1, CNS_11643_2, CNS_11643_3, CNS_11643_4, CNS_11643_5, CNS_11643_6, CNS_11643_7, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
522 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
\r
523 INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
\r
526 /* runs through a state machine to determine the escape sequence - codepage correspondence */
\r
527 private CoderResult changeState_2022(CharsetDecoderICU decoder, ByteBuffer source, int var) {
\r
528 CoderResult err = CoderResult.UNDERFLOW;
\r
529 boolean DONE = false;
\r
531 int key[] = {myConverterData.key};
\r
532 int offset[] = {0};
\r
533 int initialToULength = decoder.toULength;
\r
535 int malformLength = 0;
\r
537 value = VALID_NON_TERMINAL_2022;
\r
538 while (source.hasRemaining()) {
\r
541 decoder.toUBytesArray[decoder.toULength++] = c;
\r
542 value = getKey_2022(c, key, offset);
\r
546 case VALID_NON_TERMINAL_2022:
\r
547 /* continue with the loop */
\r
550 case VALID_TERMINAL_2022:
\r
559 case VALID_MAYBE_TERMINAL_2022:
\r
560 /* not ISO_2022 itself, finish here */
\r
561 value = VALID_TERMINAL_2022;
\r
571 myConverterData.key = key[0];
\r
573 if (value == VALID_NON_TERMINAL_2022) {
\r
574 /* indicate that the escape sequence is incomplete: key !=0 */
\r
576 } else if (value == INVALID_2022) {
\r
577 err = CoderResult.malformedForLength(malformLength);
\r
578 } else /* value == VALID_TERMINAL_2022 */ {
\r
580 case ISO_2022_JP: {
\r
581 byte tempState = nextStateToUnicodeJP[offset[0]];
\r
582 switch (tempState) {
\r
583 case INVALID_STATE:
\r
584 err = CoderResult.malformedForLength(malformLength);
\r
587 if (myConverterData.toU2022State.cs[2] != 0) {
\r
588 if (myConverterData.toU2022State.g < 2) {
\r
589 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
\r
591 myConverterData.toU2022State.g = 2;
\r
593 /* illegal to have SS2 before a matching designator */
\r
594 err = CoderResult.malformedForLength(malformLength);
\r
597 /* case SS3_STATE: not used in ISO-2022-JP-x */
\r
600 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) {
\r
601 err = CoderResult.unmappableForLength(malformLength);
\r
603 /* G2 charset for SS2 */
\r
604 myConverterData.toU2022State.cs[2] = tempState;
\r
608 if ((jpCharsetMasks[myConverterData.version] & CSM(tempState)) == 0) {
\r
609 err = CoderResult.unmappableForLength(source.position() - 1);
\r
612 myConverterData.toU2022State.cs[0] = tempState;
\r
618 case ISO_2022_CN: {
\r
619 byte tempState = nextStateToUnicodeCN[offset[0]];
\r
620 switch (tempState) {
\r
621 case INVALID_STATE:
\r
622 err = CoderResult.unmappableForLength(malformLength);
\r
625 if (myConverterData.toU2022State.cs[2] != 0) {
\r
626 if (myConverterData.toU2022State.g < 2) {
\r
627 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
\r
629 myConverterData.toU2022State.g = 2;
\r
631 /* illegal to have SS2 before a matching designator */
\r
632 err = CoderResult.malformedForLength(malformLength);
\r
636 if (myConverterData.toU2022State.cs[3] != 0) {
\r
637 if (myConverterData.toU2022State.g < 2) {
\r
638 myConverterData.toU2022State.prevG = myConverterData.toU2022State.g;
\r
640 myConverterData.toU2022State.g = 3;
\r
642 /* illegal to have SS3 before a matching designator */
\r
643 err = CoderResult.malformedForLength(malformLength);
\r
647 if (myConverterData.version == 0) {
\r
648 err = CoderResult.unmappableForLength(malformLength);
\r
655 myConverterData.toU2022State.cs[1] = tempState;
\r
658 myConverterData.toU2022State.cs[2] = tempState;
\r
661 /* other CNS 11643 planes */
\r
662 if (myConverterData.version == 0) {
\r
663 err = CoderResult.unmappableForLength(source.position() - 1);
\r
665 myConverterData.toU2022State.cs[3] = tempState;
\r
672 if (offset[0] == 0x30) {
\r
673 /* nothing to be done, just accept this one escape sequence */
\r
675 err = CoderResult.unmappableForLength(malformLength);
\r
679 err = CoderResult.malformedForLength(malformLength);
\r
683 if (!err.isError()) {
\r
684 decoder.toULength = 0;
\r
685 } else if (err.isMalformed()) {
\r
686 if (decoder.toULength > 1) {
\r
688 * Ticket 5691: consistent illegal sequences:
\r
689 * - We include at least the first byte (ESC) in the illegal sequence.
\r
690 * - If any of the non-initial bytes could be the start of a character,
\r
691 * we stop the illegal sequence before the first one of those.
\r
692 * In escape sequences, all following bytes are "printable", that is,
\r
693 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
\r
694 * they are valid single/lead bytes.
\r
695 * For simplicity, we always only report the initial ESC byte as the
\r
696 * illegal sequence and back out all other bytes we looked at.
\r
698 /* Back out some bytes. */
\r
699 int backOutDistance = decoder.toULength - 1;
\r
700 int bytesFromThisBuffer = decoder.toULength - initialToULength;
\r
701 if (backOutDistance <= bytesFromThisBuffer) {
\r
702 /* same as initialToULength<=1 */
\r
703 source.position(source.position() - backOutDistance);
\r
705 /* Back out bytes from the previous buffer: Need to replay them. */
\r
706 decoder.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
\r
707 /* same as -(initalToULength-1) */
\r
708 /* preToULength is negative! */
\r
709 for (int i = 0; i < -(decoder.preToULength); i++) {
\r
710 decoder.preToUArray[i] = decoder.toUBytesArray[i+1];
\r
712 source.position(source.position() - bytesFromThisBuffer);
\r
714 decoder.toULength = 1;
\r
721 private static byte getKey_2022(byte c, int[]key, int[]offset) {
\r
724 int hi = MAX_STATES_2022;
\r
727 togo = normalize_esq_chars_2022[(short)c&UConverterConstants.UNSIGNED_BYTE_MASK];
\r
730 /* not a valid character anywhere in an escape sequence */
\r
733 return INVALID_2022;
\r
735 togo = (key[0] << 5) + togo;
\r
737 while (hi != low) { /* binary search */
\r
738 int mid = (hi+low) >> 1; /* Finds median */
\r
740 if (mid == oldmid) {
\r
744 if (escSeqStateTable_Key_2022[mid] > togo) {
\r
746 } else if (escSeqStateTable_Key_2022[mid] < togo) {
\r
748 } else /* we found it */ {
\r
751 return escSeqStateTable_Value_2022[mid];
\r
755 return INVALID_2022;
\r
759 * To Unicode Callback helper function
\r
761 private static CoderResult toUnicodeCallback(CharsetDecoderICU cnv, int sourceChar, int targetUniChar) {
\r
762 CoderResult err = CoderResult.UNDERFLOW;
\r
763 if (sourceChar > 0xff) {
\r
764 cnv.toUBytesArray[0] = (byte)(sourceChar>>8);
\r
765 cnv.toUBytesArray[1] = (byte)sourceChar;
\r
768 cnv.toUBytesArray[0] = (byte)sourceChar;
\r
772 if (targetUniChar == (UConverterConstants.missingCharMarker-1/* 0xfffe */)) {
\r
773 err = CoderResult.unmappableForLength(1);
\r
775 err = CoderResult.malformedForLength(1);
\r
781 /****************************ISO-2022-JP************************************/
\r
782 private class CharsetDecoderISO2022JP extends CharsetDecoderICU {
\r
783 public CharsetDecoderISO2022JP(CharsetICU cs) {
\r
787 protected void implReset() {
\r
789 myConverterData.reset();
\r
792 * Map 00..7F to Unicode according to JIS X 0201.
\r
794 private int jisx201ToU(int value) {
\r
795 if (value < 0x5c) {
\r
797 } else if (value == 0x5c) {
\r
799 } else if (value == 0x7e) {
\r
801 } else { /* value <= 0x7f */
\r
806 * Convert a pair of JIS X 208 21..7E bytes to Shift-JIS.
\r
807 * If either byte is outside 21..7E make sure that the result is not valid
\r
808 * for Shift-JIS so that the converter catches it.
\r
809 * Some invalid byte values already turn into equally invalid Shift-JIS
\r
810 * byte values and need not be tested explicitly.
\r
812 private void _2022ToSJIS(char c1, char c2, byte []bytes) {
\r
817 } else if (c2 <= 0x7e) {
\r
820 c2 = 0; /* invalid */
\r
823 if ((c2 >= 0x21) && (c2 <= 0x7e)) {
\r
826 c2 = 0; /* invalid */
\r
833 } else if (c1 <= 0x3f) {
\r
836 c1 = 0; /* invalid */
\r
838 bytes[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c1);
\r
839 bytes[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & c2);
\r
841 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
842 boolean gotoGetTrail = false;
\r
843 boolean gotoEscape = false;
\r
844 CoderResult err = CoderResult.UNDERFLOW;
\r
845 byte []tempBuf = new byte[2];
\r
846 int targetUniChar = 0x0000;
\r
847 int mySourceChar = 0x0000;
\r
848 int mySourceCharTemp = 0x0000; // use for getTrail label call.
\r
849 byte cs; /* StateEnum */
\r
850 byte csTemp= 0; // use for getTrail label call.
\r
852 if (myConverterData.key != 0) {
\r
853 /* continue with a partial escape sequence */
\r
856 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
\r
857 /* continue with a partial double-byte character */
\r
858 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
860 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
\r
861 // goto getTrailByte;
\r
862 mySourceCharTemp = 0x99;
\r
863 gotoGetTrail = true;
\r
866 while (source.hasRemaining() || gotoEscape || gotoGetTrail) {
\r
867 // This code is here for the goto escape label call above.
\r
869 mySourceCharTemp = ESC_2022;
\r
872 targetUniChar = UConverterConstants.missingCharMarker;
\r
874 if (gotoEscape || gotoGetTrail || target.hasRemaining()) {
\r
875 if (!gotoEscape && !gotoGetTrail) {
\r
876 mySourceChar = source.get() & UConverterConstants.UNSIGNED_BYTE_MASK;
\r
877 mySourceCharTemp = mySourceChar;
\r
880 switch (mySourceCharTemp) {
\r
881 case UConverterConstants.SI:
\r
882 if (myConverterData.version == 3) {
\r
883 myConverterData.toU2022State.g = 0;
\r
886 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
\r
887 myConverterData.isEmptySegment = false;
\r
891 case UConverterConstants.SO:
\r
892 if (myConverterData.version == 3) {
\r
893 /* JIS7: switch to G1 half-width Katakana */
\r
894 myConverterData.toU2022State.cs[1] = HWKANA_7BIT;
\r
895 myConverterData.toU2022State.g = 1;
\r
898 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
\r
899 myConverterData.isEmptySegment = false; /* reset this, we have a different error */
\r
905 source.position(source.position() - 1);
\r
907 gotoEscape = false;
\r
911 int mySourceBefore = source.position();
\r
912 int toULengthBefore = this.toULength;
\r
914 err = changeState_2022(this, source, variant);
\r
916 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
\r
917 if(myConverterData.version == 0 && myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) {
\r
918 err = CoderResult.malformedForLength(source.position() - mySourceBefore);
\r
919 this.toULength = toULengthBefore + (source.position() - mySourceBefore);
\r
923 /* invalid or illegal escape sequence */
\r
925 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */
\r
928 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
\r
929 if(myConverterData.key == 0) {
\r
930 myConverterData.isEmptySegment = true;
\r
934 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
\r
936 /* falls through */
\r
938 /* automatically reset to single-byte mode */
\r
939 if (myConverterData.toU2022State.cs[0] != ASCII && myConverterData.toU2022State.cs[0] != JISX201) {
\r
940 myConverterData.toU2022State.cs[0] = ASCII;
\r
942 myConverterData.toU2022State.cs[2] = 0;
\r
943 myConverterData.toU2022State.g = 0;
\r
944 /* falls through */
\r
946 /* convert one or two bytes */
\r
947 myConverterData.isEmptySegment = false;
\r
948 cs = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
\r
950 if (gotoGetTrail) {
\r
951 csTemp = (byte)0x99;
\r
953 if (!gotoGetTrail && ((mySourceChar >= 0xa1) && (mySourceChar <= 0xdf) && myConverterData.version == 4 && !IS_JP_DBCS(cs))) {
\r
954 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
\r
955 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
\r
957 /* return from a single-shift state to the previous one */
\r
958 if (myConverterData.toU2022State.g >= 2) {
\r
959 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
964 if (mySourceChar <= 0x7f) {
\r
965 targetUniChar = mySourceChar;
\r
969 if (mySourceChar <= 0x7f) {
\r
970 targetUniChar = mySourceChar + 0x80;
\r
972 /* return from a single-shift state to the previous one */
\r
973 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
976 if (mySourceChar <= 0x7f) {
\r
977 /* convert mySourceChar+0x80 to use a normal 8-bit table */
\r
978 targetUniChar = CharsetMBCS.MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myConverterData.myConverterArray[cs].mbcs,
\r
979 mySourceChar+0x80);
\r
981 /* return from a single-shift state to the previous one */
\r
982 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
985 if (mySourceChar <= 0x7f) {
\r
986 targetUniChar = jisx201ToU(mySourceChar);
\r
990 if ((mySourceChar >= 0x21) && (mySourceChar <= 0x5f)) {
\r
991 /* 7-bit halfwidth Katakana */
\r
992 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
\r
997 if (gotoGetTrail || source.hasRemaining()) {
\r
1000 gotoGetTrail = false;
\r
1002 boolean leadIsOk, trailIsOk;
\r
1004 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1006 * Ticket 5691: consistent illegal sequences:
\r
1007 * - We include at least the first byte in the illegal sequence.
\r
1008 * - If any of the non-initial bytes could be the start of a character,
\r
1009 * we stop the illegal sequence before the first one of those.
\r
1011 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
\r
1012 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
\r
1013 * Otherwise we convert or report the pair of bytes.
\r
1015 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
1016 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
\r
1017 if (leadIsOk && trailIsOk) {
\r
1019 tmpSourceChar = (mySourceChar << 8) | trailByte;
\r
1020 if (cs == JISX208) {
\r
1021 _2022ToSJIS((char)mySourceChar, (char)trailByte, tempBuf);
\r
1022 mySourceChar = tmpSourceChar;
\r
1024 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
\r
1025 mySourceChar = tmpSourceChar;
\r
1026 if (cs == KSC5601) {
\r
1027 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
\r
1029 tempBuf[0] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (tmpSourceChar >> 8));
\r
1030 tempBuf[1] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & tmpSourceChar);
\r
1032 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.myConverterArray[cs], ByteBuffer.wrap(tempBuf), false);
\r
1033 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
\r
1034 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
1036 /* add another bit so that the code below writes 2 bytes in case of error */
\r
1037 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
\r
1040 toUBytesArray[0] = (byte)mySourceChar;
\r
1045 } /* end of inner switch */
\r
1048 } /* end of outer switch */
\r
1050 if (targetUniChar < (UConverterConstants.missingCharMarker-1/*0xfffe*/)) {
\r
1051 if (offsets != null) {
\r
1052 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
\r
1054 target.put((char)targetUniChar);
\r
1055 } else if (targetUniChar > UConverterConstants.missingCharMarker) {
\r
1056 /* disassemble the surrogate pair and write to output */
\r
1057 targetUniChar -= 0x0010000;
\r
1058 target.put((char)(0xd800 + (char)(targetUniChar>>10)));
\r
1059 target.position(target.position()-1);
\r
1060 if (offsets != null) {
\r
1061 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
\r
1064 if (target.hasRemaining()) {
\r
1065 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff)));
\r
1066 target.position(target.position()-1);
\r
1067 if (offsets != null) {
\r
1068 offsets.put(target.remaining(), source.remaining() - (mySourceChar <= 0xff ? 1 : 2));
\r
1072 charErrorBufferArray[charErrorBufferLength++] =
\r
1073 (char)(0xdc00+(char)(targetUniChar&0x3ff));
\r
1076 /* Call the callback function */
\r
1077 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
\r
1080 } else { /* goes with "if (target.hasRemaining())" way up near the top of the function */
\r
1081 err = CoderResult.OVERFLOW;
\r
1088 } // end of class CharsetDecoderISO2022JP
\r
1090 /****************************ISO-2022-CN************************************/
\r
1091 private class CharsetDecoderISO2022CN extends CharsetDecoderICU {
\r
1092 public CharsetDecoderISO2022CN(CharsetICU cs) {
\r
1096 protected void implReset() {
\r
1097 super.implReset();
\r
1098 myConverterData.reset();
\r
1101 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1102 CoderResult err = CoderResult.UNDERFLOW;
\r
1103 byte[] tempBuf = new byte[3];
\r
1104 int targetUniChar = 0x0000;
\r
1105 int mySourceChar = 0x0000;
\r
1106 int mySourceCharTemp = 0x0000;
\r
1107 boolean gotoEscape = false;
\r
1108 boolean gotoGetTrailByte = false;
\r
1110 if (myConverterData.key != 0) {
\r
1111 /* continue with a partial escape sequence */
\r
1113 gotoEscape = true;
\r
1114 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
\r
1115 /* continue with a partial double-byte character */
\r
1116 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1118 targetUniChar = UConverterConstants.missingCharMarker;
\r
1119 // goto getTrailByte
\r
1120 gotoGetTrailByte = true;
\r
1123 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
\r
1124 targetUniChar = UConverterConstants.missingCharMarker;
\r
1126 if (target.hasRemaining() || gotoEscape) {
\r
1128 mySourceChar = ESC_2022; // goto escape label
\r
1129 mySourceCharTemp = mySourceChar;
\r
1130 } else if (gotoGetTrailByte) {
\r
1131 mySourceCharTemp = 0xff; // goto getTrailByte; set mySourceCharTemp to go to default
\r
1133 mySourceChar = UConverterConstants.UNSIGNED_BYTE_MASK & source.get();
\r
1134 mySourceCharTemp = mySourceChar;
\r
1137 switch (mySourceCharTemp) {
\r
1138 case UConverterConstants.SI:
\r
1139 myConverterData.toU2022State.g = 0;
\r
1140 if (myConverterData.isEmptySegment) {
\r
1141 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
\r
1142 err = CoderResult.malformedForLength(1);
\r
1143 this.toUBytesArray[0] = (byte)mySourceChar;
\r
1144 this.toULength = 1;
\r
1149 case UConverterConstants.SO:
\r
1150 if (myConverterData.toU2022State.cs[1] != 0) {
\r
1151 myConverterData.toU2022State.g = 1;
\r
1152 myConverterData.isEmptySegment = true; /* Begin a new segment, empty so far */
\r
1155 /* illegal to have SO before a matching designator */
\r
1156 myConverterData.isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
\r
1161 if (!gotoEscape) {
\r
1162 source.position(source.position()-1);
\r
1165 gotoEscape = false;
\r
1167 int mySourceBefore = source.position();
\r
1168 int toULengthBefore = this.toULength;
\r
1170 err = changeState_2022(this, source, ISO_2022_CN);
\r
1172 /* After SO there must be at least one character before a designator (designator error handled separately) */
\r
1173 if(myConverterData.key == 0 && !err.isError() && myConverterData.isEmptySegment) {
\r
1174 err = CoderResult.malformedForLength(source.position() - mySourceBefore);
\r
1175 this.toULength = toULengthBefore + (source.position() - mySourceBefore);
\r
1179 /* invalid or illegal escape sequence */
\r
1180 if(err.isError()){
\r
1181 myConverterData.isEmptySegment = false; /* Reset to avoid future spurious errors */
\r
1186 /*ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
\r
1188 /* falls through */
\r
1190 myConverterData.toU2022State.reset();
\r
1191 /* falls through */
\r
1193 /* convert one or two bytes */
\r
1194 myConverterData.isEmptySegment = false;
\r
1195 if (myConverterData.toU2022State.g != 0 || gotoGetTrailByte) {
\r
1196 if (source.hasRemaining() || gotoGetTrailByte) {
\r
1197 UConverterSharedData cnv;
\r
1200 boolean leadIsOk, trailIsOk;
\r
1202 // getTrailByte: label
\r
1203 gotoGetTrailByte = false; // reset gotoGetTrailByte
\r
1205 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1207 * Ticket 5691: consistent illegal sequences:
\r
1208 * - We include at least the first byte in the illegal sequence.
\r
1209 * - If any of the non-initial bytes could be the start of a character,
\r
1210 * we stop the illegal sequence before the first one of those.
\r
1212 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
\r
1213 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
\r
1214 * Otherwise we convert or report the pair of bytes.
\r
1216 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
1217 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
\r
1218 if (leadIsOk && trailIsOk) {
\r
1220 tempState = myConverterData.toU2022State.cs[myConverterData.toU2022State.g];
\r
1221 if (tempState > CNS_11643_0) {
\r
1222 cnv = myConverterData.myConverterArray[CNS_11643];
\r
1223 tempBuf[0] = (byte)(0x80 + (tempState - CNS_11643_0));
\r
1224 tempBuf[1] = (byte)mySourceChar;
\r
1225 tempBuf[2] = (byte)trailByte;
\r
1228 cnv = myConverterData.myConverterArray[tempState];
\r
1229 tempBuf[0] = (byte)mySourceChar;
\r
1230 tempBuf[1] = (byte)trailByte;
\r
1233 ByteBuffer tempBuffer = ByteBuffer.wrap(tempBuf);
\r
1234 tempBuffer.limit(tempBufLen);
\r
1235 targetUniChar = MBCSSimpleGetNextUChar(cnv, tempBuffer, false);
\r
1236 mySourceChar = (mySourceChar << 8) | trailByte;
\r
1238 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
\r
1239 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
1241 /* add another bit so that the code below writes 2 bytes in case of error */
\r
1242 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
\r
1244 if (myConverterData.toU2022State.g >= 2) {
\r
1245 /* return from a single-shift state to the previous one */
\r
1246 myConverterData.toU2022State.g = myConverterData.toU2022State.prevG;
\r
1249 toUBytesArray[0] = (byte)mySourceChar;
\r
1255 if (mySourceChar <= 0x7f) {
\r
1256 targetUniChar = (char)mySourceChar;
\r
1261 if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) < (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker-1))) {
\r
1262 if (offsets != null) {
\r
1263 offsets.array()[target.position()] = source.remaining() - (mySourceChar <= 0xff ? 1 : 2);
\r
1265 target.put((char)targetUniChar);
\r
1266 } else if ((UConverterConstants.UNSIGNED_INT_MASK&targetUniChar) > (UConverterConstants.UNSIGNED_INT_MASK&(UConverterConstants.missingCharMarker))) {
\r
1267 /* disassemble the surrogate pair and write to output */
\r
1268 targetUniChar -= 0x0010000;
\r
1269 target.put((char)(0xd800+(char)(targetUniChar>>10)));
\r
1270 if (offsets != null) {
\r
1271 offsets.array()[target.position()-1] = (int)(source.position() - (mySourceChar <= 0xff ? 1 : 2));
\r
1273 if (target.hasRemaining()) {
\r
1274 target.put((char)(0xdc00+(char)(targetUniChar&0x3ff)));
\r
1275 if (offsets != null) {
\r
1276 offsets.array()[target.position()-1] = (int)(source.position() - (mySourceChar <= 0xff ? 1 : 2));
\r
1279 charErrorBufferArray[charErrorBufferLength++] = (char)(0xdc00+(char)(targetUniChar&0x3ff));
\r
1282 /* Call the callback function */
\r
1283 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
\r
1288 err = CoderResult.OVERFLOW;
\r
1297 /************************ ISO-2022-KR ********************/
\r
1298 private class CharsetDecoderISO2022KR extends CharsetDecoderICU {
\r
1299 public CharsetDecoderISO2022KR(CharsetICU cs) {
\r
1303 protected void implReset() {
\r
1304 super.implReset();
\r
1305 setInitialStateToUnicodeKR();
\r
1306 myConverterData.reset();
\r
1309 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1310 CoderResult err = CoderResult.UNDERFLOW;
\r
1311 int mySourceChar = 0x0000;
\r
1312 int targetUniChar = 0x0000;
\r
1313 byte[] tempBuf = new byte[2];
\r
1314 boolean usingFallback;
\r
1315 boolean gotoGetTrailByte = false;
\r
1316 boolean gotoEscape = false;
\r
1318 if (myConverterData.version == 1) {
\r
1319 return decodeLoopIBM(myConverterData.currentDecoder, source, target, offsets, flush);
\r
1322 /* initialize state */
\r
1323 usingFallback = isFallbackUsed();
\r
1325 if (myConverterData.key != 0) {
\r
1326 /* continue with a partial escape sequence */
\r
1327 gotoEscape = true;
\r
1328 } else if (toULength == 1 && source.hasRemaining() && target.hasRemaining()) {
\r
1329 /* continue with a partial double-byte character */
\r
1330 mySourceChar = (toUBytesArray[0] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1332 gotoGetTrailByte = true;
\r
1335 while (source.hasRemaining() || gotoGetTrailByte || gotoEscape) {
\r
1336 if (target.hasRemaining() || gotoGetTrailByte || gotoEscape) {
\r
1337 if (!gotoGetTrailByte && !gotoEscape) {
\r
1338 mySourceChar = (char)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1341 if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SI) {
\r
1342 myConverterData.toU2022State.g = 0;
\r
1343 if (myConverterData.isEmptySegment) {
\r
1344 myConverterData.isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
\r
1345 err = CoderResult.malformedForLength(1);
\r
1346 this.toUBytesArray[0] = (byte)mySourceChar;
\r
1347 this.toULength = 1;
\r
1350 /* consume the source */
\r
1352 } else if (!gotoGetTrailByte && !gotoEscape && mySourceChar == UConverterConstants.SO) {
\r
1353 myConverterData.toU2022State.g = 1;
\r
1354 myConverterData.isEmptySegment = true;
\r
1355 /* consume the source */
\r
1357 } else if (!gotoGetTrailByte && (gotoEscape || mySourceChar == ESC_2022)) {
\r
1358 if (!gotoEscape) {
\r
1359 source.position(source.position()-1);
\r
1362 gotoEscape = false; // reset gotoEscape flag
\r
1363 myConverterData.isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
\r
1364 err = changeState_2022(this, source, ISO_2022_KR);
\r
1365 if (err.isError()) {
\r
1370 myConverterData.isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
\r
1371 if (myConverterData.toU2022State.g == 1 || gotoGetTrailByte) {
\r
1372 if (source.hasRemaining() || gotoGetTrailByte) {
\r
1373 boolean leadIsOk, trailIsOk;
\r
1375 // getTrailByte label
\r
1376 gotoGetTrailByte = false; // reset gotoGetTrailByte flag
\r
1378 trailByte = (short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1379 targetUniChar = UConverterConstants.missingCharMarker;
\r
1381 * Ticket 5691: consistent illegal sequences:
\r
1382 * - We include at least the first byte in the illegal sequence.
\r
1383 * - If any of the non-initial bytes could be the start of a character,
\r
1384 * we stop the illegal sequence before the first one of those.
\r
1386 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
\r
1387 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
\r
1388 * Otherwise we convert or report the pair of bytes.
\r
1390 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
\r
1391 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (trailByte - 0x21)) <= (0x7e - 0x21);
\r
1392 if (leadIsOk && trailIsOk) {
\r
1394 tempBuf[0] = (byte)(mySourceChar + 0x80);
\r
1395 tempBuf[1] = (byte)(trailByte + 0x80);
\r
1396 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, ByteBuffer.wrap(tempBuf), usingFallback);
\r
1397 mySourceChar = (char)((mySourceChar << 8) | trailByte);
\r
1398 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
\r
1399 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
\r
1401 /* add another bit so that the code below writes 2 bytes in case of error */
\r
1402 mySourceChar = (char)(0x10000 | (mySourceChar << 8) | trailByte);
\r
1405 toUBytesArray[0] = (byte)mySourceChar;
\r
1409 } else if (mySourceChar <= 0x7f) {
\r
1410 int savedSourceLimit = source.limit();
\r
1411 int savedSourcePosition = source.position();
\r
1412 source.limit(source.position());
\r
1413 source.position(source.position()-1);
\r
1414 targetUniChar = MBCSSimpleGetNextUChar(myConverterData.currentConverter.sharedData, source, usingFallback);
\r
1415 source.limit(savedSourceLimit);
\r
1416 source.position(savedSourcePosition);
\r
1418 targetUniChar = 0xffff;
\r
1420 if (targetUniChar < 0xfffe) {
\r
1421 target.put((char)targetUniChar);
\r
1422 if (offsets != null) {
\r
1423 offsets.array()[target.position()] = source.position() - (mySourceChar <= 0xff ? 1 : 2);
\r
1426 /* Call the callback function */
\r
1427 err = toUnicodeCallback(this, mySourceChar, targetUniChar);
\r
1431 err = CoderResult.OVERFLOW;
\r
1439 protected CoderResult decodeLoopIBM(CharsetDecoderMBCS cnv, ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1440 CoderResult err = CoderResult.UNDERFLOW;
\r
1445 boolean gotoEscape = false;
\r
1446 int oldSourceLimit;
\r
1448 /* remember the original start of the input for offsets */
\r
1449 sourceStart = argSource = source.position();
\r
1451 if (myConverterData.key != 0) {
\r
1452 /* continue with a partial escape sequence */
\r
1453 gotoEscape = true;
\r
1456 while (gotoEscape || (!err.isError() && source.hasRemaining())) {
\r
1457 if (!gotoEscape) {
\r
1458 /* Find the end of the buffer e.g : Next Escape Seq | end of Buffer */
\r
1459 int oldSourcePos = source.position();
\r
1460 sourceLimit = getEndOfBuffer_2022(source);
\r
1461 source.position(oldSourcePos);
\r
1462 if (source.position() != sourceLimit) {
\r
1464 * get the current partial byte sequence
\r
1466 * it needs to be moved between the public and the subconverter
\r
1467 * so that the conversion framework, which only sees the public
\r
1468 * converter, can handle truncated and illegal input etc.
\r
1470 if (toULength > 0) {
\r
1471 cnv.toUBytesArray = (byte[])(toUBytesArray.clone());
\r
1473 cnv.toULength = toULength;
\r
1476 * Convert up to the end of the input, or to before the next escape character.
\r
1477 * Does not handle conversion extensions because the preToU[] state etc.
\r
1480 argTarget = target.position();
\r
1481 oldSourceLimit = source.limit(); // save the old source limit change to new one
\r
1482 source.limit(sourceLimit);
\r
1483 err = myConverterData.currentDecoder.cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush);
\r
1484 source.limit(oldSourceLimit); // restore source limit;
\r
1485 if (offsets != null && sourceStart != argSource) {
\r
1486 /* update offsets to base them on the actual start of the input */
\r
1487 int delta = argSource - sourceStart;
\r
1488 while (argTarget < target.position()) {
\r
1489 int currentOffset = offsets.get();
\r
1490 offsets.position(offsets.position()-1);
\r
1491 if (currentOffset >= 0) {
\r
1492 offsets.put(currentOffset + delta);
\r
1493 offsets.position(offsets.position()-1);
\r
1499 argSource = source.position();
\r
1501 /* copy input/error/overflow buffers */
\r
1502 if (cnv.toULength > 0) {
\r
1503 toUBytesArray = (byte[])(cnv.toUBytesArray.clone());
\r
1505 toULength = cnv.toULength;
\r
1507 if (err.isOverflow()) {
\r
1508 if (cnv.charErrorBufferLength > 0) {
\r
1509 charErrorBufferArray = (char[])(cnv.charErrorBufferArray.clone());
\r
1511 charErrorBufferLength = cnv.charErrorBufferLength;
\r
1512 cnv.charErrorBufferLength = 0;
\r
1516 if (err.isError() || err.isOverflow() || (source.position() == source.limit())) {
\r
1521 gotoEscape = false;
\r
1522 err = changeState_2022(this, source, ISO_2022_KR);
\r
1528 /******************** from unicode **********************/
\r
1529 /* preference order of JP charsets */
\r
1530 private final static byte []jpCharsetPref = {
\r
1542 * The escape sequences must be in order of the enum constants like JISX201 = 3,
\r
1543 * not in order of jpCharsetPref[]!
\r
1545 private final static byte [][]escSeqChars = {
\r
1546 { 0x1B, 0x28, 0x42}, /* <ESC>(B ASCII */
\r
1547 { 0x1B, 0x2E, 0x41}, /* <ESC>.A ISO-8859-1 */
\r
1548 { 0x1B, 0x2E, 0x46}, /* <ESC>.F ISO-8859-7 */
\r
1549 { 0x1B, 0x28, 0x4A}, /* <ESC>(J JISX-201 */
\r
1550 { 0x1B, 0x24, 0x42}, /* <ESC>$B JISX-208 */
\r
1551 { 0x1B, 0x24, 0x28, 0x44}, /* <ESC>$(D JISX-212 */
\r
1552 { 0x1B, 0x24, 0x41}, /* <ESC>$A GB2312 */
\r
1553 { 0x1B, 0x24, 0x28, 0x43}, /* <ESC>$(C KSC5601 */
\r
1554 { 0x1B, 0x28, 0x49} /* <ESC>(I HWKANA_7BIT */
\r
1557 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
\r
1559 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
\r
1560 * because Shift-JIS roundtrips half-width Katakana to single bytes.
\r
1561 * These were the only fallbacks in ICU's jisx-208.ucm file.
\r
1563 private final static char []hwkana_fb = {
\r
1564 0x2123, /* U+FF61 */
\r
1579 0x213C, /* U+FF70 */
\r
1595 0x253F, /* U+FF80 */
\r
1611 0x255F, /* U+FF90 */
\r
1626 0x212C /* U+FF9F */
\r
1629 protected byte [][]fromUSubstitutionChar = new byte[][]{ { (byte)0x1A }, { (byte)0x2F, (byte)0x7E} };
\r
1630 /****************************ISO-2022-JP************************************/
\r
1631 private class CharsetEncoderISO2022JP extends CharsetEncoderICU {
\r
1632 public CharsetEncoderISO2022JP(CharsetICU cs) {
\r
1633 super(cs, fromUSubstitutionChar[0]);
\r
1636 protected void implReset() {
\r
1637 super.implReset();
\r
1638 myConverterData.reset();
\r
1640 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
\r
1641 private int jisx201FromU(int value) {
\r
1642 if (value <= 0x7f) {
\r
1643 if (value != 0x5c && value != 0x7e) {
\r
1646 } else if (value == 0xa5) {
\r
1648 } else if (value == 0x203e) {
\r
1651 return (int)(UConverterConstants.UNSIGNED_INT_MASK & 0xfffe);
\r
1655 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
\r
1656 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
\r
1657 * Return 0 if the byte pair is out of range.
\r
1659 private int _2022FromSJIS(int value) {
\r
1662 if (value > 0xEFFC) {
\r
1663 return 0; /* beyond JIS X 0208 */
\r
1666 trail = (short)(value & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1668 value &= 0xff00; /* lead byte */
\r
1669 if (value <= 0x9f00) {
\r
1671 } else { /* 0xe000 <= value <= 0xef00 */
\r
1677 if (trail <= 0x9e) {
\r
1679 if (trail <= 0x7e) {
\r
1680 value |= ((trail - 0x1f) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1682 value |= ((trail - 0x20) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1684 } else { /* trail <= 0xfc */
\r
1685 value |= ((trail - 0x7e) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1690 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
\r
1691 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
\r
1692 CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
1693 CoderResult err = CoderResult.UNDERFLOW;
\r
1694 byte[] buffer = new byte[8];
\r
1697 subchar = encoder.replacement();
\r
1700 if (myConverterData.fromU2022State.g == 1) {
\r
1701 /* JIS7: switch from G1 to G0 */
\r
1702 myConverterData.fromU2022State.g = 0;
\r
1703 buffer[i++] = UConverterConstants.SI;
\r
1705 cs = myConverterData.fromU2022State.cs[0];
\r
1707 if (cs != ASCII && cs != JISX201) {
\r
1708 /* not in ASCII or JIS X 0201: switch to ASCII */
\r
1709 myConverterData.fromU2022State.cs[0] = ASCII;
\r
1710 buffer[i++] = 0x1B;
\r
1711 buffer[i++] = 0x28;
\r
1712 buffer[i++] = 0x42;
\r
1715 buffer[i++] = subchar[0];
\r
1717 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
\r
1722 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
1723 CoderResult err = CoderResult.UNDERFLOW;
\r
1728 byte[] choices = new byte[10];
\r
1729 int targetValue = 0;
\r
1730 boolean usingFallback;
\r
1731 byte[] buffer = new byte[8];
\r
1732 boolean getTrail = false; // use for getTrail label
\r
1733 int oldSourcePos; // for proper error handling
\r
1737 /* check if the last codepoint of previous buffer was a lead surrogate */
\r
1738 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
\r
1742 while (getTrail || source.hasRemaining()) {
\r
1743 if (getTrail || target.hasRemaining()) {
\r
1744 oldSourcePos = source.position();
\r
1745 if (!getTrail) { /* skip if going to getTrail label */
\r
1746 sourceChar = source.get();
\r
1748 /* check if the char is a First surrogate */
\r
1749 if (getTrail || UTF16.isSurrogate((char)sourceChar)) {
\r
1750 if (getTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
\r
1755 /* look ahead to find the trail surrogate */
\r
1756 if (source.hasRemaining()) {
\r
1757 /* test the following code unit */
\r
1758 char trail = source.get();
\r
1759 /* go back to the previous position */
\r
1760 source.position(source.position()-1);
\r
1761 if (UTF16.isTrailSurrogate(trail)) {
\r
1763 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
\r
1764 fromUChar32 = 0x00;
\r
1765 /* convert this supplementary code point */
\r
1766 /* exit this condition tree */
\r
1768 /* this is an unmatched lead code unit (1st surrogate) */
\r
1769 /* callback(illegal) */
\r
1770 err = CoderResult.malformedForLength(1);
\r
1771 fromUChar32 = sourceChar;
\r
1775 /* no more input */
\r
1776 fromUChar32 = sourceChar;
\r
1780 /* this is an unmatched trail code unit (2nd surrogate) */
\r
1781 /* callback(illegal) */
\r
1782 err = CoderResult.malformedForLength(1);
\r
1783 fromUChar32 = sourceChar;
\r
1788 /* do not convert SO/SI/ESC */
\r
1789 if (IS_2022_CONTROL(sourceChar)) {
\r
1790 /* callback(illegal) */
\r
1791 err = CoderResult.malformedForLength(1);
\r
1792 fromUChar32 = sourceChar;
\r
1796 /* do the conversion */
\r
1798 if (choiceCount == 0) {
\r
1801 * The csm variable keeps track of which charsets are allowed
\r
1802 * and not used yet while building the choices[].
\r
1804 csm = (char)jpCharsetMasks[myConverterData.version];
\r
1807 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
\r
1808 if (myConverterData.version == 3 || myConverterData.version == 4) {
\r
1809 choices[choiceCount++] = HWKANA_7BIT;
\r
1811 /* Do not try single-byte half-width Katakana for other versions. */
\r
1812 csm &= ~CSM(HWKANA_7BIT);
\r
1814 /* try the current G0 charset */
\r
1815 choices[choiceCount++] = cs = myConverterData.fromU2022State.cs[0];
\r
1818 /* try the current G2 charset */
\r
1819 if ((cs = myConverterData.fromU2022State.cs[2]) != 0) {
\r
1820 choices[choiceCount++] = cs;
\r
1824 /* try all the other charsets */
\r
1825 for (int i = 0; i < jpCharsetPref.length; i++) {
\r
1826 cs = jpCharsetPref[i];
\r
1827 if ((CSM(cs) & csm) != 0) {
\r
1828 choices[choiceCount++] = cs;
\r
1836 * len==0: no mapping found yet
\r
1837 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
\r
1838 * len>0: found a roundtrip result, done
\r
1842 * We will turn off usingFallBack after finding a fallback,
\r
1843 * but we still get fallbacks from PUA code points as usual.
\r
1844 * Therefore, we will also need to check that we don't overwrite
\r
1845 * an early fallback with a later one.
\r
1847 usingFallback = useFallback;
\r
1849 for (int i = 0; i < choiceCount && len <= 0; i++) {
\r
1850 int[] value = new int[1];
\r
1852 byte cs0 = choices[i];
\r
1855 if (sourceChar <= 0x7f) {
\r
1856 targetValue = sourceChar;
\r
1863 if (GR96_START <= sourceChar && sourceChar <= GR96_END) {
\r
1864 targetValue = sourceChar - 0x80;
\r
1871 if (sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) {
\r
1872 if (myConverterData.version == 3) {
\r
1873 /* JIS7: use G1 (SO) */
\r
1874 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
\r
1875 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0x21)));
\r
1877 myConverterData.fromU2022State.cs[1] = cs = cs0; /* do not output an escape sequence */
\r
1879 } else if (myConverterData.version == 4) {
\r
1880 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
\r
1881 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
\r
1882 targetValue = (int)(UConverterConstants.UNSIGNED_INT_MASK & (sourceChar - (HWKANA_START - 0xa1)));
\r
1885 cs = myConverterData.fromU2022State.cs[0];
\r
1886 if (IS_JP_DBCS(cs)) {
\r
1887 /* switch from a DBCS charset to JISX201 */
\r
1890 /* else stay in the current G0 charset */
\r
1893 /* else do not use HWKANA_7BIT with other versions */
\r
1898 value[0] = jisx201FromU(sourceChar);
\r
1899 if (value[0] <= 0x7f) {
\r
1900 targetValue = value[0];
\r
1904 usingFallback = false;
\r
1908 /* G0 DBCS from JIS table */
\r
1909 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
\r
1910 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
1911 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
1912 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
1913 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len) == 2 */
\r
1914 value[0] = _2022FromSJIS(value[0]);
\r
1915 if (value[0] != 0) {
\r
1916 targetValue = value[0];
\r
1920 usingFallback = false;
\r
1922 } else if (len == 0 && usingFallback && sourceChar <= HWKANA_END && sourceChar >= HWKANA_START) {
\r
1923 targetValue = hwkana_fb[sourceChar - HWKANA_START];
\r
1927 usingFallback = false;
\r
1931 /* G0 SBCS forced to 7-bit output */
\r
1932 len2 = MBCSSingleFromUChar32(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback);
\r
1933 if (len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value[0] && value[0] <= GR96_END) {
\r
1934 targetValue = value[0] - 0x80;
\r
1938 usingFallback = false;
\r
1943 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
\r
1944 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
1945 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
1946 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0], sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
1947 if (len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
\r
1948 if (cs0 == KSC5601) {
\r
1950 * Check for valid bytes for the encoding scheme.
\r
1951 * This is necessary because the sub-converter (windows-949)
\r
1952 * has a broader encoding scheme than is valid for 2022.
\r
1954 value[0] = _2022FromGR94DBCS(value[0]);
\r
1955 if (value[0] == 0) {
\r
1959 targetValue = value[0];
\r
1963 usingFallback = false;
\r
1971 len = -len; /* fallback */
\r
1975 /* write SI if necessary (only for JIS7) */
\r
1976 if (myConverterData.fromU2022State.g == 1 && g == 0) {
\r
1977 buffer[outLen++] = UConverterConstants.SI;
\r
1978 myConverterData.fromU2022State.g = 0;
\r
1981 /* write the designation sequence if necessary */
\r
1982 if (cs != myConverterData.fromU2022State.cs[g]) {
\r
1983 for (int i = 0; i < escSeqChars[cs].length; i++) {
\r
1984 buffer[outLen++] = escSeqChars[cs][i];
\r
1986 myConverterData.fromU2022State.cs[g] = cs;
\r
1988 /* invalidate the choices[] */
\r
1992 /* write the shift sequence if necessary */
\r
1993 if (g != myConverterData.fromU2022State.g) {
\r
1995 /* case 0 handled before writing escapes */
\r
1997 buffer[outLen++] = UConverterConstants.SO;
\r
1998 myConverterData.fromU2022State.g = 1;
\r
2000 default : /* case 2 */
\r
2001 buffer[outLen++] = 0x1b;
\r
2002 buffer[outLen++] = 0x4e;
\r
2004 /* case 3: no SS3 in ISO-2022-JP-x */
\r
2008 /* write the output bytes */
\r
2010 buffer[outLen++] = (byte)targetValue;
\r
2011 } else { /* len == 2 */
\r
2012 buffer[outLen++] = (byte)(targetValue >> 8);
\r
2013 buffer[outLen++] = (byte)targetValue;
\r
2017 * if we cannot find the character after checking all codepages
\r
2018 * then this is an error.
\r
2020 err = CoderResult.unmappableForLength(source.position()-oldSourcePos);
\r
2021 fromUChar32 = sourceChar;
\r
2025 if (sourceChar == CR || sourceChar == LF) {
\r
2026 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
\r
2027 myConverterData.fromU2022State.cs[2] = 0;
\r
2031 /* output outLen>0 bytes in buffer[] */
\r
2032 if (outLen == 1) {
\r
2033 target.put(buffer[0]);
\r
2034 if (offsets != null) {
\r
2035 offsets.put(source.remaining() - 1); /* -1 known to be ASCII */
\r
2037 } else if (outLen == 2 && (target.position() + 2) <= target.limit()) {
\r
2038 target.put(buffer[0]);
\r
2039 target.put(buffer[1]);
\r
2040 if (offsets != null) {
\r
2041 int sourceIndex = source.position() - 1;
\r
2042 offsets.put(sourceIndex);
\r
2043 offsets.put(sourceIndex);
\r
2046 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, source.position()-1);
\r
2049 err = CoderResult.OVERFLOW;
\r
2055 * the end of the input stream and detection of truncated input
\r
2056 * are handled by the framework, but for ISO-2022-JP conversion
\r
2057 * we need to be in ASCII mode at the very end
\r
2061 * in SO mode or not in ASCII mode
\r
2062 * end of input and no truncated input
\r
2064 if (!err.isError() &&
\r
2065 (myConverterData.fromU2022State.g != 0 || myConverterData.fromU2022State.cs[0] != ASCII) &&
\r
2066 flush && !source.hasRemaining() && fromUChar32 == 0) {
\r
2071 if (myConverterData.fromU2022State.g != 0) {
\r
2072 buffer[outLen++] = UConverterConstants.SI;
\r
2073 myConverterData.fromU2022State.g = 0;
\r
2076 if (myConverterData.fromU2022State.cs[0] != ASCII) {
\r
2077 for (int i = 0; i < escSeqChars[ASCII].length; i++) {
\r
2078 buffer[outLen++] = escSeqChars[ASCII][i];
\r
2080 myConverterData.fromU2022State.cs[0] = ASCII;
\r
2083 /* get the source index of the last input character */
\r
2084 sourceIndex = source.position();
\r
2085 if (sourceIndex > 0) {
\r
2087 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) &&
\r
2088 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) {
\r
2095 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, outLen, target, offsets, sourceIndex);
\r
2100 /****************************ISO-2022-CN************************************/
\r
2102 * Rules for ISO-2022-CN Encoding:
\r
2103 * i) The designator sequence must appear once on a line before any instance
\r
2104 * of the character set it designates.
\r
2105 * ii) If two lines contain characters from the same character set, both lines
\r
2106 * must include the designator sequence.
\r
2107 * iii) Once the designator sequence is known, a shifting sequence has to be found
\r
2108 * to invoke the shifting
\r
2109 * iv) All lines start in ASCII and end in ASCII.
\r
2110 * v) Four shifting sequences are employed for this purpose:
\r
2111 * Sequence ASCII Eq Charsets
\r
2112 * --------- --------- --------
\r
2113 * SI <SI> US-ASCII
\r
2114 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
\r
2115 * SS2 <ESC>N CNS-11643-1992 Plane 2
\r
2116 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
\r
2118 * SOdesignator : ESC "$" ")" finalchar_for_SO
\r
2119 * SS2designator : ESC "$" "*" finalchar_for_SS2
\r
2120 * SS3designator : ESC "$" "+" finalchar_for_SS3
\r
2122 * ESC $ ) A Indicates the bytes following SO are Chinese
\r
2123 * characters as defined in GB 2312-80, until
\r
2124 * another SOdesignation appears
\r
2126 * ESC $ ) E Indicates the bytes following SO are as defined
\r
2127 * in ISO-IR-165 (for details, see section 2.1),
\r
2128 * until another SOdesignation appears
\r
2130 * ESC $ ) G Indicates the bytes following SO are as defined
\r
2131 * in CNS 11643-plane-1, until another SOdesignation appears
\r
2133 * ESC $ * H Indicates the two bytes immediately following
\r
2134 * SS2 is a Chinese character as defined in CNS
\r
2135 * 11643-plane-2, until another SS2designation
\r
2137 * (Meaning <ESC>N must precede every 2 byte sequence.)
\r
2139 * ESC $ + I Indicates the immediate two bytes following SS3
\r
2140 * is a Chinese character as defined in CNS
\r
2141 * 11643-plane-3, until another SS3designation
\r
2143 * (Meaning <ESC>O must precede every 2 byte sequence.)
\r
2145 * ESC $ + J Indicates the immediate two bytes following SS3
\r
2146 * is a Chinese character as defined in CNS
\r
2147 * 11643-plane-4, until another SS3designation
\r
2149 * (In English: <ESC>O must precede every 2 byte sequence.)
\r
2151 * ESC $ + K Indicates the immediate two bytes following SS3
\r
2152 * is a Chinese character as defined in CNS
\r
2153 * 11643-plane-5, until another SS3designation
\r
2156 * ESC $ + L Indicates the immediate two bytes following SS3
\r
2157 * is a Chinese character as defined in CNS
\r
2158 * 11643-plane-6, until another SS3designation
\r
2161 * ESC $ + M Indicates the immediate two bytes following SS3
\r
2162 * is a Chinese character as defined in CNS
\r
2163 * 11643-plane-7, until another SS3designation
\r
2166 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
\r
2167 * has its own designation information before any Chinese characters
\r
2171 /* The following are defined this way to make strings truely readonly */
\r
2172 private final static byte[] GB_2312_80_STR = { 0x1B, 0x24, 0x29, 0x41 };
\r
2173 private final static byte[] ISO_IR_165_STR = { 0x1B, 0x24, 0x29, 0x45 };
\r
2174 private final static byte[] CNS_11643_1992_Plane_1_STR = { 0x1B, 0x24, 0x29, 0x47 };
\r
2175 private final static byte[] CNS_11643_1992_Plane_2_STR = { 0x1B, 0x24, 0x2A, 0x48 };
\r
2176 private final static byte[] CNS_11643_1992_Plane_3_STR = { 0x1B, 0x24, 0x2B, 0x49 };
\r
2177 private final static byte[] CNS_11643_1992_Plane_4_STR = { 0x1B, 0x24, 0x2B, 0x4A };
\r
2178 private final static byte[] CNS_11643_1992_Plane_5_STR = { 0x1B, 0x24, 0x2B, 0x4B };
\r
2179 private final static byte[] CNS_11643_1992_Plane_6_STR = { 0x1B, 0x24, 0x2B, 0x4C };
\r
2180 private final static byte[] CNS_11643_1992_Plane_7_STR = { 0x1B, 0x24, 0x2B, 0x4D };
\r
2182 /************************ ISO2022-CN Data *****************************/
\r
2183 private final static byte[][] escSeqCharsCN = {
\r
2187 CNS_11643_1992_Plane_1_STR,
\r
2188 CNS_11643_1992_Plane_2_STR,
\r
2189 CNS_11643_1992_Plane_3_STR,
\r
2190 CNS_11643_1992_Plane_4_STR,
\r
2191 CNS_11643_1992_Plane_5_STR,
\r
2192 CNS_11643_1992_Plane_6_STR,
\r
2193 CNS_11643_1992_Plane_7_STR,
\r
2196 private class CharsetEncoderISO2022CN extends CharsetEncoderICU {
\r
/**
 * Creates an ISO-2022-CN encoder for the given charset, passing the first
 * entry of fromUSubstitutionChar to the superclass as the replacement bytes.
 *
 * @param cs the charset this encoder belongs to
 */
public CharsetEncoderISO2022CN(CharsetICU cs) {
    super(cs, fromUSubstitutionChar[0]);
}
\r
2201 protected void implReset() {
\r
2202 super.implReset();
\r
2203 myConverterData.reset();
\r
2206 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
\r
2207 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
\r
2208 CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
2209 CoderResult err = CoderResult.UNDERFLOW;
\r
2210 byte[] buffer = new byte[8];
\r
2213 subchar = encoder.replacement();
\r
2215 if (myConverterData.fromU2022State.g != 0) {
\r
2216 /* not in ASCII mode: switch to ASCII */
\r
2217 myConverterData.fromU2022State.g = 0;
\r
2218 buffer[i++] = UConverterConstants.SI;
\r
2220 buffer[i++] = subchar[0];
\r
2222 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
\r
2227 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2228 CoderResult err = CoderResult.UNDERFLOW;
\r
2230 byte[] buffer = new byte[8];
\r
2232 byte[] choices = new byte[3];
\r
2234 int targetValue = 0;
\r
2235 boolean usingFallback;
\r
2236 boolean gotoGetTrail = false;
\r
2237 int oldSourcePos; // For proper error handling
\r
2241 /* check if the last codepoint of previous buffer was a lead surrogate */
\r
2242 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
\r
2243 // goto getTrail label
\r
2244 gotoGetTrail = true;
\r
2247 while (source.hasRemaining() || gotoGetTrail) {
\r
2248 if (target.hasRemaining() || gotoGetTrail) {
\r
2249 oldSourcePos = source.position();
\r
2250 if (!gotoGetTrail) {
\r
2251 sourceChar = source.get();
\r
2253 /* check if the char is a First surrogate */
\r
2254 if (UTF16.isSurrogate((char)sourceChar) || gotoGetTrail) {
\r
2255 if (UTF16.isLeadSurrogate((char)sourceChar) || gotoGetTrail) {
\r
2257 /* reset gotoGetTrail flag*/
\r
2258 gotoGetTrail = false;
\r
2260 /* look ahead to find the trail surrogate */
\r
2261 if (source.hasRemaining()) {
\r
2262 /* test the following code unit */
\r
2263 char trail = source.get();
\r
2264 source.position(source.position()-1);
\r
2265 if (UTF16.isTrailSurrogate(trail)) {
\r
2267 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
\r
2268 fromUChar32 = 0x00;
\r
2269 /* convert this supplementary code point */
\r
2270 /* exit this condition tree */
\r
2272 /* this is an unmatched lead code unit (1st surrogate) */
\r
2273 /* callback(illegal) */
\r
2274 err = CoderResult.malformedForLength(1);
\r
2275 fromUChar32 = sourceChar;
\r
2279 /* no more input */
\r
2280 fromUChar32 = sourceChar;
\r
2284 /* this is an unmatched trail code unit (2nd surrogate) */
\r
2285 /* callback(illegal) */
\r
2286 err = CoderResult.malformedForLength(1);
\r
2287 fromUChar32 = sourceChar;
\r
2292 /* do the conversion */
\r
2293 if (sourceChar <= 0x007f) {
\r
2294 /* do not convert SO/SI/ESC */
\r
2295 if (IS_2022_CONTROL(sourceChar)) {
\r
2296 /* callback(illegal) */
\r
2297 err = CoderResult.malformedForLength(1);
\r
2298 fromUChar32 = sourceChar;
\r
2303 if (myConverterData.fromU2022State.g == 0) {
\r
2304 buffer[0] = (byte)sourceChar;
\r
2307 buffer[0] = UConverterConstants.SI;
\r
2308 buffer[1] = (byte)sourceChar;
\r
2310 myConverterData.fromU2022State.g = 0;
\r
2314 if (sourceChar == CR || sourceChar == LF) {
\r
2315 /* reset the state at the end of a line */
\r
2316 myConverterData.fromU2022State.reset();
\r
2320 /* convert U+0080..U+10ffff */
\r
2324 if (choiceCount == 0) {
\r
2325 /* try the current SO/G1 converter first */
\r
2326 choices[0] = myConverterData.fromU2022State.cs[1];
\r
2328 /* default to GB2312_1 if none is designated yet */
\r
2329 if (choices[0] == 0) {
\r
2330 choices[0] = GB2312_1;
\r
2332 if (myConverterData.version == 0) {
\r
2334 /* try other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
\r
2335 if (choices[0] == GB2312_1) {
\r
2336 choices[1] = CNS_11643_1;
\r
2338 choices[1] = GB2312_1;
\r
2343 /* ISO-2022-CN-EXT */
\r
2345 /* try one of the other converters */
\r
2346 switch (choices[0]) {
\r
2348 choices[1] = CNS_11643_1;
\r
2349 choices[2] = ISO_IR_165;
\r
2352 choices[1] = GB2312_1;
\r
2353 choices[2] = CNS_11643_1;
\r
2356 choices[1] = GB2312_1;
\r
2357 choices[2] = ISO_IR_165;
\r
2367 * len==0: no mapping found yet
\r
2368 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
\r
2369 * len>0: found a roundtrip result, done
\r
2373 * We will turn off usingFallback after finding a fallback,
\r
2374 * but we still get fallbacks from PUA code points as usual.
\r
2375 * Therefore, we will also need to check that we don't overwrite
\r
2376 * an early fallback with a later one.
\r
2378 usingFallback = useFallback;
\r
2380 for (i = 0; i < choiceCount && len <= 0; ++i) {
\r
2381 byte cs0 = choices[i];
\r
2383 int[] value = new int[1];
\r
2385 if (cs0 > CNS_11643_0) {
\r
2386 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[CNS_11643];
\r
2387 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_3;
\r
2388 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
2389 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[CNS_11643],
\r
2390 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_3);
\r
2391 if (len2 == 3 || (len2 == -3 && len == 0)) {
\r
2392 targetValue = value[0];
\r
2393 cs = (byte)(CNS_11643_0 + (value[0] >> 16) - 0x80);
\r
2398 usingFallback = false;
\r
2400 if (cs == CNS_11643_1) {
\r
2402 } else if (cs == CNS_11643_2) {
\r
2404 } else if (myConverterData.version == 1) { /* plane 3..7 */
\r
2407 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
\r
2412 /* GB2312_1 or ISO-IR-165 */
\r
2413 myConverterData.currentConverter.sharedData = myConverterData.myConverterArray[cs0];
\r
2414 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
2415 len2 = myConverterData.currentEncoder.fromUChar32(sourceChar, value, usingFallback);
\r
2416 //len2 = MBCSFromUChar32_ISO2022(myConverterData.myConverterArray[cs0],
\r
2417 // sourceChar, value, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
2418 if (len2 == 2 || (len2 == -2 && len == 0)) {
\r
2419 targetValue = value[0];
\r
2423 usingFallback = false;
\r
2430 len = 0; /* count output bytes; it must have been abs(len) == 2 */
\r
2432 /* write the designation sequence if necessary */
\r
2433 if (cs != myConverterData.fromU2022State.cs[g]) {
\r
2434 if (cs < CNS_11643) {
\r
2435 for (int n = 0; n < escSeqCharsCN[cs].length; n++) {
\r
2436 buffer[n] = escSeqCharsCN[cs][n];
\r
2439 for (int n = 0; n < escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)].length; n++) {
\r
2440 buffer[n] = escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)][n];
\r
2444 myConverterData.fromU2022State.cs[g] = cs;
\r
2446 /* changing the SO/G1 charset invalidates the choices[] */
\r
2451 /* write the shift sequence if necessary */
\r
2452 if (g != myConverterData.fromU2022State.g) {
\r
2455 buffer[len++] = UConverterConstants.SO;
\r
2457 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
\r
2458 myConverterData.fromU2022State.g = 1;
\r
2461 buffer[len++] = 0x1b;
\r
2462 buffer[len++] = 0x4e;
\r
2464 default: /* case 3 */
\r
2465 buffer[len++] = 0x1b;
\r
2466 buffer[len++] = 0x4f;
\r
2471 /* write the two output bytes */
\r
2472 buffer[len++] = (byte)(targetValue >> 8);
\r
2473 buffer[len++] = (byte)targetValue;
\r
2475 /* if we cannot find the character after checking all codepages
\r
2476 * then this is an error
\r
2478 err = CoderResult.unmappableForLength(source.position()-oldSourcePos);
\r
2479 fromUChar32 = sourceChar;
\r
2483 /* output len>0 bytes in buffer[] */
\r
2485 target.put(buffer[0]);
\r
2486 if (offsets != null) {
\r
2487 offsets.put(source.position()-1);
\r
2489 } else if (len == 2 && (target.remaining() >= 2)) {
\r
2490 target.put(buffer[0]);
\r
2491 target.put(buffer[1]);
\r
2492 if (offsets != null) {
\r
2493 int sourceIndex = source.position();
\r
2494 offsets.put(sourceIndex);
\r
2495 offsets.put(sourceIndex);
\r
2498 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, len, target, offsets, source.position()-1);
\r
2499 if (err.isError()) {
\r
2504 err = CoderResult.OVERFLOW;
\r
2507 } /* end while (source.hasRemaining() */
\r
2510 * the end of the input stream and detection of truncated input
\r
2511 * are handled by the framework, but for ISO-2022-CN conversion
\r
2512 * we need to be in ASCII mode at the very end
\r
2516 * not in ASCII mode
\r
2517 * end of input and no truncated input
\r
2519 if (!err.isError() && myConverterData.fromU2022State.g != 0 && flush && !source.hasRemaining() && fromUChar32 == 0) {
\r
2522 /* we are switching to ASCII */
\r
2523 myConverterData.fromU2022State.g = 0;
\r
2525 /* get the source index of the last input character */
\r
2526 sourceIndex = source.position();
\r
2527 if (sourceIndex > 0) {
\r
2529 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) &&
\r
2530 (sourceIndex == 0 || UTF16.isLeadSurrogate(source.get(sourceIndex-1)))) {
\r
2537 err = CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex);
\r
2543 /******************************** ISO-2022-KR *****************************/
\r
2545 * Rules for ISO-2022-KR encoding
\r
2546 * i) The KSC5601 designator sequence should appear only once in a file,
\r
2547 * at the beginning of a line before any KSC5601 characters. This usually
\r
2548 * means that it appears by itself on the first line of the file
\r
2549 * ii) There are only 2 shifting sequences SO to shift into double byte mode
\r
2550 * and SI to shift into single byte mode
\r
2552 private class CharsetEncoderISO2022KR extends CharsetEncoderICU {
\r
/**
 * Creates an ISO-2022-KR encoder for the given charset. The replacement
 * bytes handed to the superclass are the fromUSubstitutionChar entry
 * selected by the converter version.
 *
 * @param cs the charset this encoder belongs to
 */
public CharsetEncoderISO2022KR(CharsetICU cs) {
    super(cs, fromUSubstitutionChar[myConverterData.version]);
}
\r
2557 protected void implReset() {
\r
2558 super.implReset();
\r
2559 myConverterData.reset();
\r
2560 setInitialStateFromUnicodeKR(this);
\r
2563 /* This overrides the cbFromUWriteSub method in CharsetEncoderICU */
\r
2564 CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
\r
2565 CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
2566 CoderResult err = CoderResult.UNDERFLOW;
\r
2567 byte[] buffer = new byte[8];
\r
2568 int length, i = 0;
\r
2571 subchar = encoder.replacement();
\r
2572 length = subchar.length;
\r
2574 if (myConverterData.version == 0) {
\r
2575 if (length == 1) {
\r
2576 if (encoder.fromUnicodeStatus != 0) {
\r
2577 /* in DBCS mode: switch to SBCS */
\r
2578 encoder.fromUnicodeStatus = 0;
\r
2579 buffer[i++] = UConverterConstants.SI;
\r
2581 buffer[i++] = subchar[0];
\r
2582 } else { /* length == 2 */
\r
2583 if (encoder.fromUnicodeStatus == 0) {
\r
2584 /* in SBCS mode: switch to DBCS */
\r
2585 encoder.fromUnicodeStatus = 1;
\r
2586 buffer[i++] = UConverterConstants.SO;
\r
2588 buffer[i++] = subchar[0];
\r
2589 buffer[i++] = subchar[1];
\r
2591 err = CharsetEncoderICU.fromUWriteBytes(this, buffer, 0, i, target, offsets, source.position() - 1);
\r
2593 /* save the subvonverter's substitution string */
\r
2594 byte[] currentSubChars = myConverterData.currentEncoder.replacement();
\r
2596 /* set our substitution string into the subconverter */
\r
2597 myConverterData.currentEncoder.replaceWith(subchar);
\r
2598 myConverterData.currentConverter.subChar1 = fromUSubstitutionChar[0][0];
\r
2599 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
\r
2600 myConverterData.currentEncoder.fromUChar32 = encoder.fromUChar32;
\r
2601 err = myConverterData.currentEncoder.cbFromUWriteSub(myConverterData.currentEncoder, source, target, offsets);
\r
2602 encoder.fromUChar32 = myConverterData.currentEncoder.fromUChar32;
\r
2604 /* restore the subconverter's substitution string */
\r
2605 myConverterData.currentEncoder.replaceWith(currentSubChars);
\r
2607 if (err.isOverflow()) {
\r
2608 if (myConverterData.currentEncoder.errorBufferLength > 0) {
\r
2609 encoder.errorBuffer = (byte[])(myConverterData.currentEncoder.errorBuffer.clone());
\r
2611 encoder.errorBufferLength = myConverterData.currentEncoder.errorBufferLength;
\r
2612 myConverterData.currentEncoder.errorBufferLength = 0;
\r
2619 private CoderResult encodeLoopIBM(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2620 CoderResult err = CoderResult.UNDERFLOW;
\r
2622 myConverterData.currentEncoder.fromUChar32 = fromUChar32;
\r
2623 err = myConverterData.currentEncoder.cnvMBCSFromUnicodeWithOffsets(source, target, offsets, flush);
\r
2624 fromUChar32 = myConverterData.currentEncoder.fromUChar32;
\r
2626 if (err.isOverflow()) {
\r
2627 if (myConverterData.currentEncoder.errorBufferLength > 0) {
\r
2628 errorBuffer = (byte[])(myConverterData.currentEncoder.errorBuffer.clone());
\r
2630 errorBufferLength = myConverterData.currentEncoder.errorBufferLength;
\r
2631 myConverterData.currentEncoder.errorBufferLength = 0;
\r
2637 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2638 CoderResult err = CoderResult.UNDERFLOW;
\r
2639 int[] targetByteUnit = { 0x0000 };
\r
2640 int sourceChar = 0x0000;
\r
2641 boolean isTargetByteDBCS;
\r
2642 boolean oldIsTargetByteDBCS;
\r
2643 boolean usingFallback;
\r
2645 boolean gotoGetTrail = false; // for goto getTrail label call
\r
2648 * if the version is 1 then the user is requesting
\r
2649 * conversion with ibm-25546 pass the argument to
\r
2650 * MBCS converter and return
\r
2652 if (myConverterData.version == 1) {
\r
2653 return encodeLoopIBM(source, target, offsets, flush);
\r
2656 usingFallback = useFallback;
\r
2657 isTargetByteDBCS = fromUnicodeStatus == 0 ? false : true;
\r
2658 if ((sourceChar = fromUChar32) != 0 && target.hasRemaining()) {
\r
2659 gotoGetTrail = true;
\r
2662 while (source.hasRemaining() || gotoGetTrail) {
\r
2663 targetByteUnit[0] = UConverterConstants.missingCharMarker;
\r
2665 if (target.hasRemaining() || gotoGetTrail) {
\r
2666 if (!gotoGetTrail) {
\r
2667 sourceChar = source.get();
\r
2669 /* do not convert SO/SI/ESC */
\r
2670 if (IS_2022_CONTROL(sourceChar)) {
\r
2671 /* callback(illegal) */
\r
2672 err = CoderResult.malformedForLength(1);
\r
2673 fromUChar32 = sourceChar;
\r
2676 myConverterData.currentConverter.sharedData.mbcs.outputType = CharsetMBCS.MBCS_OUTPUT_2;
\r
2677 length = myConverterData.currentEncoder.fromUChar32(sourceChar, targetByteUnit, usingFallback);
\r
2678 //length = MBCSFromUChar32_ISO2022(myConverterData.currentConverter.sharedData, sourceChar, targetByteUnit, usingFallback, CharsetMBCS.MBCS_OUTPUT_2);
\r
2680 length = -length; /* fallback */
\r
2682 /* only DBCS or SBCS characters are expected */
\r
2683 /* DB characters with high bit set to 1 are expected */
\r
2684 if (length > 2 || length == 0 ||
\r
2685 (length == 1 && targetByteUnit[0] > 0x7f) ||
\r
2687 ((char)(targetByteUnit[0] - 0xa1a1) > (0xfefe - 0xa1a1) ||
\r
2688 ((targetByteUnit[0] - 0xa1) & UConverterConstants.UNSIGNED_BYTE_MASK) > (0xfe - 0xa1)))) {
\r
2689 targetByteUnit[0] = UConverterConstants.missingCharMarker;
\r
2692 if (!gotoGetTrail && targetByteUnit[0] != UConverterConstants.missingCharMarker) {
\r
2693 oldIsTargetByteDBCS = isTargetByteDBCS;
\r
2694 isTargetByteDBCS = (targetByteUnit[0] > 0x00FF);
\r
2695 /* append the shift sequence */
\r
2696 if (oldIsTargetByteDBCS != isTargetByteDBCS) {
\r
2697 if (isTargetByteDBCS) {
\r
2698 target.put((byte)UConverterConstants.SO);
\r
2700 target.put((byte)UConverterConstants.SI);
\r
2702 if (offsets != null) {
\r
2703 offsets.put(source.position()-1);
\r
2706 /* write the targetUniChar to target */
\r
2707 if (targetByteUnit[0] <= 0x00FF) {
\r
2708 if (target.hasRemaining()) {
\r
2709 target.put((byte)targetByteUnit[0]);
\r
2710 if (offsets != null) {
\r
2711 offsets.put(source.position()-1);
\r
2714 errorBuffer[errorBufferLength++] = (byte)targetByteUnit[0];
\r
2715 err = CoderResult.OVERFLOW;
\r
2718 if (target.hasRemaining()) {
\r
2719 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80)));
\r
2720 if (offsets != null) {
\r
2721 offsets.put(source.position()-1);
\r
2723 if (target.hasRemaining()) {
\r
2724 target.put((byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80)));
\r
2725 if (offsets != null) {
\r
2726 offsets.put(source.position()-1);
\r
2729 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0] - 0x80));
\r
2730 err = CoderResult.OVERFLOW;
\r
2734 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & ((targetByteUnit[0]>>8) - 0x80));
\r
2735 errorBuffer[errorBufferLength++] = (byte)(UConverterConstants.UNSIGNED_BYTE_MASK & (targetByteUnit[0]- 0x80));
\r
2736 err = CoderResult.OVERFLOW;
\r
2740 /* oops.. the code point is unassigned
\r
2741 * set the error and reason
\r
2744 /* check if the char is a First surrogate */
\r
2745 if (gotoGetTrail || UTF16.isSurrogate((char)sourceChar)) {
\r
2746 if (gotoGetTrail || UTF16.isLeadSurrogate((char)sourceChar)) {
\r
2748 // reset gotoGetTrail flag
\r
2749 gotoGetTrail = false;
\r
2751 /* look ahead to find the trail surrogate */
\r
2752 if (source.hasRemaining()) {
\r
2753 /* test the following code unit */
\r
2754 char trail = source.get();
\r
2755 source.position(source.position()-1);
\r
2756 if (UTF16.isTrailSurrogate(trail)) {
\r
2758 sourceChar = UCharacter.getCodePoint((char)sourceChar, trail);
\r
2759 err = CoderResult.unmappableForLength(2);
\r
2760 /* convert this surrogate code point */
\r
2761 /* exit this condition tree */
\r
2763 /* this is an unmatched lead code unit (1st surrogate) */
\r
2764 /* callback(illegal) */
\r
2765 err = CoderResult.malformedForLength(1);
\r
2768 /* no more input */
\r
2769 err = CoderResult.UNDERFLOW;
\r
2772 /* this is an unmatched trail code unit (2nd surrogate ) */
\r
2773 /* callback(illegal) */
\r
2774 err = CoderResult.malformedForLength(1);
\r
2777 /* callback(unassigned) for a BMP code point */
\r
2778 err = CoderResult.unmappableForLength(1);
\r
2781 fromUChar32 = sourceChar;
\r
2785 err = CoderResult.OVERFLOW;
\r
2790 * the end of the input stream and detection of truncated input
\r
2791 * are handled by the framework, but for ISO-2022-KR conversion
\r
2792 * we need to be in ASCII mode at the very end
\r
2796 * not in ASCII mode
\r
2797 * end of input and no truncated input
\r
2799 if (!err.isError() && isTargetByteDBCS && flush && !source.hasRemaining() && fromUChar32 == 0) {
\r
2802 /* we are switching to ASCII */
\r
2803 isTargetByteDBCS = false;
\r
2805 /* get the source index of the last input character */
\r
2806 sourceIndex = source.position();
\r
2807 if (sourceIndex > 0) {
\r
2809 if (UTF16.isTrailSurrogate(source.get(sourceIndex)) && UTF16.isLeadSurrogate(source.get(sourceIndex-1))) {
\r
2816 CharsetEncoderICU.fromUWriteBytes(this, SHIFT_IN_STR, 0, 1, target, offsets, sourceIndex);
\r
2818 /*save the state and return */
\r
2819 fromUnicodeStatus = isTargetByteDBCS ? 1 : 0;
\r
2825 public CharsetDecoder newDecoder() {
\r
2826 switch (variant) {
\r
2828 return new CharsetDecoderISO2022JP(this);
\r
2831 return new CharsetDecoderISO2022CN(this);
\r
2834 setInitialStateToUnicodeKR();
\r
2835 return new CharsetDecoderISO2022KR(this);
\r
2837 default: /* should not happen */
\r
2842 public CharsetEncoder newEncoder() {
\r
2843 CharsetEncoderICU cnv;
\r
2845 switch (variant) {
\r
2847 return new CharsetEncoderISO2022JP(this);
\r
2850 return new CharsetEncoderISO2022CN(this);
\r
2853 cnv = new CharsetEncoderISO2022KR(this);
\r
2854 setInitialStateFromUnicodeKR(cnv);
\r
2857 default: /* should not happen */
\r
2862 private void setInitialStateToUnicodeKR() {
\r
2863 if (myConverterData.version == 1) {
\r
2864 myConverterData.currentDecoder.toUnicodeStatus = 0; /* offset */
\r
2865 myConverterData.currentDecoder.mode = 0; /* state */
\r
2866 myConverterData.currentDecoder.toULength = 0; /* byteIndex */
\r
2869 private void setInitialStateFromUnicodeKR(CharsetEncoderICU cnv) {
\r
2870 /* ISO-2022-KR the designator sequence appears only once
\r
2871 * in a file so we append it only once
\r
2873 if (cnv.errorBufferLength == 0) {
\r
2874 cnv.errorBufferLength = 4;
\r
2875 cnv.errorBuffer[0] = 0x1b;
\r
2876 cnv.errorBuffer[1] = 0x24;
\r
2877 cnv.errorBuffer[2] = 0x29;
\r
2878 cnv.errorBuffer[3] = 0x43;
\r
2880 if (myConverterData.version == 1) {
\r
2881 ((CharsetMBCS)myConverterData.currentEncoder.charset()).subChar1 = 0x1A;
\r
2882 myConverterData.currentEncoder.fromUChar32 = 0;
\r
2883 myConverterData.currentEncoder.fromUnicodeStatus = 1; /* prevLength */
\r
2887 void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
\r
2889 /*open a set and initialize it with code points that are algorithmically round-tripped */
\r
2893 /*include JIS X 0201 which is hardcoded */
\r
2894 setFillIn.add(0xa5);
\r
2895 setFillIn.add(0x203e);
\r
2896 if((jpCharsetMasks[myConverterData.version]&CSM(ISO8859_1))!=0){
\r
2897 /*include Latin-1 some variants of JP */
\r
2898 setFillIn.add(0, 0xff);
\r
2902 /* include ASCII for JP */
\r
2903 setFillIn.add(0, 0x7f);
\r
2905 if(myConverterData.version==3 || myConverterData.version==4 ||which == ROUNDTRIP_AND_FALLBACK_SET){
\r
2907 * Do not test(jpCharsetMasks[myConverterData.version]&CSM(HWKANA_7BIT))!=0 because the bit
\r
2908 * is on for all JP versions although version 3 & 4 (JIS7 and JIS8) use half-width Katakana.
\r
2909 * This is because all ISO_2022_JP variant are lenient in that they accept (in toUnicode) half-width
\r
2910 * Katakana via ESC.
\r
2911 * However, we only emit (fromUnicode) half-width Katakana according to the
\r
2912 * definition of each variant.
\r
2914 * When including fallbacks,
\r
2915 * we need to include half-width Katakana Unicode code points for all JP variants because
\r
2916 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
\r
2918 /* include half-width Katakana for JP */
\r
2919 setFillIn.add(HWKANA_START, HWKANA_END);
\r
2923 /* Include ASCII for CN */
\r
2924 setFillIn.add(0, 0x7f);
\r
2927 /* there is only one converter for KR */
\r
2928 myConverterData.currentConverter.getUnicodeSetImpl(setFillIn, which);
\r
2934 //TODO Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until
\r
2935 for(i=0; i<UCNV_2022_MAX_CONVERTERS;i++){
\r
2937 if(myConverterData.myConverterArray[i]!=null){
\r
2938 if(variant==ISO_2022_CN && myConverterData.version==0 && i==CNS_11643){
\r
2941 * version -specific for CN:
\r
2942 * CN version 0 does not map CNS planes 3..7 although
\r
2943 * they are all available in the CNS conversion table;
\r
2944 * CN version 1 (-EXT) does map them all.
\r
2945 * The two versions create different Unicode sets.
\r
2947 filter=CharsetMBCS.UCNV_SET_FILTER_2022_CN;
\r
2948 } else if(variant==ISO_2022_JP && i == JISX208){
\r
2950 * Only add code points that map to Shift-JIS codes
\r
2951 * corresponding to JIS X 208
\r
2953 filter=CharsetMBCS.UCNV_SET_FILTER_SJIS;
\r
2954 } else if(i==KSC5601){
\r
2956 * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
\r
2957 * are broader than GR94.
\r
2959 filter=CharsetMBCS.UCNV_SET_FILTER_GR94DBCS;
\r
2961 filter=CharsetMBCS.UCNV_SET_FILTER_NONE;
\r
2964 myConverterData.currentConverter.MBCSGetFilteredUnicodeSetForUnicode(myConverterData.myConverterArray[i],setFillIn, which, filter);
\r
2968 * ISO Converter must not convert SO/SI/ESC despite what sub-converters do by themselves
\r
2969 * Remove these characters from the set.
\r
2971 setFillIn.remove(0x0e);
\r
2972 setFillIn.remove(0x0f);
\r
2973 setFillIn.remove(0x1b);
\r
2975 /* ISO 2022 converters do not convert C1 controls either */
\r
2976 setFillIn.remove(0x80, 0x9f);
\r