2 *******************************************************************************
3 * Copyright (C) 2006-2012, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 *******************************************************************************
9 package com.ibm.icu.charset;
11 import java.io.BufferedInputStream;
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.nio.Buffer;
15 import java.nio.BufferOverflowException;
16 import java.nio.ByteBuffer;
17 import java.nio.CharBuffer;
18 import java.nio.IntBuffer;
19 import java.nio.charset.CharsetDecoder;
20 import java.nio.charset.CharsetEncoder;
21 import java.nio.charset.CoderResult;
22 import java.util.Locale;
24 import com.ibm.icu.charset.UConverterSharedData.UConverterType;
25 import com.ibm.icu.impl.ICUData;
26 import com.ibm.icu.impl.ICUResourceBundle;
27 import com.ibm.icu.impl.InvalidFormatException;
28 import com.ibm.icu.lang.UCharacter;
29 import com.ibm.icu.text.UTF16;
30 import com.ibm.icu.text.UnicodeSet;
32 class CharsetMBCS extends CharsetICU {
34 private byte[] fromUSubstitution = null;
35 UConverterSharedData sharedData = null;
36 private static final int MAX_VERSION_LENGTH = 4;
38 // these variables are used in getUnicodeSet() and may be changed in future
39 // typedef enum UConverterSetFilter {
40 static final int UCNV_SET_FILTER_NONE = 1;
41 static final int UCNV_SET_FILTER_DBCS_ONLY = 2;
42 static final int UCNV_SET_FILTER_2022_CN = 3;
43 static final int UCNV_SET_FILTER_SJIS= 4 ;
44 static final int UCNV_SET_FILTER_GR94DBCS = 5;
45 static final int UCNV_SET_FILTER_HZ = 6;
46 static final int UCNV_SET_FILTER_COUNT = 7;
47 // } UConverterSetFilter;
50 * Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of
51 * this type. They are sorted by offset.
53 final static class MBCSToUFallback {
59 * This is the MBCS part of the UConverterTable union (a runtime data structure). It keeps all the per-converter
60 * data and points into the loaded mapping tables.
62 static final class UConverterMBCSTable {
66 boolean stateTableOwned;
67 int countToUFallbacks;
69 int stateTable[/* countStates */][/* 256 */];
70 int swapLFNLStateTable[/* countStates */][/* 256 */]; /* for swaplfnl */
71 char unicodeCodeUnits[/* countUnicodeResults */];
72 MBCSToUFallback toUFallbacks[/* countToUFallbacks */];
75 char fromUnicodeTable[];
76 byte fromUnicodeBytes[];
77 byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */
79 short outputType, unicodeMask;
81 /* converter name for swaplfnl */
85 UConverterSharedData baseSharedData;
87 ByteBuffer extIndexes; // create int[] view etc. as needed
89 CharBuffer mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */
90 char sbcsIndex[/* SBCS_FAST_LIMIT>>6 */]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */
91 boolean utf8Friendly; /* for utf8Friendly data */
92 char maxFastUChar; /* for utf8Friendly data */
97 UConverterMBCSTable() {
100 sbcsIndex = new char[SBCS_FAST_LIMIT>>6];
104 * UConverterMBCSTable(UConverterMBCSTable t) { countStates = t.countStates; dbcsOnlyState = t.dbcsOnlyState;
105 * stateTableOwned = t.stateTableOwned; countToUFallbacks = t.countToUFallbacks; stateTable = t.stateTable;
106 * swapLFNLStateTable = t.swapLFNLStateTable; unicodeCodeUnits = t.unicodeCodeUnits; toUFallbacks =
107 * t.toUFallbacks; fromUnicodeTable = t.fromUnicodeTable; fromUnicodeBytes = t.fromUnicodeBytes;
108 * swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes; fromUBytesLength = t.fromUBytesLength; outputType =
109 * t.outputType; unicodeMask = t.unicodeMask; swapLFNLName = t.swapLFNLName; baseSharedData = t.baseSharedData;
110 * extIndexes = t.extIndexes; }
114 /* Constants used in MBCS data header */
116 static final int MBCS_OPT_LENGTH_MASK=0x3f;
117 static final int MBCS_OPT_NO_FROM_U=0x40;
119 * If any of the following options bits are set,
120 * then the file must be rejected.
122 static final int MBCS_OPT_INCOMPATIBLE_MASK=0xffc0;
124 * Remove bits from this mask as more options are recognized
125 * by all implementations that use this constant.
127 static final int MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80;
129 /* Constants for fast and UTF-8-friendly conversion. */
131 static final int SBCS_FAST_MAX=0x0fff; /* maximum code point with UTF-8-friendly SBCS runtime code, see makeconv SBCS_UTF8_MAX */
132 static final int SBCS_FAST_LIMIT=SBCS_FAST_MAX+1; /* =0x1000 */
133 static final int MBCS_FAST_MAX=0xd7ff; /* maximum code point with UTF-8-friendly MBCS runtime code, see makeconv MBCS_UTF8_MAX */
134 static final int MBCS_FAST_LIMIT=MBCS_FAST_MAX+1; /* =0xd800 */
137 * MBCS data header. See data format description above.
139 final static class MBCSHeader {
140 byte version[/* U_MAX_VERSION_LENGTH */];
141 int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes;
143 int fromUBytesLength;
145 /* new and required in version 5 */
148 /* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */
149 int fullStage2Length; /* number of 32-bit units */
152 version = new byte[MAX_VERSION_LENGTH];
// Builds an MBCS charset: loads the converter data named by icuCanonicalName through
// loadConverter() and copies the loaded static data (min/max bytes per char, substitution
// bytes) into this object before calling initializeConverter().
// Declared to throw InvalidFormatException.
156 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath,
157 ClassLoader loader) throws InvalidFormatException {
158 super(icuCanonicalName, javaCanonicalName, aliases);
160 /* See if the icuCanonicalName contains certain option information. */
// When the swap-LF/NL option string occurs in the name, record the option and cut the name
// at the start of that option string so that only the leading part is used to load the data.
161 if (icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING) > -1) {
162 options = UConverterConstants.OPTION_SWAP_LFNL;
163 icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING));
164 super.icuCanonicalName = icuCanonicalName;
167 // now try to load the data
// nestedLoads == 1 identifies this as the outermost load; loadConverter() passes 2 when it
// loads the base table of an extension-only file.
168 sharedData = loadConverter(1, icuCanonicalName, classPath, loader);
170 maxBytesPerChar = sharedData.staticData.maxBytesPerChar;
171 minBytesPerChar = sharedData.staticData.minBytesPerChar;
// NOTE(review): this reference to the shared subChar array is replaced by a freshly
// allocated copy a few statements below, so this assignment looks redundant - confirm.
173 fromUSubstitution = sharedData.staticData.subChar;
174 subChar = sharedData.staticData.subChar;
175 subCharLen = sharedData.staticData.subCharLen;
176 subChar1 = sharedData.staticData.subChar1;
// Keep a private copy of the first subCharLen substitution bytes in fromUSubstitution.
177 fromUSubstitution = new byte[sharedData.staticData.subCharLen];
178 System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen);
180 initializeConverter(options);
183 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases)
184 throws InvalidFormatException {
185 this(icuCanonicalName, javaCanonicalName, aliases, ICUResourceBundle.ICU_BUNDLE, null);
188 private UConverterSharedData loadConverter(int nestedLoads, String myName, String classPath, ClassLoader loader)
189 throws InvalidFormatException {
190 boolean noFromU = false;
191 // Read converter data from file
192 UConverterStaticData staticData = new UConverterStaticData();
193 UConverterDataReader reader = null;
195 String resourceName = classPath + "/" + myName + "." + UConverterSharedData.DATA_TYPE;
198 if (loader != null) {
199 i = ICUData.getRequiredStream(loader, resourceName);
201 i = ICUData.getRequiredStream(resourceName);
203 BufferedInputStream b = new BufferedInputStream(i, UConverterConstants.CNV_DATA_BUFFER_SIZE);
204 reader = new UConverterDataReader(b);
205 reader.readStaticData(staticData);
206 } catch (IOException e) {
207 throw new InvalidFormatException();
208 } catch (Exception e) {
209 throw new InvalidFormatException();
212 UConverterSharedData data = null;
213 int type = staticData.conversionType;
215 if (type != UConverterSharedData.UConverterType.MBCS
216 || staticData.structSize != UConverterStaticData.SIZE_OF_UCONVERTER_STATIC_DATA) {
217 throw new InvalidFormatException();
220 data = new UConverterSharedData(1, null, false, 0);
221 data.dataReader = reader;
222 data.staticData = staticData;
223 data.sharedDataCached = false;
226 UConverterMBCSTable mbcsTable = data.mbcs;
227 MBCSHeader header = new MBCSHeader();
229 reader.readMBCSHeader(header);
230 } catch (IOException e) {
231 throw new InvalidFormatException();
235 // int[] extIndexesArray = null;
236 String baseNameString = null;
237 int[][] stateTableArray = null;
238 MBCSToUFallback[] toUFallbacksArray = null;
239 char[] unicodeCodeUnitsArray = null;
240 char[] fromUnicodeTableArray = null;
241 byte[] fromUnicodeBytesArray = null;
243 if (header.version[0] == 5 && header.version[1] >= 3 && (header.options & MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK) == 0) {
244 noFromU = ((header.options & MBCS_OPT_NO_FROM_U) != 0);
245 } else if (header.version[0] != 4) {
246 throw new InvalidFormatException();
249 mbcsTable.outputType = (byte) header.flags;
251 /* extension data, header version 4.2 and higher */
252 offset = header.flags >>> 8;
253 // if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
254 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
256 baseNameString = reader.readBaseTableName();
258 // agljport:comment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null
259 // terminator byte all already read;
260 mbcsTable.extIndexes = reader.readExtIndexes(offset
261 - (reader.bytesRead - reader.staticDataBytesRead));
263 } catch (IOException e) {
264 throw new InvalidFormatException();
268 // agljport:add this would be unnecessary if extIndexes were memory mapped
270 * if(mbcsTable.extIndexes != null) {
272 * try { //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 +
273 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 +
274 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 +
275 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] +
276 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 +
277 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 +
278 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; //int nbytes =
279 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] //byte[] extTables = dataReader.readExtTables(nbytes);
280 * //mbcsTable.extTables = ByteBuffer.wrap(extTables); } catch(IOException e) { System.err.println("Caught
281 * IOException: " + e.getMessage()); pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; return; } }
283 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
284 UConverterSharedData baseSharedData = null;
285 ByteBuffer extIndexes;
288 /* extension-only file, load the base table and set values appropriately */
289 extIndexes = mbcsTable.extIndexes;
290 if (extIndexes == null) {
291 /* extension-only file without extension */
292 throw new InvalidFormatException();
295 if (nestedLoads != 1) {
296 /* an extension table must not be loaded as a base table */
297 throw new InvalidFormatException();
300 /* load the base table */
301 baseName = baseNameString;
302 if (baseName.equals(staticData.name)) {
303 /* forbid loading this same extension-only file */
304 throw new InvalidFormatException();
307 // agljport:fix args.size=sizeof(UConverterLoadArgs);
308 baseSharedData = loadConverter(2, baseName, classPath, loader);
310 if (baseSharedData.staticData.conversionType != UConverterType.MBCS
311 || baseSharedData.mbcs.baseSharedData != null) {
312 // agljport:fix ucnv_unload(baseSharedData);
313 throw new InvalidFormatException();
316 /* copy the base table data */
317 // agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't
318 // need the deep copy so can just make sure mbcs and its local reference both refer to the same new object
319 mbcsTable = data.mbcs = baseSharedData.mbcs;
321 /* overwrite values with relevant ones for the extension converter */
322 mbcsTable.baseSharedData = baseSharedData;
323 mbcsTable.extIndexes = extIndexes;
326 * It would be possible to share the swapLFNL data with a base converter, but the generated name would have
327 * to be different, and the memory would have to be freed only once. It is easier to just create the data
328 * for the extension converter separately when it is requested.
330 mbcsTable.swapLFNLStateTable = null;
331 mbcsTable.swapLFNLFromUnicodeBytes = null;
332 mbcsTable.swapLFNLName = null;
335 * Set a special, runtime-only outputType if the extension converter is a DBCS version of a base converter
336 * that also maps single bytes.
338 if (staticData.conversionType == UConverterType.DBCS
339 || (staticData.conversionType == UConverterType.MBCS && staticData.minBytesPerChar >= 2)) {
341 if (baseSharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) {
342 /* the base converter is SI/SO-stateful */
345 /* get the dbcs state from the state table entry for SO=0x0e */
346 entry = mbcsTable.stateTable[0][0xe];
347 if (MBCS_ENTRY_IS_FINAL(entry) && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_CHANGE_ONLY
348 && MBCS_ENTRY_FINAL_STATE(entry) != 0) {
349 mbcsTable.dbcsOnlyState = (byte) MBCS_ENTRY_FINAL_STATE(entry);
351 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY;
353 } else if (baseSharedData.staticData.conversionType == UConverterType.MBCS
354 && baseSharedData.staticData.minBytesPerChar == 1
355 && baseSharedData.staticData.maxBytesPerChar == 2 && mbcsTable.countStates <= 127) {
357 /* non-stateful base converter, need to modify the state table */
358 int newStateTable[][/* 256 */];
359 int state[]; // this works because java 2-D array is array of references and we can have state =
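363 /* allocate a new state table and copy the base state table contents */
// NOTE(review): the allocation below requests (count + 1) * 1024 rows of 256 ints, but only rows 0..count are written here; confirm whether new int[count + 1][256] was intended.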
364 count = mbcsTable.countStates;
365 newStateTable = new int[(count + 1) * 1024][256];
367 for (i = 0; i < mbcsTable.stateTable.length; ++i)
368 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0,
369 mbcsTable.stateTable[i].length);
371 /* change all final single-byte entries to go to a new all-illegal state */
372 state = newStateTable[0];
373 for (i = 0; i < 256; ++i) {
374 if (MBCS_ENTRY_IS_FINAL(state[i])) {
375 state[i] = MBCS_ENTRY_TRANSITION(count, 0);
379 /* build the new all-illegal state */
380 state = newStateTable[count];
381 for (i = 0; i < 256; ++i) {
382 state[i] = MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
384 mbcsTable.stateTable = newStateTable;
385 mbcsTable.countStates = (byte) (count + 1);
386 mbcsTable.stateTableOwned = true;
388 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY;
393 * unlike below for files with base tables, do not get the unicodeMask from the sharedData; instead, use the
394 * base table's unicodeMask, which we copied in the memcpy above; this is necessary because the static data
395 * unicodeMask, especially the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
398 /* conversion file with a base table; an additional extension table is optional */
399 /* make sure that the output type is known */
400 switch (mbcsTable.outputType) {
405 case MBCS_OUTPUT_3_EUC:
406 case MBCS_OUTPUT_4_EUC:
407 case MBCS_OUTPUT_2_SISO:
411 throw new InvalidFormatException();
414 stateTableArray = new int[header.countStates][256];
415 toUFallbacksArray = new MBCSToUFallback[header.countToUFallbacks];
416 for (int i = 0; i < toUFallbacksArray.length; ++i)
417 toUFallbacksArray[i] = new MBCSToUFallback();
418 unicodeCodeUnitsArray = new char[(header.offsetFromUTable - header.offsetToUCodeUnits) / 2];
419 fromUnicodeTableArray = new char[(header.offsetFromUBytes - header.offsetFromUTable) / 2];
420 fromUnicodeBytesArray = new byte[header.fromUBytesLength];
422 reader.readMBCSTable(stateTableArray, toUFallbacksArray, unicodeCodeUnitsArray, fromUnicodeTableArray,
423 fromUnicodeBytesArray);
424 } catch (IOException e) {
425 throw new InvalidFormatException();
428 mbcsTable.countStates = (byte) header.countStates;
429 mbcsTable.countToUFallbacks = header.countToUFallbacks;
430 mbcsTable.stateTable = stateTableArray;
431 mbcsTable.toUFallbacks = toUFallbacksArray;
432 mbcsTable.unicodeCodeUnits = unicodeCodeUnitsArray;
434 mbcsTable.fromUnicodeTable = fromUnicodeTableArray;
435 mbcsTable.fromUnicodeBytes = fromUnicodeBytesArray;
436 mbcsTable.fromUBytesLength = header.fromUBytesLength;
439 * converter versions 6.1 and up contain a unicodeMask that is used here to select the most efficient
440 * function implementations
442 // agljport:fix info.size=sizeof(UDataInfo);
443 // agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
444 // agljport:fix if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
445 /* mask off possible future extensions to be safe */
446 mbcsTable.unicodeMask = (short) (staticData.unicodeMask & 3);
447 // agljport:fix } else {
448 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
449 // agljport:fix mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
453 // agljport:comment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null
454 // terminator byte all already read;
455 // int namelen = baseNameString != null? baseNameString.length() + 1: 0;
456 mbcsTable.extIndexes = reader.readExtIndexes(offset
457 - (reader.bytesRead - reader.staticDataBytesRead));
458 } catch (IOException e) {
459 throw new InvalidFormatException();
463 if (header.version[1] >= 3 && (mbcsTable.unicodeMask & UConverterConstants.HAS_SURROGATES) == 0 &&
464 (mbcsTable.countStates == 1 ? ((char)header.version[2] >= (SBCS_FAST_MAX>>8)) : ((char)header.version[2] >= (MBCS_FAST_MAX>>8)))) {
465 mbcsTable.utf8Friendly = true;
467 if (mbcsTable.countStates == 1) {
469 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
470 * Build a table with indexes to each block, to be used instead of
471 * the regular stage 1/2 table.
473 for (int i = 0; i < (SBCS_FAST_LIMIT>>6); ++i) {
474 mbcsTable.sbcsIndex[i] = mbcsTable.fromUnicodeTable[mbcsTable.fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
476 /* set maxFastUChar to SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header.version[2]>(SBCS_FAST_MAX>>8) */
477 mbcsTable.maxFastUChar = SBCS_FAST_MAX;
480 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
481 * The .cnv file is prebuilt with an additional stage table with indexes to each block.
484 mbcsTable.mbcsIndex = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();
486 mbcsTable.maxFastUChar = (char)((header.version[2]<<8) | 0xff);
489 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
491 long asciiRoundtrips = 0xffffffff;
492 for (int i = 0; i < 0x80; ++i) {
493 if (mbcsTable.stateTable[0][i] != MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
494 asciiRoundtrips&=~((long)1<<(i>>2))&UConverterConstants.UNSIGNED_INT_MASK;
497 mbcsTable.asciiRoundtrips = asciiRoundtrips&UConverterConstants.UNSIGNED_INT_MASK;
501 int stage1Length = (mbcsTable.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) != 0 ? 0x440 : 0x40;
502 int stage2Length = (header.offsetFromUBytes - header.offsetFromUTable)/4 - stage1Length/2;
503 reconstituteData(mbcsTable, stage1Length, stage2Length, header.fullStage2Length);
505 if (mbcsTable.outputType == MBCS_OUTPUT_DBCS_ONLY || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) {
507 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
508 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
510 mbcsTable.asciiRoundtrips = 0;
516 private static boolean writeStage3Roundtrip(UConverterMBCSTable mbcsTable, long value, int codePoints[]) {
525 table = mbcsTable.fromUnicodeTable;
526 bytes = mbcsTable.fromUnicodeBytes;
528 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
529 switch(mbcsTable.outputType) {
530 case MBCS_OUTPUT_3_EUC:
532 /* short sequences are stored directly */
533 /* code set 0 or 1 */
534 } else if(value<=0x8effff) {
537 } else /* first byte is 0x8f */ {
542 case MBCS_OUTPUT_4_EUC:
543 if(value<=0xffffff) {
544 /* short sequences are stored directly */
545 /* code set 0 or 1 */
546 } else if(value<=0x8effffff) {
549 } else /* first byte is 0x8f */ {
558 for(i=0; i<=0x1f; ++value, ++i) {
564 /* locate the stage 2 & 3 data */
565 stage2 = table[c>>10] + ((c>>4)&0x3f);
566 st3 = table[stage2*2]<<16|table[stage2*2 + 1];
567 st3 = (int)(char)(st3 * 16 + (c&0xf));
569 /* write the codepage bytes into stage 3 */
570 switch(mbcsTable.outputType) {
572 case MBCS_OUTPUT_4_EUC:
574 bytes[p] = (byte)(value>>16);
575 bytes[p+1] = (byte)(value>>8);
576 bytes[p+2] = (byte)value;
579 bytes[st3*4] = (byte)(value >> 24);
580 bytes[st3*4 + 1] = (byte)(value >> 16);
581 bytes[st3*4 + 2] = (byte)(value >> 8);
582 bytes[st3*4 + 3] = (byte)value;
585 /* 2 bytes per character */
586 bytes[st3*2] = (byte)(value >> 8);
587 bytes[st3*2 + 1] = (byte)value;
591 /* set the roundtrip flag */
592 temp = (1L<<(16+(c&0xf)));
593 table[stage2*2] |= (char)(temp>>16);
594 table[stage2*2 + 1] |= (char)temp;
599 private static void reconstituteData(UConverterMBCSTable mbcsTable, int stage1Length, int stage2Length, int fullStage2Length) {
600 int datalength = stage1Length*2+fullStage2Length*4+mbcsTable.fromUBytesLength;
602 byte[] stage = new byte[datalength];
604 for (int i = 0; i < stage1Length; ++i) {
605 stage[i*2] = (byte)(mbcsTable.fromUnicodeTable[i]>>8);
606 stage[i*2+1] = (byte)(mbcsTable.fromUnicodeTable[i]);
609 offset = ((fullStage2Length - stage2Length) * 4) + (stage1Length * 2);
610 for (int i = 0; i < stage2Length; ++i) {
611 stage[offset + i*4] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2]>>8);
612 stage[offset + i*4+1] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2]);
613 stage[offset + i*4+2] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2+1]>>8);
614 stage[offset + i*4+3] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2+1]);
617 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
619 /* reconstitute the initial part of stage 2 from the mbcsIndex */
621 int stageUTF8Length=(mbcsTable.maxFastUChar+1)>>6;
622 int stageUTF8Index=0;
623 int st1, st2, st3, i;
625 for (st1 = 0; stageUTF8Index < stageUTF8Length; ++st1) {
626 st2 = ((char)stage[2*st1]<<8) | (0xff & stage[2*st1+1]);
627 if (st2 != stage1Length/2) {
628 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
629 for (i = 0; i < 16; ++i) {
630 st3 = mbcsTable.mbcsIndex.get(stageUTF8Index++);
632 /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
635 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
636 * allocated together as a single 64-block for access from the mbcsIndex
638 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++;
639 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++;
640 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++;
641 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3);
643 /* no stage 3 block, skip */
648 /* no stage 2 block, skip */
654 char[] stage1 = new char[stage.length/2];
655 for (int i = 0; i < stage1.length; ++i) {
656 stage1[i] = (char)(((stage[i*2])<<8)|(stage[i*2+1] & UConverterConstants.UNSIGNED_BYTE_MASK));
658 byte[] stage2 = new byte[stage.length - ((stage1Length * 2) + (fullStage2Length * 4))];
659 System.arraycopy(stage, ((stage1Length * 2) + (fullStage2Length * 4)), stage2, 0, stage2.length);
661 mbcsTable.fromUnicodeTable = stage1;
662 mbcsTable.fromUnicodeBytes = stage2;
664 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
665 MBCSEnumToUnicode(mbcsTable);
669 * Internal function enumerating the toUnicode data of an MBCS converter.
670 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
671 * table, but could also be used for a future getUnicodeSet() option
672 * that includes reverse fallbacks (after updating this function's implementation).
673 * Currently only handles roundtrip mappings.
674 * Does not currently handle extensions.
676 private static void MBCSEnumToUnicode(UConverterMBCSTable mbcsTable) {
678 * Properties for each state, to speed up the enumeration.
679 * Ignorable actions are unassigned/illegal/state-change-only:
680 * They do not lead to mappings.
683 * 1 direct/initial state (stateful converters have multiple)
684 * 0 non-initial state with transitions or with non-ignorable result actions
685 * -1 final state with only ignorable actions
688 * The lowest byte value with non-ignorable actions is
689 * value<<5 (rounded down).
692 * The highest byte value with non-ignorable actions is
693 * (value<<5)|0x1f (rounded up).
695 byte stateProps[] = new byte[MBCS_MAX_STATE_COUNT];
698 /* recurse from state 0 and set all stateProps */
699 getStateProp(mbcsTable.stateTable, stateProps, 0);
701 for (state = 0; state < mbcsTable.countStates; ++state) {
702 if (stateProps[state] >= 0x40) {
703 /* start from each direct state */
704 enumToU(mbcsTable, stateProps, state, 0, 0);
711 private static boolean enumToU(UConverterMBCSTable mbcsTable, byte stateProps[], int state, int offset, int value) {
712 int[] codePoints = new int[32];
714 char[] unicodeCodeUnits;
718 row = mbcsTable.stateTable[state];
719 unicodeCodeUnits = mbcsTable.unicodeCodeUnits;
722 anyCodePoints = -1; /* becomes non-negative if there is a mapping */
724 b = (stateProps[state]&0x38)<<2;
725 if (b == 0 && stateProps[state] >= 0x40) {
726 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
727 codePoints[0] = UConverterConstants.U_SENTINEL;
730 limit = ((stateProps[state]&7)+1)<<5;
733 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
734 int nextState = MBCS_ENTRY_TRANSITION_STATE(entry);
735 if (stateProps[nextState] >= 0) {
736 /* recurse to a state with non-ignorable actions */
737 if (!enumToU(mbcsTable, stateProps, nextState, offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), value|b)) {
741 codePoints[b&0x1f] = UConverterConstants.U_SENTINEL;
747 * An if-else-if chain provides more reliable performance for
748 * the most common cases compared to a switch.
750 action = MBCS_ENTRY_FINAL_ACTION(entry);
751 if (action == MBCS_STATE_VALID_DIRECT_16) {
752 /* output BMP code point */
753 c = MBCS_ENTRY_FINAL_VALUE_16(entry);
754 } else if (action == MBCS_STATE_VALID_16) {
755 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
756 c = unicodeCodeUnits[finalOffset];
758 /* output BMP code point */
760 c = UConverterConstants.U_SENTINEL;
762 } else if (action == MBCS_STATE_VALID_16_PAIR) {
763 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
764 c = unicodeCodeUnits[finalOffset++];
766 /* output BMP code point below 0xd800 */
767 } else if (c <= 0xdbff) {
768 /* output roundtrip or fallback supplementary code point */
769 c = ((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
770 } else if (c == 0xe000) {
771 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
772 c = unicodeCodeUnits[finalOffset];
774 c = UConverterConstants.U_SENTINEL;
776 } else if (action == MBCS_STATE_VALID_DIRECT_20) {
777 /* output supplementary code point */
778 c = MBCS_ENTRY_FINAL_VALUE(entry)+0x10000;
780 c = UConverterConstants.U_SENTINEL;
783 codePoints[b&0x1f] = c;
786 if (((++b)&0x1f) == 0) {
787 if(anyCodePoints>=0) {
788 if(!writeStage3Roundtrip(mbcsTable, value|(b-0x20)&UConverterConstants.UNSIGNED_INT_MASK, codePoints)) {
800 * Only called if stateProps[state]==-1.
801 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
802 * MBCS_STATE_CHANGE_ONLY.
// Computes stateProps[state] from stateTable[state], recursing into target states whose
// property is still -1, and returns the stored value.
// Layout of the stored byte as written below and read by enumToU()/MBCSEnumToUnicode():
//   0x40      set for states that enumeration starts from (direct states)
//   bits 5..3 (first non-ignorable byte value) >> 5
//   bits 2..0 (last non-ignorable byte value) >> 5
// A state for which the first scan finds nothing non-ignorable is stored as -0x40.
804 private static byte getStateProp(int stateTable[][], byte stateProps[], int state) {
806 int min, max, entry, nextState;
808 row = stateTable[state];
// Mark this state as visited (no longer -1) before scanning, so recursive calls do not re-enter it.
809 stateProps[state] = 0;
811 /* find first non-ignorable state */
812 for (min = 0;;++min) {
814 nextState = MBCS_ENTRY_STATE(entry);
815 if (stateProps[nextState] == -1) {
816 getStateProp(stateTable, stateProps, nextState);
818 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
// NOTE(review): ">- 0" parses as "> (-0)", i.e. "> 0", so a target state whose property is
// exactly 0 does not satisfy this test; enumToU() uses "stateProps[nextState] >= 0" for the
// corresponding check - confirm whether ">= 0" was intended here.
819 if (stateProps[nextState] >- 0) {
822 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) {
826 stateProps[state] = -0x40; /* (byte)0xc0 */
827 return stateProps[state];
// Record the 32-byte bucket of the first non-ignorable byte in bits 5..3.
830 stateProps[state]|=(byte)((min>>5)<<3);
832 /* find last non-ignorable state */
833 for (max = 0xff; min < max; --max) {
835 nextState = MBCS_ENTRY_STATE(entry);
836 if (stateProps[nextState] == -1) {
837 getStateProp(stateTable, stateProps, nextState);
839 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
// NOTE(review): same ">- 0" form as in the first scan above - confirm whether ">= 0" was intended.
840 if (stateProps[nextState] >- 0) {
843 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) {
// Record the 32-byte bucket of the last non-ignorable byte in bits 2..0.
847 stateProps[state]|=(byte)(max>>5);
849 /* recurse further and collect direct-state information */
852 nextState = MBCS_ENTRY_STATE(entry);
853 if (stateProps[nextState] == -1) {
854 getStateProp(stateTable, stateProps, nextState);
// NOTE(review): MBCS_ENTRY_FINAL_ACTION is evaluated below on an entry that has just tested
// as a transition; confirm whether this branch was meant to test for a final entry instead.
856 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
// 0x40 is the flag MBCSEnumToUnicode() checks (stateProps[state] >= 0x40) to pick the states it enumerates from.
857 stateProps[nextState]|=0x40;
858 if (MBCS_ENTRY_FINAL_ACTION(entry) <= MBCS_STATE_FALLBACK_DIRECT_20) {
859 stateProps[state]|=0x40;
864 return stateProps[state];
867 protected void initializeConverter(int myOptions) {
868 UConverterMBCSTable mbcsTable;
869 ByteBuffer extIndexes;
871 byte maxBytesPerUChar;
873 mbcsTable = sharedData.mbcs;
874 outputType = mbcsTable.outputType;
876 if (outputType == MBCS_OUTPUT_DBCS_ONLY) {
877 /* the swaplfnl option does not apply, remove it */
878 this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL;
881 if ((myOptions & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
882 /* do this because double-checked locking is broken */
885 // agljport:todo umtx_lock(NULL);
886 isCached = mbcsTable.swapLFNLStateTable != null;
887 // agljport:todo umtx_unlock(NULL);
891 if (!EBCDICSwapLFNL()) {
892 /* this option does not apply, remove it */
893 this.options = myOptions & ~UConverterConstants.OPTION_SWAP_LFNL;
895 } catch (Exception e) {
896 /* something went wrong. */
902 String lowerCaseName = icuCanonicalName.toLowerCase(Locale.ENGLISH);
903 if (lowerCaseName.indexOf("gb18030") >= 0) {
904 /* set a flag for GB 18030 mode, which changes the callback behavior */
905 this.options |= MBCS_OPTION_GB18030;
906 } else if (lowerCaseName.indexOf("keis") >= 0) {
907 this.options |= MBCS_OPTION_KEIS;
908 } else if (lowerCaseName.indexOf("jef") >= 0) {
909 this.options |= MBCS_OPTION_JEF;
910 } else if (lowerCaseName.indexOf("jips") >= 0) {
911 this.options |= MBCS_OPTION_JIPS;
914 /* fix maxBytesPerUChar depending on outputType and options etc. */
915 if (outputType == MBCS_OUTPUT_2_SISO) {
916 /* changed from 3 to 4 in ICU4J only. #9205 */
917 maxBytesPerChar = 4; /* SO+DBCS+SI*/
920 extIndexes = mbcsTable.extIndexes;
921 if (extIndexes != null) {
922 maxBytesPerUChar = (byte) GET_MAX_BYTES_PER_UCHAR(extIndexes);
923 if (outputType == MBCS_OUTPUT_2_SISO) {
924 ++maxBytesPerUChar; /* SO + multiple DBCS */
927 if (maxBytesPerUChar > maxBytesPerChar) {
928 maxBytesPerChar = maxBytesPerUChar;
932 /* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/
934 * This code modifies a standard EBCDIC<->Unicode mapping table for
935 * OS/390 (z/OS) Unix System Services (Open Edition).
936 * The difference is in the mapping of Line Feed and New Line control codes:
937 * Standard EBCDIC maps
942 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
948 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
949 * by copying it into allocated memory and swapping the LF and NL values.
950 * It allows supporting the same EBCDIC charset in both versions without
951 * duplicating the entire installed table.
// LF/NL byte values, their roundtrip-flagged result words and the matching Unicode code points.
// EBCDICSwapLFNL() compares the loaded tables against these values and writes them back swapped.
953 /* standard EBCDIC codes */
954 private static final short EBCDIC_LF = 0x0025;
955 private static final short EBCDIC_NL = 0x0015;
957 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
958 private static final short EBCDIC_RT_LF = 0x0f25;
959 private static final short EBCDIC_RT_NL = 0x0f15;
961 /* Unicode code points */
962 private static final short U_LF = 0x000A;
963 private static final short U_NL = 0x0085;
// Builds LF<->NL swapped copies of sharedData.mbcs's to-Unicode state table and from-Unicode byte
// array, plus a converter name with OPTION_SWAP_LFNL_STRING appended, and stores the three on the
// table (swapLFNLStateTable / swapLFNLFromUnicodeBytes / swapLFNLName) when none is stored yet.
// Throws Exception("U_INVALID_FORMAT_ERROR"); see the note at the throw statement.
// NOTE(review): several local declarations and all return statements are not visible in this
// excerpt - confirm what the boolean result means to the caller.
965 private boolean EBCDICSwapLFNL() throws Exception {
966 UConverterMBCSTable mbcsTable;
972 int[][] newStateTable;
978 int sizeofFromUBytes;
980 mbcsTable = sharedData.mbcs;
982 table = mbcsTable.fromUnicodeTable;
983 bytes = mbcsTable.fromUnicodeBytes;
987 * Check that this is an EBCDIC table with SBCS portion -
988 * SBCS or EBCDIC with standard EBCDIC LF and NL mappings.
990 * If not, ignore the option Options are always ignored if they do not apply.
// to-Unicode check: state 0 must map byte 0x25 to U+000A and byte 0x15 to U+0085 as direct 16-bit finals
992 if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) &&
993 mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
994 mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) {
// from-Unicode check: single-byte tables store roundtrip-flagged 16-bit words, SI/SO tables store
// the roundtrip flag in the stage 2 entry and the plain byte value in stage 3
998 if (mbcsTable.outputType == MBCS_OUTPUT_1) {
999 if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
1000 EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) {
1003 } else /* MBCS_OUTPUT_2_SISO */ {
1004 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF);
1005 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) &&
1006 EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF))) {
1010 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL);
1011 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) &&
1012 EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL))) {
// the copy below needs the byte length of the from-Unicode array
1017 if (mbcsTable.fromUBytesLength > 0) {
1019 * We _know_ the number of bytes in the fromUnicodeBytes array
1020 * starting with header.version 4.1.
1022 sizeofFromUBytes = mbcsTable.fromUBytesLength;
1026 * There used to be code to enumerate the fromUnicode
1027 * trie and find the highest entry, but it was removed in ICU 3.2
1028 * because it was not tested and caused a low code coverage number.
// NOTE(review): presumably the else-branch of the fromUBytesLength test (the line introducing it is
// not visible here); also note that this throws a raw java.lang.Exception
1030 throw new Exception("U_INVALID_FORMAT_ERROR");
1034 * The table has an appropriate format.
1035 * Allocate and build
1036 * - a modified to-Unicode state table
1037 * - a modified from-Unicode output array
1038 * - a converter name string with the swap option appended
1040 // size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20;
1042 /* copy and modify the to-Unicode state table */
// row-by-row copy so the shared stateTable itself is never modified
1043 newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length];
1044 for (int i = 0; i < newStateTable.length; i++) {
1045 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length);
// swap: byte 0x25 now decodes to U+0085 and byte 0x15 to U+000A
1048 newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
1049 newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
1051 /* copy and modify the from-Unicode result table */
1052 newResults = new byte[sizeofFromUBytes];
1053 System.arraycopy(bytes, 0, newResults, 0, sizeofFromUBytes);
1054 /* conveniently, the table access macros work on the left side of expressions */
// swap: U+000A now encodes to the NL byte and U+0085 to the LF byte (only the copy is written)
1055 if (mbcsTable.outputType == MBCS_OUTPUT_1) {
1056 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL);
1057 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF);
1058 } else /* MBCS_OUTPUT_2_SISO */ {
1059 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF);
1060 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL);
1062 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL);
1063 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF);
1066 /* set the canonical converter name */
1067 newName = icuCanonicalName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING);
// publish the swapped tables only if no swapped state table has been stored yet
1069 if (mbcsTable.swapLFNLStateTable == null) {
1070 mbcsTable.swapLFNLStateTable = newStateTable;
1071 mbcsTable.swapLFNLFromUnicodeBytes = newResults;
1072 mbcsTable.swapLFNLName = newName;
1078 * MBCS output types for conversions from Unicode. These per-converter types determine the storage method in stage 3
1079 * of the lookup table, mostly how many bytes are stored per entry.
// Output-type codes; compared against mbcsTable.outputType, e.g. in EBCDICSwapLFNL() and SISO_STATE().
// The trailing comments give each value in hexadecimal.
1081 static final int MBCS_OUTPUT_1 = 0; /* 0 */
1082 static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */
1083 static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */
1084 static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */
1085 static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */
1086 static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */
1087 static final int MBCS_OUTPUT_2_SISO = 12; /* c */
1088 static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */
1089 static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */
1090 // static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1;
1091 static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */
1093 /* GB 18030 data ------------------------------------------------------------ */
1095 /* helper macros for linear values for GB 18030 four-byte sequences */
1096 private static long LINEAR_18030(long a, long b, long c, long d) {
1097 return ((((a & 0xff) * 10 + (b & 0xff)) * 126L + (c & 0xff)) * 10L + (d & 0xff));
1100 private static long LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30);
1102 private static long LINEAR(long x) {
1103 return LINEAR_18030(x >>> 24, (x >>> 16) & 0xff, (x >>> 8) & 0xff, x & 0xff);
1107 * Some ranges of GB 18030 where both the Unicode code points and the GB four-byte sequences are contiguous and are
1108 * handled algorithmically by the special callback functions below. The values are start & end of Unicode & GB
1111 * Note that single surrogates are not mapped by GB 18030 as of the re-released mapping tables from 2000-nov-30.
// Each row is { first code point, last code point, LINEAR(first GB sequence), LINEAR(last GB sequence) }.
// toU() scans the rows in order, matches a linear input value against columns 2..3 and derives the
// code point as column 0 plus the distance from column 2.
1113 private static final long gb18030Ranges[][] = new long[/* 14 */][/* 4 */] {
1114 { 0x10000L, 0x10FFFFL, LINEAR(0x90308130L), LINEAR(0xE3329A35L) },
1115 { 0x9FA6L, 0xD7FFL, LINEAR(0x82358F33L), LINEAR(0x8336C738L) },
1116 { 0x0452L, 0x1E3EL, LINEAR(0x8130D330L), LINEAR(0x8135F436L) },
1117 { 0x1E40L, 0x200FL, LINEAR(0x8135F438L), LINEAR(0x8136A531L) },
1118 { 0xE865L, 0xF92BL, LINEAR(0x8336D030L), LINEAR(0x84308534L) },
1119 { 0x2643L, 0x2E80L, LINEAR(0x8137A839L), LINEAR(0x8138FD38L) },
1120 { 0xFA2AL, 0xFE2FL, LINEAR(0x84309C38L), LINEAR(0x84318537L) },
1121 { 0x3CE1L, 0x4055L, LINEAR(0x8231D438L), LINEAR(0x8232AF32L) },
1122 { 0x361BL, 0x3917L, LINEAR(0x8230A633L), LINEAR(0x8230F237L) },
1123 { 0x49B8L, 0x4C76L, LINEAR(0x8234A131L), LINEAR(0x8234E733L) },
1124 { 0x4160L, 0x4336L, LINEAR(0x8232C937L), LINEAR(0x8232F837L) },
1125 { 0x478EL, 0x4946L, LINEAR(0x8233E838L), LINEAR(0x82349638L) },
1126 { 0x44D7L, 0x464BL, LINEAR(0x8233A339L), LINEAR(0x8233C931L) },
1127 { 0xFFE6L, 0xFFFFL, LINEAR(0x8431A234L), LINEAR(0x8431A439L) } };
// Converter option bits; four distinct single-bit values so they can be OR-ed into `options`
// and tested independently with a bitwise AND (see getSISOBytes() and toU()).
1129 /* bit flag for UConverter.options indicating GB 18030 special handling */
1130 private static final int MBCS_OPTION_GB18030 = 0x8000;
1132 /* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
1133 private static final int MBCS_OPTION_KEIS = 0x01000;
1134 private static final int MBCS_OPTION_JEF = 0x02000;
1135 private static final int MBCS_OPTION_JIPS = 0x04000;
// Type of the first parameter of getSISOBytes().
// NOTE(review): the enum constants are not visible in this excerpt; presumably they select the
// shift-in vs. shift-out sequence - confirm.
1137 private static enum SISO_Option {
// Shift-out (SO) / shift-in (SI) byte sequences that getSISOBytes() copies into its output array
// for the KEIS, JEF and JIPS options: two bytes each for KEIS and JIPS, a single byte for JEF.
1142 private static final byte[] KEIS_SO_CHAR = { 0x0A, 0x42 };
1143 private static final byte[] KEIS_SI_CHAR = { 0x0A, 0x41 };
1144 private static final byte JEF_SO_CHAR = 0x28;
1145 private static final byte JEF_SI_CHAR = 0x29;
1146 private static final byte[] JIPS_SO_CHAR = { 0x1A, 0x70 };
1147 private static final byte[] JIPS_SI_CHAR = { 0x1A, 0x71 };
// Writes a shift sequence into value[], chosen by the option bits set in cnvOption:
// KEIS and JIPS fill value[0..1], JEF fills value[0], and with none of those bits set the
// default UConverterConstants.SI / UConverterConstants.SO byte is stored in value[0].
// NOTE(review): the statements that dispatch on `option` and every return statement are not
// visible in this excerpt; presumably the result is the number of bytes written - confirm.
1149 private static int getSISOBytes(SISO_Option option, int cnvOption, byte[] value) {
// first group: the SI (shift-in) sequences
1154 if ((cnvOption&MBCS_OPTION_KEIS)!=0) {
1155 value[0] = KEIS_SI_CHAR[0];
1156 value[1] = KEIS_SI_CHAR[1];
1158 } else if ((cnvOption&MBCS_OPTION_JEF)!=0) {
1159 value[0] = JEF_SI_CHAR;
1161 } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) {
1162 value[0] = JIPS_SI_CHAR[0];
1163 value[1] = JIPS_SI_CHAR[1];
1166 value[0] = UConverterConstants.SI;
// second group: the SO (shift-out) sequences, same option precedence as above
1171 if ((cnvOption&MBCS_OPTION_KEIS)!=0) {
1172 value[0] = KEIS_SO_CHAR[0];
1173 value[1] = KEIS_SO_CHAR[1];
1175 } else if ((cnvOption&MBCS_OPTION_JEF)!=0) {
1176 value[0] = JEF_SO_CHAR;
1178 } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) {
1179 value[0] = JIPS_SO_CHAR[0];
1180 value[1] = JIPS_SO_CHAR[1];
1183 value[0] = UConverterConstants.SO;
1188 /* Should never happen. */
// NOTE(review): presumably the upper bound on the number of rows in a state table - confirm at the use sites.
1195 static final int MBCS_MAX_STATE_COUNT = 128;
1198 * MBCS action codes for conversions to Unicode. These values are in bits 23..20 of the state table entries.
// Consecutive values 0..8; MBCS_ENTRY_FINAL() places the code in bits 23..20 and
// MBCS_ENTRY_FINAL_ACTION() extracts it again.
1200 static final int MBCS_STATE_VALID_DIRECT_16 = 0;
1201 static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1;
1202 static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1;
1203 static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1;
1204 static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1;
1205 static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1;
1206 static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1;
1207 static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1;
1208 static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1;
1210 static int MBCS_ENTRY_SET_STATE(int entry, int state) {
1211 return (entry&0x80ffffff)|(state<<24L);
1214 static int MBCS_ENTRY_STATE(int entry) {
1215 return (((entry)>>24)&0x7f);
1218 /* Methods for state table entries */
1219 static int MBCS_ENTRY_TRANSITION(int state, int offset) {
1220 return (state << 24L) | offset;
1223 static int MBCS_ENTRY_FINAL(int state, int action, int value) {
1224 return 0x80000000 | (state << 24L) | (action << 20L) | value;
1227 static boolean MBCS_ENTRY_IS_TRANSITION(int entry) {
1228 return (entry) >= 0;
// Counterpart of MBCS_ENTRY_IS_TRANSITION(); the method body is not visible in this excerpt.
// NOTE(review): presumably true when bit 31 of the entry is set (entry < 0) - confirm.
1231 static boolean MBCS_ENTRY_IS_FINAL(int entry) {
1235 static int MBCS_ENTRY_TRANSITION_STATE(int entry) {
1236 return ((entry) >>> 24);
1239 static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) {
1240 return ((entry) & 0xffffff);
1243 static int MBCS_ENTRY_FINAL_STATE(int entry) {
1244 return ((entry) >>> 24) & 0x7f;
1247 static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) {
1248 return ((entry) < 0x80100000);
1251 static int MBCS_ENTRY_FINAL_ACTION(int entry) {
1252 return ((entry) >>> 20) & 0xf;
1255 static int MBCS_ENTRY_FINAL_VALUE(int entry) {
1256 return ((entry) & 0xfffff);
1259 static char MBCS_ENTRY_FINAL_VALUE_16(int entry) {
1260 return (char) (entry);
1263 static boolean MBCS_IS_ASCII_ROUNDTRIP(int b, long asciiRoundtrips) {
1264 return (((asciiRoundtrips) & (1<<((b)>>2)))!=0);
1268 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. It works for single-byte,
1269 * single-state codepages that only map to and from BMP code points, and it always returns fallback values.
1271 static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) {
1272 return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b & UConverterConstants.UNSIGNED_BYTE_MASK]);
1275 /* single-byte fromUnicode: get the 16-bit result word */
1276 static char MBCS_SINGLE_RESULT_FROM_U(char[] table, byte[] results, int c) {
1277 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f);
1278 int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
1279 return (char) (((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (results[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
1282 /* single-byte fromUnicode: set the 16-bit result word with newValue*/
1283 static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, byte[] results, int c, int newValue) {
1284 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f);
1285 int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
1286 results[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK);
1287 results[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK);
1290 /* multi-byte fromUnicode: get the 32-bit stage 2 entry */
1291 static int MBCS_STAGE_2_FROM_U(char[] table, int c) {
1292 int i = 2 * (table[(c) >>> 10] + ((c >>> 4) & 0x3f)); // 2x because used as index into char[] array treated as
1294 return ((table[i] & UConverterConstants.UNSIGNED_SHORT_MASK) << 16)
1295 | (table[i + 1] & UConverterConstants.UNSIGNED_SHORT_MASK);
1298 private static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) {
1299 return (((stage2Entry) & (1 << (16 + ((c) & 0xf)))) != 0);
1302 static char MBCS_VALUE_2_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
1303 int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
1304 return (char) (((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
1307 static void MBCS_VALUE_2_FROM_STAGE_2_SET(byte[] bytes, int stage2Entry, int c, int newValue) {
1308 int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
1309 bytes[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK);
1310 bytes[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK);
1313 private static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
1314 int i = 4 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
1315 return ((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 24)
1316 | ((bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
1317 | ((bytes[i + 2] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
1318 | (bytes[i + 3] & UConverterConstants.UNSIGNED_BYTE_MASK);
1321 static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
1322 return ((16 * ((char) (stage2Entry) & UConverterConstants.UNSIGNED_SHORT_MASK) + ((c) & 0xf)) * 3);
1325 // ------------UConverterExt-------------------------------------------------------
// Slot numbers (int indexes, not byte offsets) into the extension indexes[] header.
// ARRAY() converts a slot number to a byte offset with `index << 2`.
1327 static final int EXT_INDEXES_LENGTH = 0; /* 0 */
1329 static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */
1330 static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1;
1331 static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1;
1332 static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1;
1334 static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */
1335 static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1;
1336 static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1;
1337 static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1;
1338 static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1;
1340 static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */
1341 static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1;
1342 static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1;
1343 static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1;
1344 static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1;
1345 static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1;
1346 static final int EXT_FROM_U_STAGE_3B_LENGTH = EXT_FROM_U_STAGE_3B_INDEX + 1;
// slot 17; its low byte is what GET_MAX_BYTES_PER_UCHAR() extracts
1348 private static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */
1349 // private static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1;
1350 // private static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1;
1352 // private static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */
1354 // private static final int EXT_SIZE=31;
1355 // private static final int EXT_INDEXES_MIN_LENGTH=32;
// same value as the private FROM_U_MAX_DIRECT_LENGTH declared with the fromUnicode helpers
1357 static final int EXT_FROM_U_MAX_DIRECT_LENGTH = 3;
1359 /* toUnicode helpers -------------------------------------------------------- */
// Layout of a toUnicode section word as used by the TO_U_* helpers: the input byte sits in bits
// 31..24 (TO_U_BYTE_SHIFT) and the value in bits 23..0 (TO_U_VALUE_MASK). Within a value, bit 23
// is the roundtrip flag, values below TO_U_MIN_CODE_POINT are treated as partial-match indexes,
// values up to TO_U_MAX_CODE_POINT as code points offset by TO_U_MIN_CODE_POINT, and anything else
// as an index (low 18 bits) plus a length (bits above 18, biased by TO_U_LENGTH_OFFSET).
1361 private static final int TO_U_BYTE_SHIFT = 24;
1362 private static final int TO_U_VALUE_MASK = 0xffffff;
1363 private static final int TO_U_MIN_CODE_POINT = 0x1f0000;
1364 private static final int TO_U_MAX_CODE_POINT = 0x2fffff;
1365 private static final int TO_U_ROUNDTRIP_FLAG = (1 << 23);
1366 private static final int TO_U_INDEX_MASK = 0x3ffff;
1367 private static final int TO_U_LENGTH_SHIFT = 18;
1368 private static final int TO_U_LENGTH_OFFSET = 12;
1370 /* maximum number of indexed UChars */
1371 static final int MAX_UCHARS = 19;
1373 static int TO_U_GET_BYTE(int word) {
1374 return word >>> TO_U_BYTE_SHIFT;
1377 static int TO_U_GET_VALUE(int word) {
1378 return word & TO_U_VALUE_MASK;
1381 static boolean TO_U_IS_ROUNDTRIP(int value) {
1382 return (value & TO_U_ROUNDTRIP_FLAG) != 0;
1385 static boolean TO_U_IS_PARTIAL(int value) {
1386 return (value & UConverterConstants.UNSIGNED_INT_MASK) < TO_U_MIN_CODE_POINT;
// matchToU() uses the result as the index of the next toUnicode section after a partial match.
// NOTE(review): the method body is not visible in this excerpt; presumably it returns value unchanged - confirm.
1389 static int TO_U_GET_PARTIAL_INDEX(int value) {
1393 static int TO_U_MASK_ROUNDTRIP(int value) {
1394 return value & ~TO_U_ROUNDTRIP_FLAG;
1397 private static int TO_U_MAKE_WORD(byte b, int value) {
1398 return ((b & UConverterConstants.UNSIGNED_BYTE_MASK) << TO_U_BYTE_SHIFT) | value;
1401 /* use after masking off the roundtrip flag */
1402 static boolean TO_U_IS_CODE_POINT(int value) {
1403 return (value & UConverterConstants.UNSIGNED_INT_MASK) <= TO_U_MAX_CODE_POINT;
1406 static int TO_U_GET_CODE_POINT(int value) {
1407 return (int) ((value & UConverterConstants.UNSIGNED_INT_MASK) - TO_U_MIN_CODE_POINT);
1410 private static int TO_U_GET_INDEX(int value) {
1411 return value & TO_U_INDEX_MASK;
1414 private static int TO_U_GET_LENGTH(int value) {
1415 return (value >>> TO_U_LENGTH_SHIFT) - TO_U_LENGTH_OFFSET;
1418 /* fromUnicode helpers ------------------------------------------------------ */
1420 /* most trie constants are shared with ucnvmbcs.h */
// left shift that FROM_U() applies to the stage 2 value
1421 private static final int STAGE_2_LEFT_SHIFT = 2;
1423 // private static final int STAGE_3_GRANULARITY = 4;
1425 /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
// Looks up stage12 twice (by s1Index, then by that result plus bits 9..4 of c), shifts the stage 2
// value left by STAGE_2_LEFT_SHIFT and uses it to index stage3.
// NOTE(review): the tail of the return expression is not visible in this excerpt.
1426 static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) {
1427 return stage3.get(((int) stage12.get((stage12.get(s1Index) + ((c >>> 4) & 0x3f))) << STAGE_2_LEFT_SHIFT)
// Layout of a fromUnicode value as used by the FROM_U_* helpers: bit 31 is the roundtrip flag,
// the length sits above bit 24 (masked with MAX_BYTES after the shift) and the low 24 bits hold
// the data (FROM_U_DATA_MASK).
1431 private static final int FROM_U_LENGTH_SHIFT = 24;
1432 private static final int FROM_U_ROUNDTRIP_FLAG = 1 << 31;
1433 static final int FROM_U_RESERVED_MASK = 0x60000000;
1434 private static final int FROM_U_DATA_MASK = 0xffffff;
1436 /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
1437 static final int FROM_U_SUBCHAR1 = 0x80000001;
1439 /* at most 3 bytes in the lower part of the value */
1440 private static final int FROM_U_MAX_DIRECT_LENGTH = 3;
1442 /* maximum number of indexed bytes */
// also used as the 5-bit length mask in FROM_U_GET_LENGTH() and as the partial-match limit in matchToU()
1443 static final int MAX_BYTES = 0x1f;
1445 static boolean FROM_U_IS_PARTIAL(int value) {
1446 return (value >>> FROM_U_LENGTH_SHIFT) == 0;
// NOTE(review): the method body is not visible in this excerpt; presumably it returns value
// unchanged, mirroring the toUnicode counterpart - confirm.
1449 static int FROM_U_GET_PARTIAL_INDEX(int value) {
1453 static boolean FROM_U_IS_ROUNDTRIP(int value) {
1454 return (value & FROM_U_ROUNDTRIP_FLAG) != 0;
1457 private static int FROM_U_MASK_ROUNDTRIP(int value) {
1458 return value & ~FROM_U_ROUNDTRIP_FLAG;
1461 /* use after masking off the roundtrip flag */
1462 static int FROM_U_GET_LENGTH(int value) {
1463 return (value >>> FROM_U_LENGTH_SHIFT) & MAX_BYTES;
1466 /* get bytes or bytes index */
1467 static int FROM_U_GET_DATA(int value) {
1468 return value & FROM_U_DATA_MASK;
1471 /* get the pointer to an extension array from indexes[index] */
// Creates a view of one extension array: reads the int stored at byte offset (index << 2) of
// indexes, moves the buffer position there, creates an int/char/short view or a byte slice
// depending on itemType, then restores the original position.
// NOTE(review): the declaration of b, the final else and the return statement are not visible in
// this excerpt; presumably b is returned - confirm.
1472 static Buffer ARRAY(ByteBuffer indexes, int index, Class<?> itemType) {
1473 int oldpos = indexes.position();
// index is an int slot number, so the absolute byte offset passed to getInt() is index * 4
1476 indexes.position(indexes.getInt(index << 2));
1477 if (itemType == int.class)
1478 b = indexes.asIntBuffer();
1479 else if (itemType == char.class)
1480 b = indexes.asCharBuffer();
1481 else if (itemType == short.class)
1482 b = indexes.asShortBuffer();
1484 // default or (itemType == byte.class)
1485 b = indexes.slice();
1486 indexes.position(oldpos);
1490 private static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) {
1491 indexes.position(0);
1492 return indexes.getInt(EXT_COUNT_BYTES) & 0xff;
1496 * @return index of the UChar, if found; else <0
// Searches fromUSection for u. All reads are relative to the buffer's current position, which is
// not changed (only absolute get() calls are used). The visible code combines a bisection step
// on (start + limit) / 2 with a short linear scan and ends by verifying the candidate;
// -1 is returned when u is not found.
// NOTE(review): the loop headers, the initialisation of start/limit and the successful return are
// not visible in this excerpt.
1498 static int findFromU(CharBuffer fromUSection, int length, char u) {
1499 int i, start, limit;
1512 /* linear search for the last part */
1513 if (u <= fromUSection.get(fromUSection.position() + start)) {
1516 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) {
1519 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) {
1522 /* always break at start==limit-1 */
1527 i = (start + limit) / 2;
1528 if (u < fromUSection.get(fromUSection.position() + i)) {
1535 /* did we really find it? */
1536 if (start < limit && u == fromUSection.get(fromUSection.position() + start)) {
1539 return -1; /* not found */
1544 * @return lookup value for the byte, if found; else 0
// Looks up byt in a toUnicode section of `length` words that starts at toUSection's current
// position (only absolute get() calls are used, so the position is not changed).
// Returns the 24-bit value stored with the byte, or 0 when the byte is out of range or not found.
// NOTE(review): the declarations of word0/word, the loop headers and the initialisation of
// start/limit for the search are not visible in this excerpt.
1546 static int findToU(IntBuffer toUSection, int length, short byt) {
1548 int i, start, limit;
1550 /* check the input byte against the lowest and highest section bytes */
1551 // agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position
// here start/limit temporarily hold the byte values of the first and last section word
1553 start = TO_U_GET_BYTE(toUSection.get(toUSection.position()));
1554 limit = TO_U_GET_BYTE(toUSection.get(toUSection.position() + length - 1));
1555 if (byt < start || limit < byt) {
1556 return 0; /* the byte is out of range */
// as many words as byte values in [start, limit]: the section is dense and can be indexed directly
1559 if (length == ((limit - start) + 1)) {
1560 /* direct access on a linear array */
1561 return TO_U_GET_VALUE(toUSection.get(toUSection.position() + byt - start)); /* could be 0 */
1564 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
1565 word0 = TO_U_MAKE_WORD((byte) byt, 0) & UConverterConstants.UNSIGNED_INT_MASK;
1568 * Shift byte once instead of each section word and add 0xffffff. We will compare the shifted/added byte
1569 * (bbffffff) against section words which have byte values in the same bit position. If and only if byte bb <
1570 * section byte ss then bbffffff<ssvvvvvv for all v=0..f so we need not mask off the lower 24 bits of each
1573 word = word0 | TO_U_VALUE_MASK;
1586 /* linear search for the last part */
1587 if (word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
1591 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
1595 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
1598 /* always break at start==limit-1 */
1603 i = (start + limit) / 2;
1604 if (word < (toUSection.get(toUSection.position() + i) & UConverterConstants.UNSIGNED_INT_MASK)) {
1611 /* did we really find it? */
1612 if (start < limit) {
1613 word = (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK);
1614 if (byt == TO_U_GET_BYTE((int)word)) {
1615 return TO_U_GET_VALUE((int) word); /* never 0 */
1618 return 0; /* not found */
1622 * TRUE if not an SI/SO stateful converter, or if the match length fits with the current converter state
1624 static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState, int match) {
1625 return sisoState < 0 || (sisoState == 0) == (match == 1);
1629 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), or 1 for DBCS-only, or -1 if the converter is not
1632 * Note: For SI/SO stateful converters getting here, cnv->mode==0 is equivalent to firstLength==1.
1634 private static int SISO_STATE(UConverterSharedData sharedData, int mode) {
1635 return sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO ? (byte) mode
1636 : sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1 : -1;
1639 class CharsetDecoderMBCS extends CharsetDecoderICU {
// Constructor taking the owning charset.
// NOTE(review): the body is not visible in this excerpt; presumably it only forwards cs to the superclass - confirm.
1641 CharsetDecoderMBCS(CharsetICU cs) {
1645 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
1646 /* Just call cnvMBCSToUnicodeWithOffsets() to remove duplicate code. */
1647 return cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush);
1651 * continue partial match with new input never called for simple, single-character conversion
// Resumes an extension-table match: re-runs matchToU() over the bytes saved in preToUArray
// followed by the new bytes in source, then handles the three outcomes below.
// NOTE(review): the rest of the parameter list, some local declarations and the condition of the
// first branch (presumably match > 0) are not visible in this excerpt.
1653 private CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex,
1655 CoderResult cr = CoderResult.UNDERFLOW;
1657 int[] value = new int[1];
1660 match = matchToU((byte) SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source,
1661 value, isToUUseFallback(), flush);
// outcome 1: a match was found
1664 if (match >= preToULength) {
1665 /* advance src pointer for the consumed input */
1666 source.position(source.position() + match - preToULength);
1669 /* the match did not use all of preToU[] - keep the rest for replay */
1670 length = preToULength - match;
1671 System.arraycopy(preToUArray, preToUBegin + match, preToUArray, preToUBegin, length);
// a negative preToULength marks the remaining bytes for replay
1672 preToULength = (byte) -length;
1676 cr = writeToU(value[0], target, offsets, srcIndex);
// outcome 2: partial match, more input is needed
1677 } else if (match < 0) {
1678 /* save state for partial match */
1681 /* just _append_ the newly consumed input to preToU[] */
1682 sArrayIndex = source.position();
1684 for (j = preToULength; j < match; ++j) {
1685 preToUArray[j] = source.get(sArrayIndex++);
1687 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
1688 preToULength = (byte) match;
// outcome 3: no match
1689 } else /* match==0 */{
1693 * We need to split the previous input into two parts:
1695 * 1. The first codepage character is unmappable - that's how we got into trying the extension data in
1696 * the first place. We need to move it from the preToU buffer to the error buffer, set an error code,
1697 * and prepare the rest of the previous input for 2.
1699 * 2. The rest of the previous input must be converted once we come back from the callback for the first
1700 * character. At that time, we have to try again from scratch to convert these input characters. The
1701 * replay will be handled by the ucnv.c conversion code.
1704 /* move the first codepage character to the error field */
1705 System.arraycopy(preToUArray, preToUBegin, toUBytesArray, toUBytesBegin, preToUFirstLength);
1706 toULength = preToUFirstLength;
1708 /* move the rest up inside the buffer */
1709 length = preToULength - preToUFirstLength;
1711 System.arraycopy(preToUArray, preToUBegin + preToUFirstLength, preToUArray, preToUBegin, length);
1714 /* mark preToU for replay */
1715 preToULength = (byte) -length;
1717 /* set the error code for unassigned */
1718 cr = CoderResult.unmappableForLength(preToUFirstLength);
1724 * this works like matchFromU() except - the first character is in pre - no trie is used - the returned
1725 * matchLength is not offset by 2
// Matches the bytes in preArray[preArrayBegin .. +preLength) followed by the remaining bytes of
// source against the toUnicode extension table. Returns 0 immediately when there is no extension
// data. On a match the value, with its roundtrip flag masked off, is stored in pMatchValue[0].
// source may be null (it is only dereferenced after a null check or when srcLength > 0).
// NOTE(review): the loop header, several branches and the return statements for full and partial
// matches are not visible in this excerpt.
1727 private int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source,
1728 int[] pMatchValue, boolean isUseFallback, boolean flush) {
1729 ByteBuffer cx = sharedData.mbcs.extIndexes;
1730 IntBuffer toUTable, toUSection;
1732 int value, matchValue, srcLength = 0;
1733 int i, j, index, length, matchLength;
1736 if (cx == null || cx.asIntBuffer().get(EXT_TO_U_LENGTH) <= 0) {
1737 return 0; /* no extension data, no match */
1741 toUTable = (IntBuffer) ARRAY(cx, EXT_TO_U_INDEX, int.class);
// i counts bytes consumed from preArray, j counts bytes consumed from source
1745 i = j = matchLength = 0;
1746 if (source != null) {
1747 srcLength = source.remaining();
1750 if (sisoState == 0) {
1751 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
1752 if (preLength > 1) {
1753 return 0; /* no match of a DBCS sequence in SBCS mode */
1754 } else if (preLength == 1) {
1756 } else /* preLength==0 */{
1757 if (srcLength > 1) {
1764 /* we must not remember fallback matches when not using fallbacks */
1766 /* match input units until there is a full match or the input is consumed */
1768 /* go to the next section */
// slice() starts the section view at `index`; the table's own position is restored afterwards
1769 int oldpos = toUTable.position();
1770 toUSection = ((IntBuffer) toUTable.position(index)).slice();
1771 toUTable.position(oldpos);
1773 /* read first pair of the section */
// the first word carries the section length in its byte field and the value for the input so far
1774 value = toUSection.get();
1775 length = TO_U_GET_BYTE(value);
1776 value = TO_U_GET_VALUE(value);
1777 if (value != 0 && (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback))
1778 && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
1779 /* remember longest match so far */
1781 matchLength = i + j;
1784 /* match pre[] then src[] */
1785 if (i < preLength) {
1786 b = (short) (preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK);
1787 } else if (j < srcLength) {
// absolute get(): source's position is not advanced here
1788 b = (short) (source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK);
1790 /* all input consumed, partial match */
1791 if (flush || (length = (i + j)) > MAX_BYTES) {
1793 * end of the entire input stream, stop with the longest match so far or: partial match must not
1794 * be longer than UCNV_EXT_MAX_BYTES because it must fit into state buffers
1798 /* continue with more input next time */
1803 /* search for the current UChar */
1804 value = findToU(toUSection, length, b);
1806 /* no match here, stop with the longest match so far */
1809 if (TO_U_IS_PARTIAL(value)) {
1810 /* partial match, continue */
1811 index = TO_U_GET_PARTIAL_INDEX(value);
1813 if ((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
1814 /* full match, stop with result */
1816 matchLength = i + j;
1818 /* full match on fallback not taken, stop with the longest match so far */
1825 if (matchLength == 0) {
1826 /* no match at all */
1831 pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue);
1835 private CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex) {
1836 ByteBuffer cx = sharedData.mbcs.extIndexes;
1837 /* output the result */
1838 if (TO_U_IS_CODE_POINT(value)) {
1839 /* output a single code point */
1840 return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex);
1842 /* output a string - with correct data we have resultLength>0 */
1844 char[] a = new char[TO_U_GET_LENGTH(value)];
1845 CharBuffer cb = ((CharBuffer) ARRAY(cx, EXT_TO_U_UCHARS_INDEX, char.class));
1846 cb.position(TO_U_GET_INDEX(value));
1847 cb.get(a, 0, a.length);
1848 return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex);
// Writes code point c to target as one char, or as a lead/trail surrogate pair for a
// supplementary code point. When offsets is non-null, sourceIndex is recorded once, and a second
// time if more than one char was written. Whatever does not fit into target is stored in
// charErrorBufferArray and OVERFLOW is reported.
// NOTE(review): the BMP/supplementary test, the else-branches and the return statement are not
// visible in this excerpt.
1852 private CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) {
1853 CoderResult cr = CoderResult.UNDERFLOW;
// start position, used below to tell whether one or two chars were written
1854 int tBeginIndex = target.position();
1856 if (target.hasRemaining()) {
1858 target.put((char) c);
// c is reused as a "still to be written" marker: U_SENTINEL once everything is out
1859 c = UConverterConstants.U_SENTINEL;
1860 } else /* c is a supplementary code point */{
1861 target.put(UTF16.getLeadSurrogate(c));
1862 c = UTF16.getTrailSurrogate(c);
1863 if (target.hasRemaining()) {
1864 target.put((char) c);
1865 c = UConverterConstants.U_SENTINEL;
1870 if (offsets != null) {
1871 offsets.put(sourceIndex);
1872 if ((tBeginIndex + 1) < target.position()) {
1873 offsets.put(sourceIndex);
1878 /* write overflow from c */
1880 charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c);
1881 cr = CoderResult.OVERFLOW;
1888 * Input sequence: cnv->toUBytes[0..length[ @return if(U_FAILURE) return the length (toULength, byteIndex) for
1889 * the input else return 0 after output has been written to the target
// Handles an input sequence that the base table could not map: tries the extension table first,
// then, for 4-byte input in GB 18030 mode, the algorithmic gb18030Ranges. The outcome is reported
// through cr[0]; unmappableForLength(length) is set when nothing matched.
// NOTE(review): local declarations and the return statements after the GB 18030 handling are not
// visible in this excerpt.
1891 private int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex,
1892 boolean flush, CoderResult[] cr) {
1895 if (sharedData.mbcs.extIndexes != null
1896 && initialMatchToU(length, source, target, offsets, sourceIndex, flush, cr)) {
1897 return 0; /* an extension mapping handled the input */
1901 if (length == 4 && (options & MBCS_OPTION_GB18030) != 0) {
// LINEAR_18030() masks each argument with 0xff, so the sign-extended byte values are harmless
1906 linear = LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]);
1907 for (i = 0; i < gb18030Ranges.length; ++i) {
1908 range = gb18030Ranges[i];
1909 if (range[2] <= linear && linear <= range[3]) {
1910 /* found the sequence, output the Unicode code point for it */
1911 cr[0] = CoderResult.UNDERFLOW;
1913 /* add the linear difference between the input and start sequences to the start code point */
1914 linear = range[0] + (linear - range[2]);
1916 /* output this code point */
1917 cr[0] = toUWriteCodePoint((int) linear, target, offsets, sourceIndex);
1925 cr[0] = CoderResult.unmappableForLength(length);
1930 * target<targetLimit; set error code for overflow
// Tries to map the bytes already collected in toUBytesArray (firstLength of them, starting at
// toUBytesBegin), possibly followed by more bytes from source, through the extension table.
// The branches are keyed on the value returned by matchToU():
//   - full match: source is advanced past the newly consumed bytes and the result is written with writeToU(),
//   - match < 0 (partial match): all bytes seen so far are saved in preToUArray so that the match
//     can be continued with the next buffer,
//   - match == 0: no match.
// NOTE(review): the return statements are not visible in this excerpt; toU() treats a true result as
// "an extension mapping handled the input".
1932 private boolean initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets,
1933 int srcIndex, boolean flush, CoderResult[] cr) {
// one-element array used as an out parameter for the matched value
1934 int[] value = new int[1];
1938 match = matchToU((byte) SISO_STATE(sharedData, mode), toUBytesArray, toUBytesBegin, firstLength, source,
1939 value, isToUUseFallback(), flush);
1941 /* advance src pointer for the consumed input */
// match includes the firstLength bytes that were consumed earlier, so only the difference is new input
1942 source.position(source.position() + match - firstLength);
1944 /* write result to target */
1945 cr[0] = writeToU(value[0], target, offsets, srcIndex);
1947 } else if (match < 0) {
1948 /* save state for partial match */
1953 /* copy the first code point */
1954 sArray = toUBytesArray;
1955 sArrayIndex = toUBytesBegin;
1956 preToUFirstLength = (byte) firstLength;
1957 for (j = 0; j < firstLength; ++j) {
1958 preToUArray[j] = sArray[sArrayIndex++];
1961 /* now copy the newly consumed input */
// sArrayIndex is reused as an absolute index into source from here on
1962 sArrayIndex = source.position();
1964 for (; j < match; ++j) {
1965 preToUArray[j] = source.get(sArrayIndex++);
// mark the copied bytes as consumed
1967 source.position(sArrayIndex);
1968 preToULength = (byte) match;
1970 } else /* match==0 no match */{
// Stateless extension lookup for a complete input sequence: runs matchToU() over the remaining
// bytes of source and returns the code point only when the match covers exactly source.remaining()
// bytes and the matched value is a single code point. The position of source is left unchanged by
// the read-only branch (it is restored after the bulk get).
// NOTE(review): the values returned for "no input" and "no match" are not visible in this excerpt.
1975 private int simpleMatchToU(ByteBuffer source, boolean useFallback) {
1976 int[] value = new int[1];
1979 if (source.remaining() <= 0) {
1985 int sourcePosition, sourceLimit;
1986 if (source.isReadOnly()) {
1987 // source.array() would throw an exception
1988 sourcePosition = source.position(); // relative to source.array()
// copy at most EXT_MAX_BYTES bytes into a private array that matchToU() can index directly
1989 sourceArray = new byte[Math.min(source.remaining(), EXT_MAX_BYTES)];
// the relative bulk get advances the position; position(sourcePosition) puts it back
1990 source.get(sourceArray).position(sourcePosition);
1991 sourcePosition = 0; // relative to sourceArray
1992 sourceLimit = sourceArray.length;
// NOTE(review): ByteBuffer.array() also throws UnsupportedOperationException for buffers without an
// accessible backing array (e.g. direct buffers), and position()/limit() are not adjusted by
// arrayOffset() here - confirm callers only pass heap buffers with arrayOffset() == 0.
1994 sourceArray = source.array();
1995 sourcePosition = source.position();
1996 sourceLimit = source.limit();
// state -1 and flush=true are passed for this stateless, whole-input lookup
1998 match = matchToU((byte) -1, sourceArray, sourcePosition, sourceLimit, null, value, useFallback, true);
// only a match that consumed all remaining input is accepted
2000 if (match == source.remaining()) {
2001 /* write result for simple, single-character conversion */
2002 if (TO_U_IS_CODE_POINT(value[0])) {
2003 return TO_U_GET_CODE_POINT(value[0]);
2008 * return no match because - match>0 && value points to string: simple conversion cannot handle multiple
2009 * code points - match>0 && match!=length: not all input consumed, forbidden for this function - match==0:
2010 * no match found in the first place - match<0: partial match, not supported for simple conversion (and
// Main MBCS-to-Unicode conversion routine: reads bytes from source, walks the converter's state table
// and writes UTF-16 code units to target. When offsets is non-null, one source offset is recorded per
// code unit written.
// Visible structure:
//   1. a pending partial extension match (preToULength > 0) is continued via continueMatchToU(),
//   2. converters with a single state (countStates == 1) are delegated to the single-byte variants,
//   3. otherwise the conversion loop runs: an optimized inner loop for 1/2-byte input with BMP output
//      while no partial character is pending (byteIndex == 0), and a general path that dispatches on
//      the action code of each final state-table entry,
//   4. illegal sequences are trimmed per "Ticket 5691"; unassigned sequences are offered to toU(),
//   5. the converter state (toUnicodeStatus, toULength) and source.position() are written back.
// cr is a one-element array so that helper calls can update the result in place.
2016 CoderResult cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
2017 CoderResult[] cr = { CoderResult.UNDERFLOW };
2019 int sourceArrayIndex, sourceArrayIndexStart;
2020 int stateTable[][/* 256 */];
2021 char[] unicodeCodeUnits;
2028 int sourceIndex, nextSourceIndex;
2034 if (preToULength > 0) {
2036 * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change with
2037 * continuous offsets
2039 cr[0] = continueMatchToU(source, target, offsets, -1, flush);
2041 if (cr[0].isError() || preToULength < 0) {
// single-state converters use the dedicated single-byte implementations
2046 if (sharedData.mbcs.countStates == 1) {
2047 if ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
2048 cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush);
2050 cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush);
2055 /* set up the local pointers */
// sourceArrayIndex is an absolute index into source; the start value is kept for the back-out logic below
2056 sourceArrayIndex = sourceArrayIndexStart = source.position();
// use the state table with swapped LF/NL when that option is set
2058 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
2059 stateTable = sharedData.mbcs.swapLFNLStateTable;
2061 stateTable = sharedData.mbcs.stateTable;
2063 unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
2065 /* get the converter state from UConverter */
2066 offset = toUnicodeStatus;
2067 byteIndex = toULength;
2068 bytes = toUBytesArray;
2071 * if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data
2072 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
2076 state = sharedData.mbcs.dbcsOnlyState;
2079 /* sourceIndex=-1 if the current character began in the previous buffer */
2080 sourceIndex = byteIndex == 0 ? 0 : -1;
2081 nextSourceIndex = 0;
2083 /* conversion loop */
2084 while (sourceArrayIndex < source.limit()) {
2086 * This following test is to see if available input would overflow the output. It does not catch output
2087 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the
2088 * last source byte. Therefore, those situations also test for overflows and will then break the loop,
2091 if (!target.hasRemaining()) {
2092 /* target is full */
2093 cr[0] = CoderResult.OVERFLOW;
2097 if (byteIndex == 0) {
2098 /* optimized loop for 1/2-byte input and BMP output */
2099 // agljport:todo see ucnvmbcs.c for deleted block
// absolute get: the byte is inspected without changing source.position()
2101 entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK];
2102 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
2103 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry);
2104 offset = MBCS_ENTRY_TRANSITION_OFFSET(entry);
// fast path for a second byte that completes a valid 16-bit mapping; values of 0xfffe and above
// are left to the general path
2106 if (sourceArrayIndex < source.limit()
2107 && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK])
2108 && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16
2109 && (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
2112 if (offsets != null) {
2113 offsets.put(sourceIndex);
// two source bytes produced this code unit
2114 sourceIndex = (nextSourceIndex += 2);
2116 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2119 /* set the state and leave the optimized loop */
// keep the lead byte so the general path can continue the sequence
2121 bytes[0] = source.get(sourceArrayIndex - 1);
2126 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2127 /* output BMP code point */
2129 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2130 if (offsets != null) {
2131 offsets.put(sourceIndex);
2132 sourceIndex = ++nextSourceIndex;
2134 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2136 /* leave the optimized loop */
2140 } while (sourceArrayIndex < source.limit() && target.hasRemaining());
2142 * these tests and break statements could be put inside the loop if C had "break outerLoop" like
2145 if (sourceArrayIndex >= source.limit()) {
2148 if (!target.hasRemaining()) {
2149 /* target is full */
2150 cr[0] = CoderResult.OVERFLOW;
// consume the byte and remember it as part of the current sequence
2155 bytes[byteIndex++] = source.get(sourceArrayIndex++);
2156 } else /* byteIndex>0 */{
2158 entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++))
2159 & UConverterConstants.UNSIGNED_BYTE_MASK];
2162 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
2163 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry);
// offsets accumulate across the transitions of one multi-byte sequence
2164 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
2168 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
2171 /* set the next state early so that we can reuse the entry variable */
2172 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
2175 * An if-else-if chain provides more reliable performance for the most common cases compared to a
2178 action = (byte)MBCS_ENTRY_FINAL_ACTION(entry);
2179 if (action == MBCS_STATE_VALID_16) {
2180 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
2181 c = unicodeCodeUnits[offset];
2183 /* output BMP code point */
2185 if (offsets != null) {
2186 offsets.put(sourceIndex);
2189 } else if (c == 0xfffe) {
// 0xfffe in unicodeCodeUnits: look for a fallback mapping stored outside the table
2190 if (isFallbackUsed() && (entry = getFallback(sharedData.mbcs, offset)) != 0xfffe) {
2191 /* output fallback BMP code point */
2192 target.put((char)entry);
2193 if (offsets != null) {
2194 offsets.put(sourceIndex);
2199 /* callback(illegal) */
2200 cr[0] = CoderResult.malformedForLength(byteIndex);
2202 } else if (action == MBCS_STATE_VALID_DIRECT_16) {
2203 /* output BMP code point */
2204 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2205 if (offsets != null) {
2206 offsets.put(sourceIndex);
2209 } else if (action == MBCS_STATE_VALID_16_PAIR) {
2210 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
// the first unit selects how the entry is interpreted; offset then points at the second unit
2211 c = unicodeCodeUnits[offset++];
2213 /* output BMP code point below 0xd800 */
2215 if (offsets != null) {
2216 offsets.put(sourceIndex);
2219 } else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) {
2220 /* output roundtrip or fallback surrogate pair */
// the mask clears bit 0x0400, turning a 0xdc00..0xdfff marker into the corresponding lead surrogate
2221 target.put((char)(c & 0xdbff));
2222 if (offsets != null) {
2223 offsets.put(sourceIndex);
2226 if (target.hasRemaining()) {
2227 target.put(unicodeCodeUnits[offset]);
2228 if (offsets != null) {
2229 offsets.put(sourceIndex);
2232 /* target overflow */
// keep the second unit of the pair in the overflow buffer
2233 charErrorBufferArray[0] = unicodeCodeUnits[offset];
2234 charErrorBufferLength = 1;
2235 cr[0] = CoderResult.OVERFLOW;
2240 } else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
2241 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2242 target.put(unicodeCodeUnits[offset]);
2243 if (offsets != null) {
2244 offsets.put(sourceIndex);
2247 } else if (c == 0xffff) {
2248 /* callback(illegal) */
2249 cr[0] = CoderResult.malformedForLength(byteIndex);
2251 } else if (action == MBCS_STATE_VALID_DIRECT_20
2252 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {
2253 entry = MBCS_ENTRY_FINAL_VALUE(entry);
2254 /* output surrogate pair */
// upper bits of the value form the lead surrogate, the low 10 bits the trail surrogate
2255 target.put((char)(0xd800 | (char)(entry >> 10)));
2256 if (offsets != null) {
2257 offsets.put(sourceIndex);
2260 c = (char)(0xdc00 | (char)(entry & 0x3ff));
2261 if (target.hasRemaining()) {
2263 if (offsets != null) {
2264 offsets.put(sourceIndex);
2267 /* target overflow */
2268 charErrorBufferArray[0] = c;
2269 charErrorBufferLength = 1;
2270 cr[0] = CoderResult.OVERFLOW;
2275 } else if (action == MBCS_STATE_CHANGE_ONLY) {
2277 * This serves as a state change without any output. It is useful for reading simple stateful
2278 * encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used
2279 * for more sophisticated state transitions.
2281 if (sharedData.mbcs.dbcsOnlyState == 0) {
2284 /* SI/SO are illegal for DBCS-only conversion */
2285 state = (byte)(mode); /* restore the previous state */
2287 /* callback(illegal) */
2288 cr[0] = CoderResult.malformedForLength(byteIndex);
2290 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
2291 if (isFallbackUsed()) {
2292 /* output BMP code point */
2293 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2294 if (offsets != null) {
2295 offsets.put(sourceIndex);
2299 } else if (action == MBCS_STATE_UNASSIGNED) {
2300 /* just fall through */
2301 } else if (action == MBCS_STATE_ILLEGAL) {
2302 /* callback(illegal) */
2303 cr[0] = CoderResult.malformedForLength(byteIndex);
2305 /* reserved, must never occur */
2309 /* end of action codes: prepare for a new character */
// byteIndex == 0 means the sequence was fully handled; otherwise it is illegal or unassigned
2312 if (byteIndex == 0) {
2313 sourceIndex = nextSourceIndex;
2314 } else if (cr[0].isError()) {
2315 /* callback(illegal) */
2316 if (byteIndex > 1) {
2318 * Ticket 5691: consistent illegal sequences:
2319 * - We include at least the first byte in the illegal sequence.
2320 * - If any of the non-initial bytes could be the start of a character,
2321 * we stop the illegal sequence before the first one of those.
2323 boolean isDBCSOnly = (sharedData.mbcs.dbcsOnlyState != 0);
// empty loop body: the loop only advances i to the first byte that could start a character
2325 for (i = 1; i < byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, (short)(bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK)); i++) {}
2326 if (i < byteIndex) {
2327 byte backOutDistance = (byte)(byteIndex - i);
2328 int bytesFromThisBuffer = sourceArrayIndex - sourceArrayIndexStart;
2329 byteIndex = i; /* length of reported illegal byte sequence */
2330 if (backOutDistance <= bytesFromThisBuffer) {
// all bytes to back out came from the current buffer: simply rewind the read index
2331 sourceArrayIndex -= backOutDistance;
2333 /* Back out bytes from the previous buffer: Need to replay them. */
2334 this.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
2335 /* preToULength is negative! */
2336 for (int n = 0; n < -this.preToULength; n++) {
2337 this.preToUArray[n] = bytes[i+n];
2339 sourceArrayIndex = sourceArrayIndexStart;
2344 } else /* unassigned sequences indicated with byteIndex>0 */{
2345 /* try an extension mapping */
2346 int sourceBeginIndex = sourceArrayIndex;
// toU() works on the buffer position, so synchronize it with the local index before and after the call
2347 source.position(sourceArrayIndex);
2348 byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr);
2349 sourceArrayIndex = source.position();
// account for any additional bytes the extension match consumed
2350 sourceIndex = nextSourceIndex += (sourceArrayIndex - sourceBeginIndex);
2352 if (cr[0].isError() || cr[0].isOverflow()) {
2353 /* not mappable or buffer overflow */
2359 /* set the converter state back into UConverter */
2360 toUnicodeStatus = offset;
2362 toULength = byteIndex;
2364 /* write back the updated pointers */
2365 source.position(sourceArrayIndex);
2370 * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages that
2371 * only map to and from the BMP. In addition to single-byte optimizations, the offset calculations become much
// Single-byte, single-state conversion that only produces BMP code units. Because every input byte
// yields at most one output char, the loop is bounded by targetCapacity = min(source.remaining(),
// target.remaining()), and offsets are written in batches (from lastSource up to the current index)
// instead of once per character. Entries that are not VALID_DIRECT_16 are dispatched on their action
// code; unassigned bytes are passed to toU() for an extension lookup, after which the capacity is
// recomputed.
2374 private CoderResult cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets,
2376 CoderResult[] cr = { CoderResult.UNDERFLOW };
2378 int sourceArrayIndex, lastSource;
2379 int targetCapacity, length;
2387 /* set up the local pointers */
2388 sourceArrayIndex = source.position();
2389 targetCapacity = target.remaining();
2391 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
2392 stateTable = sharedData.mbcs.swapLFNLStateTable;
2394 stateTable = sharedData.mbcs.stateTable;
2397 /* sourceIndex=-1 if the current character began in the previous buffer */
// lastSource marks the first source byte whose offset has not been written yet
2399 lastSource = sourceArrayIndex;
2402 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
2403 * sourceLength and targetCapacity
2405 length = source.remaining();
2406 if (length < targetCapacity) {
2407 targetCapacity = length;
2410 /* conversion loop */
2411 while (targetCapacity > 0 && sourceArrayIndex < source.limit()) {
// only row 0 is consulted here
2412 entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK];
2413 /* MBCS_ENTRY_IS_FINAL(entry) */
2415 /* test the most common case first */
2416 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2417 /* output BMP code point */
2418 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2424 * An if-else-if chain provides more reliable performance for the most common cases compared to a
2427 action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
2428 if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
2429 if (isFallbackUsed()) {
2430 /* output BMP code point */
2431 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2435 } else if (action == MBCS_STATE_UNASSIGNED) {
2436 /* just fall through */
2437 } else if (action == MBCS_STATE_ILLEGAL) {
2438 /* callback(illegal) */
2439 cr[0] = CoderResult.malformedForLength(sourceArrayIndex - lastSource);
2441 /* reserved, must never occur */
2445 /* set offsets since the start or the last extension */
2446 if (offsets != null) {
2447 int count = sourceArrayIndex - lastSource;
2449 /* predecrement: do not set the offset for the callback-causing character */
2450 while (--count > 0) {
2451 offsets.put(sourceIndex++);
2453 /* offset and sourceIndex are now set for the current character */
2456 if (cr[0].isError()) {
2457 /* callback(illegal) */
2459 } else /* unassigned sequences indicated with byteIndex>0 */{
2460 /* try an extension mapping */
2461 lastSource = sourceArrayIndex;
// hand the single unassigned byte to toU() through toUBytesArray
2462 toUBytesArray[0] = source.get(sourceArrayIndex - 1);
// toU() works on the buffer position, so synchronize it with the local index before and after the call
2463 source.position(sourceArrayIndex);
2464 toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr);
2465 sourceArrayIndex = source.position();
// one byte for the current character plus whatever the extension match consumed in addition
2466 sourceIndex += 1 + (sourceArrayIndex - lastSource);
2468 if (cr[0].isError()) {
2469 /* not mappable or buffer overflow */
2473 /* recalculate the targetCapacity after an extension mapping */
2474 targetCapacity = target.remaining();
2475 length = source.remaining();
2476 if (length < targetCapacity) {
2477 targetCapacity = length;
// input is left but the target is exhausted: report overflow unless an error is already pending
2482 if (!cr[0].isError() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
2483 /* target is full */
2484 cr[0] = CoderResult.OVERFLOW;
2487 /* set offsets since the start or the last callback */
2488 if (offsets != null) {
2489 int count = sourceArrayIndex - lastSource;
2491 offsets.put(sourceIndex++);
2496 /* write back the updated pointers */
2497 source.position(sourceArrayIndex);
2502 /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
// Single-byte, single-state conversion that may also produce supplementary code points. Each input
// byte is looked up in row 0 of the state table: VALID_DIRECT_16 entries are written directly,
// DIRECT_20 entries are written as a surrogate pair (the trail surrogate goes to charErrorBufferArray
// when the target is full), and unassigned bytes are passed to toU() for an extension lookup.
// When offsets is non-null, sourceIndex is recorded for every code unit written.
2503 private CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets,
2505 CoderResult[] cr = { CoderResult.UNDERFLOW };
2507 int sourceArrayIndex;
2516 /* set up the local pointers */
2517 sourceArrayIndex = source.position();
2519 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
2520 stateTable = sharedData.mbcs.swapLFNLStateTable;
2522 stateTable = sharedData.mbcs.stateTable;
2525 /* sourceIndex=-1 if the current character began in the previous buffer */
2528 /* conversion loop */
2529 while (sourceArrayIndex < source.limit()) {
2531 * This following test is to see if available input would overflow the output. It does not catch output
2532 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the
2533 * last source byte. Therefore, those situations also test for overflows and will then break the loop,
2536 if (!target.hasRemaining()) {
2537 /* target is full */
2538 cr[0] = CoderResult.OVERFLOW;
// only row 0 is consulted here
2542 entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK];
2543 /* MBCS_ENTRY_IS_FINAL(entry) */
2545 /* test the most common case first */
2546 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
2547 /* output BMP code point */
2548 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2549 if (offsets != null) {
2550 offsets.put(sourceIndex);
2553 /* normal end of action codes: prepare for a new character */
2559 * An if-else-if chain provides more reliable performance for the most common cases compared to a
2562 action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
2563 if (action == MBCS_STATE_VALID_DIRECT_20
2564 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {
2566 entry = MBCS_ENTRY_FINAL_VALUE(entry);
2567 /* output surrogate pair */
// upper bits of the value form the lead surrogate, the low 10 bits the trail surrogate
2568 target.put((char) (0xd800 | (char) (entry >>> 10)));
2569 if (offsets != null) {
2570 offsets.put(sourceIndex);
2572 c = (char) (0xdc00 | (char) (entry & 0x3ff));
2573 if (target.hasRemaining()) {
2575 if (offsets != null) {
2576 offsets.put(sourceIndex);
2579 /* target overflow */
// keep the trail surrogate in the overflow buffer
2580 charErrorBufferArray[0] = c;
2581 charErrorBufferLength = 1;
2582 cr[0] = CoderResult.OVERFLOW;
2588 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
2589 if (isFallbackUsed()) {
2590 /* output BMP code point */
2591 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
2592 if (offsets != null) {
2593 offsets.put(sourceIndex);
2599 } else if (action == MBCS_STATE_UNASSIGNED) {
2600 /* just fall through */
2601 } else if (action == MBCS_STATE_ILLEGAL) {
2602 /* callback(illegal) */
// single-byte converter: an illegal sequence is always exactly one byte long
2603 cr[0] = CoderResult.malformedForLength(1);
2605 /* reserved, must never occur */
2610 if (cr[0].isError()) {
2611 /* callback(illegal) */
2613 } else /* unassigned sequences indicated with byteIndex>0 */{
2614 /* try an extension mapping */
2615 int sourceBeginIndex = sourceArrayIndex;
// hand the single unassigned byte to toU() through toUBytesArray
2616 toUBytesArray[0] = source.get(sourceArrayIndex - 1);
// toU() works on the buffer position, so synchronize it with the local index before and after the call
2617 source.position(sourceArrayIndex);
2618 toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr);
2619 sourceArrayIndex = source.position();
// one byte for the current character plus whatever the extension match consumed in addition
2620 sourceIndex += 1 + (sourceArrayIndex - sourceBeginIndex);
2622 if (cr[0].isError()) {
2623 /* not mappable or buffer overflow */
2629 /* write back the updated pointers */
2630 source.position(sourceArrayIndex);
// Looks up the to-Unicode fallback for the given unicodeCodeUnits offset by binary search over
// mbcsTable.toUFallbacks (countToUFallbacks entries, compared by their offset field) and returns the
// entry's codePoint when the offset is found.
// NOTE(review): the not-found return value is not visible in this excerpt; the caller in
// cnvMBCSToUnicodeWithOffsets compares the result against 0xfffe, so that is presumably it - confirm.
// NOTE(review): toUFallbacks[start] is read after the loop; confirm the (not visible) preceding code
// guards against an empty fallback table.
2635 private int getFallback(UConverterMBCSTable mbcsTable, int offset) {
2636 MBCSToUFallback[] toUFallbacks;
2637 int i, start, limit;
2639 limit = mbcsTable.countToUFallbacks;
2641 /* do a binary search for the fallback mapping */
2642 toUFallbacks = mbcsTable.toUFallbacks;
// narrow the range until only the element at start is left as a candidate
2644 while (start < limit - 1) {
// unsigned shift keeps the midpoint correct even if start + limit overflowed
2645 i = (start + limit) >>> 1;
2646 if (offset < toUFallbacks[i].offset) {
2653 /* did we really find it? */
2654 if (offset == toUFallbacks[start].offset) {
2655 return toUFallbacks[start].codePoint;
2663 * This is a simple version of _MBCSGetNextUChar() that is used by other converter implementations. It only
2664 * returns an "assigned" result if it consumes the entire input. It does not use state from the converter, nor
2665 * error codes. It does not handle the EBCDIC swaplfnl option (set in UConverter). It handles conversion
2666 * extensions but not GB 18030.
2668 * @return U+fffe unassigned U+ffff illegal otherwise the Unicode code point
// Stateless conversion of exactly one character: walks the state table over the bytes between
// source.position() and source.limit() using absolute gets (the buffer position is not moved by the
// visible code), and falls back to simpleMatchToU() when the converter has extension data.
// NOTE(review): the VALID_16 and VALID_16_PAIR branches call isToUUseFallback() without the
// useFallback argument, while the FALLBACK_DIRECT_16/20 branches call isToUUseFallback(useFallback) -
// confirm that this difference is intentional.
2670 int simpleGetNextUChar(ByteBuffer source, boolean useFallback) {
2674 // * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
2675 // * TODO In future releases, verify that this function is never called for SBCS
2676 // * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
2677 // * Removal improves code coverage.
2679 // /* use optimized function if possible */
2680 // if(sharedData->mbcs.countStates==1) {
2682 // return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
2684 // return 0xffff; /* illegal: more than a single byte for an SBCS converter */
2689 /* set up the local pointers */
// always the regular state table: the swaplfnl option is not handled by this function
2690 int[][] stateTable = sharedData.mbcs.stateTable;
2691 char[] unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
2693 /* converter state */
2695 int state = sharedData.mbcs.dbcsOnlyState;
// i is an absolute read index; length is the number of input bytes available
2700 int i = source.position();
2701 int length = source.limit() - i;
2703 /* conversion loop */
2705 // entry=stateTable[state][(uint8_t)source[i++]];
2706 entry = stateTable[state][source.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK];
2708 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
2709 state = MBCS_ENTRY_TRANSITION_STATE(entry);
2710 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
// a transition with no byte left means the input ended in the middle of a character
2712 if (i == source.limit()) {
2713 return 0xffff; /* truncated character */
2717 * An if-else-if chain provides more reliable performance for the most common cases compared to a
2720 action = MBCS_ENTRY_FINAL_ACTION(entry);
2721 if (action == MBCS_STATE_VALID_16) {
2722 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
2723 c = unicodeCodeUnits[offset];
2726 } else if (isToUUseFallback()) {
2727 c = getFallback(sharedData.mbcs, offset);
2729 /* else done with 0xfffe */
2730 } else if (action == MBCS_STATE_VALID_DIRECT_16) {
2731 // /* output BMP code point */
2732 c = MBCS_ENTRY_FINAL_VALUE_16(entry);
2733 } else if (action == MBCS_STATE_VALID_16_PAIR) {
2734 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
// the first unit selects how the entry is interpreted; offset then points at the second unit
2735 c = unicodeCodeUnits[offset++];
2737 /* output BMP code point below 0xd800 */
2738 } else if (isToUUseFallback() ? c <= 0xdfff : c <= 0xdbff) {
2739 /* output roundtrip or fallback supplementary code point */
// combine the low 10 bits of the first unit with the trail surrogate into one code point
2740 c = (((c & 0x3ff) << 10) + unicodeCodeUnits[offset] + (0x10000 - 0xdc00));
2741 } else if (isToUUseFallback() ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
2742 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
2743 c = unicodeCodeUnits[offset];
2744 } else if (c == 0xffff) {
2749 } else if (action == MBCS_STATE_VALID_DIRECT_20) {
2750 /* output supplementary code point */
2751 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry);
2752 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
2753 if (!isToUUseFallback(useFallback)) {
2756 /* output BMP code point */
2757 c = MBCS_ENTRY_FINAL_VALUE_16(entry);
2759 } else if (action == MBCS_STATE_FALLBACK_DIRECT_20) {
2760 if (!isToUUseFallback(useFallback)) {
2763 /* output supplementary code point */
2764 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry);
2766 } else if (action == MBCS_STATE_UNASSIGNED) {
2770 * forbid MBCS_STATE_CHANGE_ONLY for this function, and MBCS_STATE_ILLEGAL and reserved action
2779 if (i != source.limit()) {
2780 /* illegal for this function: not all input consumed */
2785 /* try an extension mapping */
2786 if (sharedData.mbcs.extIndexes != null) {
2787 /* Increase the limit for proper handling. Used in LMBCS. */
// NOTE(review): despite the wording above, source.limit(i + length) is only called when the current
// limit is larger than i + length, i.e. this can only lower the limit - confirm the intent.
2788 if (source.limit() > i + length) {
2789 source.limit(i + length);
2791 return simpleMatchToU(source, useFallback);
// Examines the state-table row of the given state in three passes: first a couple of commonly valid
// byte values, then every byte value 0..0xff, each time looking for a final entry whose action is not
// MBCS_STATE_ILLEGAL; finally it recurses into the target state of every transition entry.
// NOTE(review): the return statements and the assignments to entry are not visible in this excerpt.
// NOTE(review): the recursion has no visible guard against revisiting a state, so it presumably
// relies on the state table containing no transition cycles - confirm.
2797 private boolean hasValidTrailBytes(int[][] stateTable, short state) {
2798 int[] row = stateTable[state];
2800 /* First test for final entries in this state for some commonly valid byte values. */
2802 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) {
2806 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) {
2809 /* Then test for final entries in this state. */
2810 for (b = 0; b <= 0xff; b++) {
2812 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) {
2816 /* Then recurse for transition entries. */
2817 for (b = 0; b <= 0xff; b++) {
2819 if (MBCS_ENTRY_IS_TRANSITION(entry) &&
// the mask keeps the target state in the range 0..0xff before the narrowing cast to short
2820 hasValidTrailBytes(stateTable, (short)(MBCS_ENTRY_TRANSITION_STATE(entry) & UConverterConstants.UNSIGNED_BYTE_MASK))) {
// Decides whether byte value b, seen in the given state, could start a character:
//   - a transition entry (lead byte) counts when its target state has valid trail bytes,
//   - a final entry counts unless its action is MBCS_STATE_ILLEGAL, or it is a pure state change
//     (MBCS_STATE_CHANGE_ONLY) in a DBCS-only converter.
// Used by cnvMBCSToUnicodeWithOffsets to decide where a reported illegal sequence should end.
2827 private boolean isSingleOrLead(int[][] stateTable, int state, boolean isDBCSOnly, int b) {
2828 int[] row = stateTable[state];
2830 if (MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
2831 return hasValidTrailBytes(stateTable, (short)(MBCS_ENTRY_TRANSITION_STATE(entry) & UConverterConstants.UNSIGNED_BYTE_MASK));
2833 short action = (short)(MBCS_ENTRY_FINAL_ACTION(entry) & UConverterConstants.UNSIGNED_BYTE_MASK);
2834 if (action == MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
2835 return false; /* SI/SO are illegal for DBCS-only conversion */
2837 return (action != MBCS_STATE_ILLEGAL);
2845 class CharsetEncoderMBCS extends CharsetEncoderICU {
2846 private boolean allowReplacementChanges = false;
// Creates the encoder for charset cs, passing the enclosing charset's fromUSubstitution bytes to the
// superclass as the replacement. allowReplacementChanges is false while the super constructor runs
// and is switched on only afterwards.
2848 CharsetEncoderMBCS(CharsetICU cs) {
2849 super(cs, fromUSubstitution);
2850 allowReplacementChanges = true; // allow changes in implReplaceWith
// Resets encoder state; the visible part clears the pending first code point of a partial
// from-Unicode extension match by setting it to U_SENTINEL.
2854 protected void implReset() {
2856 preFromUFirstCP = UConverterConstants.U_SENTINEL;
2859 @SuppressWarnings("fallthrough")
2860 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
2861 CoderResult[] cr = { CoderResult.UNDERFLOW };
2862 // if (!source.hasRemaining() && fromUChar32 == 0)
2865 int sourceArrayIndex;
2867 byte[] pArray, bytes;
2868 int pArrayIndex, outputType, c;
2869 int prevSourceIndex, sourceIndex, nextSourceIndex;
2870 int stage2Entry = 0, value = 0, length = 0, prevLength;
2872 // long asciiRoundtrips;
2874 byte[] si_value = new byte[2];
2875 byte[] so_value = new byte[2];
2876 int si_value_length = 0, so_value_length = 0;
2878 boolean gotoUnassigned = false;
2882 if (!flush && preFromUFirstCP >= 0) {
2884 * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change
2885 * with continuous offsets
2887 cr[0] = continueMatchFromU(source, target, offsets, flush, -1);
2889 if (cr[0].isError() || preFromULength < 0) {
2894 /* use optimized function if possible */
2895 outputType = sharedData.mbcs.outputType;
2896 uniMask = sharedData.mbcs.unicodeMask;
2897 if (outputType == MBCS_OUTPUT_1 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
2898 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
2899 cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
2901 cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
2904 } else if (outputType == MBCS_OUTPUT_2) {
2905 cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
2909 table = sharedData.mbcs.fromUnicodeTable;
2910 sourceArrayIndex = source.position();
2912 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
2913 bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes;
2915 bytes = sharedData.mbcs.fromUnicodeBytes;
2918 // asciiRoundtrips = sharedData.mbcs.asciiRoundtrips;
2920 /* get the converter state from UConverter */
2923 if (outputType == MBCS_OUTPUT_2_SISO) {
2924 prevLength = fromUnicodeStatus;
2925 if (prevLength == 0) {
2926 /* set the real value */
2930 /* prevent fromUnicodeStatus from being set to something non-0 */
2934 /* sourceIndex=-1 if the current character began in the previous buffer */
2935 prevSourceIndex = -1;
2936 sourceIndex = c == 0 ? 0 : -1;
2937 nextSourceIndex = 0;
2939 /* Get the SI/SO character for the converter */
2940 si_value_length = getSISOBytes(SISO_Option.SI, options, si_value);
2941 so_value_length = getSISOBytes(SISO_Option.SO, options, so_value);
2943 /* conversion loop */
2945 * This is another piece of ugly code: A goto into the loop if the converter state contains a first
2946 * surrogate from the previous function call. It saves me to check in each loop iteration a check of
2947 * if(c==0) and duplicating the trail-surrogate-handling code in the else branch of that check. I could
2948 * not find any other way to get around this other than using a function call for the conversion and
2949 * callback, which would be even more inefficient.
2951 * Markus Scherer 2000-jul-19
2953 boolean doloop = true;
2954 boolean doread = true;
2955 if (c != 0 && target.hasRemaining()) {
2956 if (UTF16.isLeadSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
2957 // c is a lead surrogate, read another input
2958 SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
2959 prevSourceIndex, prevLength);
2960 doloop = getTrail(source, target, uniMask, x, flush, cr);
2963 sourceArrayIndex = x.sourceArrayIndex;
2964 sourceIndex = x.sourceIndex;
2965 nextSourceIndex = x.nextSourceIndex;
2966 prevSourceIndex = x.prevSourceIndex;
2967 prevLength = x.prevLength;
2969 // c is not a lead surrogate, do not read another input
2975 while (!doread || sourceArrayIndex < source.limit()) {
2977 * This following test is to see if available input would overflow the output. It does not catch
2978 * output of more than one byte that overflows as a result of a multi-byte character or callback
2979 * output from the last source character. Therefore, those situations also test for overflows
2980 * and will then break the loop, too.
2982 if (target.hasRemaining()) {
2984 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched
2985 * surrogate pair for a "supplementary code point".
2989 // doread might be false only on the first looping
2991 c = source.get(sourceArrayIndex++);
2995 * This also tests if the codepage maps single surrogates. If it does, then surrogates
2996 * are not paired but mapped separately. Note that in this case unmatched surrogates are
2999 if (UTF16.isSurrogate((char) c)
3000 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
3001 if (UTF16.isLeadSurrogate((char) c)) {
3003 SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex,
3004 nextSourceIndex, prevSourceIndex, prevLength);
3005 doloop = getTrail(source, target, uniMask, x, flush, cr);
3007 sourceArrayIndex = x.sourceArrayIndex;
3008 sourceIndex = x.sourceIndex;
3009 nextSourceIndex = x.nextSourceIndex;
3010 prevSourceIndex = x.prevSourceIndex;
3019 /* this is an unmatched trail code unit (2nd surrogate) */
3020 /* callback(illegal) */
3021 cr[0] = CoderResult.malformedForLength(1);
3028 /* convert the Unicode code point in c into codepage bytes */
3031 * The basic lookup is a triple-stage compact array (trie) lookup. For details see the
3032 * beginning of this file.
3034 * Single-byte codepages are handled with a different data structure by _MBCSSingle...
3037 * The result consists of a 32-bit value from stage 2 and a pointer to as many bytes as are
3038 * stored per character. The pointer points to the character's bytes in stage 3. Bits 15..0
3039 * of the stage 2 entry contain the stage 3 index for that pointer, while bits 31..16 are
3040 * flags for which of the 16 characters in the block are roundtrip-assigned.
3042 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t respectively as
3043 * uint32_t, in the platform encoding. For 3-byte codepages, the bytes are always stored in
3046 * For EUC encodings that use only either 0x8e or 0x8f as the first byte of their longest
3047 * byte sequences, the first two bytes in this third stage indicate with their 7th bits
3048 * whether these bytes are to be written directly or actually need to be preceded by one of
3049 * the two Single-Shift codes. With this, the third stage stores one byte fewer per
3050 * character than the actual maximum length of EUC byte sequences.
3052 * Other than that, leading zero bytes are removed and the other bytes output. A single zero
3053 * byte may be output if the "assigned" bit in stage 2 was on. The data structure does not
3054 * support zero byte output as a fallback, and also does not allow output of leading zeros.
3056 stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
3058 /* get the bytes and the length for the output */
3059 switch (outputType) {
3060 /* This is handled above with the method cnvMBCSDoubleFromUnicodeWithOffsets() */
3061 /* case MBCS_OUTPUT_2:
3062 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3063 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3069 case MBCS_OUTPUT_2_SISO:
3070 /* 1/2-byte stateful with Shift-In/Shift-Out */
3072 * Save the old state in the converter object right here, then change the local
3073 * prevLength state variable if necessary. Then, if this character turns out to be
3074 * unassigned or a fallback that is not taken, the callback code must not save the new
3075 * state in the converter because the new state is for a character that is not output.
3076 * However, the callback must still restore the state from the converter in case the
3077 * callback function changed it for its output.
3079 fromUnicodeStatus = prevLength; /* save the old state */
3080 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3081 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3082 if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) == false) {
3083 /* no mapping, leave value==0 */
3085 } else if (prevLength <= 1) {
3088 /* change from double-byte mode to single-byte */
3089 if (si_value_length == 1) {
3090 value|=si_value[0]<<8;
3092 } else if (si_value_length == 2) {
3093 value|=si_value[1]<<8;
3094 value|=si_value[0]<<16;
3100 if (prevLength == 2) {
3103 /* change from single-byte mode to double-byte */
3104 if (so_value_length == 1) {
3105 value|=so_value[0]<<16;
3107 } else if (so_value_length == 2) {
3108 value|=so_value[1]<<16;
3109 value|=so_value[0]<<24;
3116 case MBCS_OUTPUT_DBCS_ONLY:
3117 /* table with single-byte results, but only DBCS mappings used */
3118 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3119 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3120 /* no mapping or SBCS result, not taken for DBCS-only */
3121 value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
3129 pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
3130 value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
3131 | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
3132 | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
3133 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3135 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
3142 value = MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
3143 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3145 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
3147 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffffff) {
3153 case MBCS_OUTPUT_3_EUC:
3154 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
3155 /* EUC 16-bit fixed-length representation */
3156 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3158 } else if ((value & 0x8000) == 0) {
3161 } else if ((value & 0x80) == 0) {
3168 case MBCS_OUTPUT_4_EUC:
3170 pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
3171 value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
3172 | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
3173 | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
3174 /* EUC 16-bit fixed-length representation applied to the first two bytes */
3175 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
3177 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
3179 } else if ((value & 0x800000) == 0) {
3180 value |= 0x8e800000;
3182 } else if ((value & 0x8000) == 0) {
3183 value |= 0x8f008000;
3190 /* must not occur */
3192 * To avoid compiler warnings that value & length may be used without having been
3193 * initialized, we set them here. In reality, this is unreachable code. Not having a
3194 * default branch also causes warnings with some compilers.
3196 value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
3201 /* is this code point assigned, or do we use fallbacks? */
3202 if (gotoUnassigned || (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0)))) {
3203 gotoUnassigned = false;
3205 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way
3206 * with this data structure for fallback output to be a zero byte.
3210 SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
3211 prevSourceIndex, prevLength);
3212 doloop = unassigned(source, target, offsets, x, flush, cr);
3214 sourceArrayIndex = x.sourceArrayIndex;
3215 sourceIndex = x.sourceIndex;
3216 nextSourceIndex = x.nextSourceIndex;
3217 prevSourceIndex = x.prevSourceIndex;
3218 prevLength = x.prevLength;
3225 /* write the output character bytes from value and length */
3226 /* from the first if in the loop we know that targetCapacity>0 */
3227 if (length <= target.remaining()) {
3229 /* each branch falls through to the next one */
3231 target.put((byte) (value >>> 24));
3232 if (offsets != null) {
3233 offsets.put(sourceIndex);
3236 target.put((byte) (value >>> 16));
3237 if (offsets != null) {
3238 offsets.put(sourceIndex);
3241 target.put((byte) (value >>> 8));
3242 if (offsets != null) {
3243 offsets.put(sourceIndex);
3246 target.put((byte) value);
3247 if (offsets != null) {
3248 offsets.put(sourceIndex);
3251 /* will never occur */
3255 int errorBufferArrayIndex;
3258 * We actually do this backwards here: In order to save an intermediate variable, we
3259 * output first to the overflow buffer what does not fit into the regular target.
3261 /* we know that 1<=targetCapacity<length<=4 */
3262 length -= target.remaining();
3264 errorBufferArrayIndex = 0;
3266 /* each branch falls through to the next one */
3268 errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 16);
3270 errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 8);
3272 errorBuffer[errorBufferArrayIndex] = (byte) value;
3274 /* will never occur */
3277 errorBufferLength = (byte) length;
3279 /* now output what fits into the regular target */
3280 value >>>= 8 * length; /* length was reduced by targetCapacity */
3281 switch (target.remaining()) {
3282 /* each branch falls through to the next one */
3284 target.put((byte) (value >>> 16));
3285 if (offsets != null) {
3286 offsets.put(sourceIndex);
3289 target.put((byte) (value >>> 8));
3290 if (offsets != null) {
3291 offsets.put(sourceIndex);
3294 target.put((byte) value);
3295 if (offsets != null) {
3296 offsets.put(sourceIndex);
3299 /* will never occur */
3303 /* target overflow */
3304 cr[0] = CoderResult.OVERFLOW;
3309 /* normal end of conversion: prepare for a new character */
3311 if (offsets != null) {
3312 prevSourceIndex = sourceIndex;
3313 sourceIndex = nextSourceIndex;
3317 /* target is full */
3318 cr[0] = CoderResult.OVERFLOW;
3325 * the end of the input stream and detection of truncated input are handled by the framework, but for
3326 * EBCDIC_STATEFUL conversion we need to emit an SI at the very end
3328 * conditions: successful EBCDIC_STATEFUL in DBCS mode end of input and no truncated input
3330 if (outputType == MBCS_OUTPUT_2_SISO && prevLength == 2 && flush && sourceArrayIndex >= source.limit()
3333 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
3334 if (target.hasRemaining()) {
3335 target.put(si_value[0]);
3336 if (si_value_length == 2) {
3337 if (target.remaining() > 0) {
3338 target.put(si_value[1]);
3340 errorBuffer[0] = si_value[1];
3341 errorBufferLength = 1;
3342 cr[0] = CoderResult.OVERFLOW;
3345 if (offsets != null) {
3346 /* set the last source character's index (sourceIndex points at sourceLimit now) */
3347 offsets.put(prevSourceIndex);
3350 /* target is full */
3351 errorBuffer[0] = si_value[0];
3352 if (si_value_length == 2) {
3353 errorBuffer[1] = si_value[1];
3355 errorBufferLength = si_value_length;
3356 cr[0] = CoderResult.OVERFLOW;
3358 prevLength = 1; /* we switched into SBCS */
3361 /* set the converter state back into UConverter */
3363 fromUnicodeStatus = prevLength;
3365 source.position(sourceArrayIndex);
3366 } catch (BufferOverflowException ex) {
3367 cr[0] = CoderResult.OVERFLOW;
3374 * This is another simple conversion function for internal use by other conversion implementations. It does not
3375 * use the converter state nor call callbacks. It does not handle the EBCDIC swaplfnl option (set in
3376 * UConverter). It handles conversion extensions but not GB 18030.
3378 * It converts one single Unicode code point into codepage bytes, encoded as one 32-bit value. The function
3379 * returns: 1..4 = the number of bytes in *pValue; 0 = unassigned (*pValue undefined);
3380 * -1 = illegal (currently not used, *pValue undefined).
3382 * *pValue will contain the resulting bytes with the last byte in bits 7..0, the second to last byte in bits
3383 * 15..8, etc. Currently, the function assumes but does not check that 0<=c<=0x10ffff.
// Converts the single code point c using the from-Unicode tables in sharedData.mbcs and writes the
// resulting bytes, packed into one int, to pValue[0].
//   c             - the code point to convert
//   pValue        - one-element output array; pValue[0] receives the packed result bytes
//   isUseFallback - selects the lower acceptance threshold in the single-byte branch and is passed
//                   on to CharsetEncoderICU.isFromUUseFallback and simpleMatchFromU
3385 int fromUChar32(int c, int[] pValue, boolean isUseFallback) {
3387 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
3388 // const uint8_t *p;
3397 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
// The table lookup is attempted only for BMP code points, or when the codepage's unicodeMask
// has the HAS_SUPPLEMENTARY bit set.
3398 if (c <= 0xffff || ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0)) {
3399 table = sharedData.mbcs.fromUnicodeTable;
3401 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
3402 if (sharedData.mbcs.outputType == MBCS_OUTPUT_1) {
3403 value = MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c);
3404 /* is this code point assigned, or do we use fallbacks? */
// Acceptance threshold on the 16-bit result: 0x800 when fallbacks are allowed, 0xc00 otherwise.
// Only the low 8 bits of the result are the output byte.
3405 if (isUseFallback ? value >= 0x800 : value >= 0xc00) {
3406 pValue[0] = value & 0xff;
3409 } else /* outputType!=MBCS_OUTPUT_1 */{
3410 stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
3412 /* get the bytes and the length for the output */
3413 switch (sharedData.mbcs.outputType) {
// 16-bit stage 3 value; values up to 0xff are tested separately from larger ones.
3415 value = MBCS_VALUE_2_FROM_STAGE_2(sharedData.mbcs.fromUnicodeBytes, stage2Entry, c);
3416 if (value <= 0xff) {
3423 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
3424 // case MBCS_OUTPUT_DBCS_ONLY:
3425 // /* table with single-byte results, but only DBCS mappings used */
3426 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3427 // if(value<=0xff) {
3428 // /* no mapping or SBCS result, not taken for DBCS-only */
3429 // value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
// Three stage 3 bytes starting at index p are assembled big-endian into value
// (each byte masked to its unsigned value first).
3436 byte[] bytes = sharedData.mbcs.fromUnicodeBytes;
3437 p = CharsetMBCS.MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
3438 value = ((bytes[p] & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) |
3439 ((bytes[p+1] & UConverterConstants.UNSIGNED_BYTE_MASK)<<8) |
3440 (bytes[p+2] & UConverterConstants.UNSIGNED_BYTE_MASK);
3441 if (value <= 0xff) {
3443 } else if (value <= 0xffff) {
3449 // case MBCS_OUTPUT_4:
3450 // value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3451 // if(value<=0xff) {
3453 // } else if(value<=0xffff) {
3455 // } else if(value<=0xffffff) {
3461 // case MBCS_OUTPUT_3_EUC:
3462 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3463 // /* EUC 16-bit fixed-length representation */
3464 // if(value<=0xff) {
3466 // } else if((value&0x8000)==0) {
3469 // } else if((value&0x80)==0) {
3476 // case MBCS_OUTPUT_4_EUC:
3477 // p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
3478 // value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
3479 // /* EUC 16-bit fixed-length representation applied to the first two bytes */
3480 // if(value<=0xff) {
3482 // } else if(value<=0xffff) {
3484 // } else if((value&0x800000)==0) {
3485 // value|=0x8e800000;
3487 // } else if((value&0x8000)==0) {
3488 // value|=0x8f008000;
3496 /* must not occur */
3500 /* is this code point assigned, or do we use fallbacks? */
// Accepted when the roundtrip flag is set for c in stage2Entry, or when a fallback is permitted
// for c and the value is non-zero.
3501 if (MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
3502 || (CharsetEncoderICU.isFromUUseFallback(isUseFallback, c) && value != 0)) {
3504 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way with
3505 * this data structure for fallback output to be a zero byte.
// Extension lookup: only attempted when the converter has extension indexes.
3514 if (sharedData.mbcs.extIndexes != null) {
3515 length = simpleMatchFromU(c, pValue, isUseFallback);
// simpleMatchFromU can return a negative length; only the magnitude is reported to the caller.
3516 return length >= 0 ? length : -length; /* return abs(length); */
3524 * Continue a partial match with new input. Requires cnv->preFromUFirstCP>=0. Never called for simple,
3525 * single-character conversion.
// Resumes an extension match using the state saved in preFromUFirstCP / preFromUArray /
// preFromULength, together with whatever new input is available in source.
// Starts from CoderResult.UNDERFLOW; the only other result assigned in the visible code is
// unmappableForLength(1) in the no-match branch.
3527 private CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush,
3529 CoderResult cr = CoderResult.UNDERFLOW;
3530 int[] value = new int[1];
// The buffered units are passed as the "pre" input and source as the continuation input.
3533 match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, value, useFallback, flush);
3535 match -= 2; /* remove 2 for the initial code point */
// match now counts matched units after the first code point; compare against the buffered amount.
3537 if (match >= preFromULength) {
3538 /* advance src pointer for the consumed input */
// Only the units beyond the buffered ones were taken from source.
3539 source.position(source.position() + match - preFromULength);
3542 /* the match did not use all of preFromU[] - keep the rest for replay */
3543 int length = preFromULength - match;
// Shift the unmatched remainder down to preFromUBegin.
3544 System.arraycopy(preFromUArray, preFromUBegin + match, preFromUArray, preFromUBegin, length);
// A negative preFromULength is the replay marker (same convention as "mark preFromU for replay" below).
3545 preFromULength = (byte) -length;
3548 /* finish the partial match */
3549 preFromUFirstCP = UConverterConstants.U_SENTINEL;
// NOTE(review): the CoderResult returned by writeFromU is discarded here, whereas
// initialMatchFromU stores it in cr[0]. If writeFromU reports OVERFLOW, this method would
// still report the initial UNDERFLOW - confirm whether that is intended.
3552 writeFromU(value[0], target, offsets, srcIndex);
3553 } else if (match < 0) {
3554 /* save state for partial match */
3558 /* just _append_ the newly consumed input to preFromU[] */
3559 sArrayIndex = source.position();
3560 match = -match - 2; /* remove 2 for the initial code point */
// Copy starts at index preFromULength, i.e. after the units that are already buffered.
3561 for (j = preFromULength; j < match; ++j) {
3562 preFromUArray[j] = source.get(sArrayIndex++);
3564 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
3565 preFromULength = (byte) match;
3566 } else { /* match==0 or 1 */
3570 * We need to split the previous input into two parts:
3572 * 1. The first code point is unmappable - that's how we got into trying the extension data in the first
3573 * place. We need to move it from the preFromU buffer to the error buffer, set an error code, and
3574 * prepare the rest of the previous input for 2.
3576 * 2. The rest of the previous input must be converted once we come back from the callback for the first
3577 * code point. At that time, we have to try again from scratch to convert these input characters. The
3578 * replay will be handled by the ucnv.c conversion code.
3582 /* matched, no mapping but request for <subchar1> */
3586 /* move the first code point to the error field */
3587 fromUChar32 = preFromUFirstCP;
3588 preFromUFirstCP = UConverterConstants.U_SENTINEL;
3590 /* mark preFromU for replay */
3591 preFromULength = (byte) -preFromULength;
3593 /* set the error code for unassigned */
3594 // TODO: figure out what the unmappable length really should be
3595 cr = CoderResult.unmappableForLength(1);
3602 * pointer to extension data; if NULL, returns 0
3604 * the first code point before all the other UChars
3606 * UChars that must match; !initialMatch: partial match with them
3608 * length of pre, >=0
3610 * UChars that can be used to complete a match
3612 * length of src, >=0
3613 * @param pMatchValue
3614 * [out] output result value for the match from the data structure
3615 * @param useFallback
3616 * "use fallback" flag, usually from cnv->useFallback
3618 * TRUE if the end of the input stream is reached
3619 * @return >1: matched, return value=total match length (number of input units matched); 1: matched, no mapping
3620 * but request for <subchar1> (only for the first code point); 0: no match; <0: partial match, return
3621 * value=negative total match length (partial matches are never returned for flush==TRUE, and partial
3622 * matches are never returned as being longer than UCNV_EXT_MAX_UCHARS). The matchLength is 2 if only
3623 * firstCP matched, and >2 if firstCP and further code units matched.
3625 // static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength,
3626 // const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush)
// Matches firstCP, then the units preArray[preArrayBegin ..] (preLength of them), then units read
// from source, against the from-Unicode extension tables reached through sharedData.mbcs.extIndexes.
// On a usable match the result value (passed through FROM_U_MASK_ROUNDTRIP) is stored in pMatchValue[0].
// source may be null (it is null-checked before use); reads from source are absolute, so this
// method does not move source's position.
3627 private int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source,
3628 int[] pMatchValue, boolean isUseFallback, boolean flush) {
3629 ByteBuffer cx = sharedData.mbcs.extIndexes;
3631 CharBuffer stage12, stage3;
3634 CharBuffer fromUTableUChars, fromUSectionUChars;
3635 IntBuffer fromUTableValues, fromUSectionValues;
3637 int value, matchValue;
3638 int i, j, index, length, matchLength;
3642 return 0; /* no extension data, no match */
3645 /* trie lookup of firstCP */
3646 index = firstCP >>> 10; /* stage 1 index */
// The stage 1 length is read from the index array as an int, at slot EXT_FROM_U_STAGE_1_LENGTH.
3647 if (index >= cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH)) {
3648 return 0; /* the first code point is outside the trie */
3651 stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class);
3652 stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class);
3653 index = FROM_U(stage12, stage3, index, firstCP);
3655 stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class);
// Absolute read relative to the buffer's current position.
3656 value = stage3b.get(stage3b.position() + index);
// NOTE(review): this first test uses TO_U_IS_PARTIAL, while the test inside the matching loop
// below uses FROM_U_IS_PARTIAL on the same kind of value - confirm the two helpers agree here.
3661 if (TO_U_IS_PARTIAL(value)) {
3662 /* partial match, enter the loop below */
3663 index = FROM_U_GET_PARTIAL_INDEX(value);
3666 fromUTableUChars = (CharBuffer) ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class);
3667 fromUTableValues = (IntBuffer) ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class);
// i counts units consumed from preArray, j counts units consumed from source;
// matchLength stays 0 until some match has been remembered.
3670 i = j = matchLength = 0;
3672 /* we must not remember fallback matches when not using fallbacks */
3674 /* match input units until there is a full match or the input is consumed */
3676 /* go to the next section */
// Each table buffer's position is saved, moved to index so slice() starts there, then restored,
// leaving the table buffers themselves unchanged.
3677 int oldpos = fromUTableUChars.position();
3678 fromUSectionUChars = ((CharBuffer) fromUTableUChars.position(index)).slice();
3679 fromUTableUChars.position(oldpos);
3680 oldpos = fromUTableValues.position();
3681 fromUSectionValues = ((IntBuffer) fromUTableValues.position(index)).slice();
3682 fromUTableValues.position(oldpos);
3684 /* read first pair of the section */
// Relative get(): reads element 0 of each slice and advances that slice's position by one.
3685 length = fromUSectionUChars.get();
3686 value = fromUSectionValues.get();
3687 if (value != 0 && (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP))) {
3688 /* remember longest match so far */
// 2 stands for the first code point; i + j is the number of further units matched so far.
3690 matchLength = 2 + i + j;
3693 /* match pre[] then src[] */
3694 if (i < preLength) {
3695 c = preArray[preArrayBegin + i++];
3696 } else if (source != null && j < source.remaining()) {
// Absolute get: source's own position is left untouched.
3697 c = source.get(source.position() + j++);
3699 /* all input consumed, partial match */
// When flush is false, the condition itself overwrites length with i + j; that value is what
// the negative return below reports.
3700 if (flush || (length = (i + j)) > MAX_UCHARS) {
3702 * end of the entire input stream, stop with the longest match so far or: partial match must
3703 * not be longer than UCNV_EXT_MAX_UCHARS because it must fit into state buffers
3707 /* continue with more input next time */
3708 return -(2 + length);
3712 /* search for the current UChar */
3713 index = findFromU(fromUSectionUChars, length, c);
3715 /* no match here, stop with the longest match so far */
// Absolute read relative to the slice's current position (already advanced by the get() above).
3718 value = fromUSectionValues.get(fromUSectionValues.position() + index);
3719 if (FROM_U_IS_PARTIAL(value)) {
3720 /* partial match, continue */
3721 index = FROM_U_GET_PARTIAL_INDEX(value);
3723 if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
3724 /* full match, stop with result */
3726 matchLength = 2 + i + j;
3728 /* full match on fallback not taken, stop with the longest match so far */
3735 if (matchLength == 0) {
3736 /* no match at all */
3739 } else /* result from firstCP trie lookup */{
3740 if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
3741 /* full match, stop with result */
3745 /* fallback not taken */
3750 if ((matchValue & FROM_U_RESERVED_MASK) != 0) {
3751 /* do not interpret values with reserved bits used, for forward compatibility */
3756 if (matchValue == FROM_U_SUBCHAR1) {
3757 return 1; /* assert matchLength==2 */
3760 pMatchValue[0] = FROM_U_MASK_ROUNDTRIP(matchValue);
// Extension lookup for a single code point cp with no further input.
// For results whose length is at most EXT_FROM_U_MAX_DIRECT_LENGTH, pValue[0] receives the
// result data and the return value is the byte length, negated when the mapping is not a roundtrip.
3764 private int simpleMatchFromU(int cp, int[] pValue, boolean isUseFallback) {
3765 int[] value = new int[1];
3766 int match; // signed
// No pre-buffer (null, 0, 0), no source, and flush == true.
3769 match = matchFromU(cp, null, 0, 0, null, value, isUseFallback, true);
3771 /* write result for simple, single-character conversion */
3773 boolean isRoundtrip;
// Decode the three parts of the match value. The roundtrip flag and length are read before
// value[0] is overwritten with the bare data.
3775 isRoundtrip = FROM_U_IS_ROUNDTRIP(value[0]);
3776 length = FROM_U_GET_LENGTH(value[0]);
3777 value[0] = FROM_U_GET_DATA(value[0]);
3779 if (length <= EXT_FROM_U_MAX_DIRECT_LENGTH) {
3780 pValue[0] = value[0];
// The sign encodes roundtrip (positive) versus non-roundtrip (negative).
3781 return isRoundtrip ? length : -length;
3782 // #if 0 /* not currently used */
3783 // } else if(length==4) {
3784 // /* de-serialize a 4-byte result */
3785 // const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
3787 // ((uint32_t)result[0]<<24)|
3788 // ((uint32_t)result[1]<<16)|
3789 // ((uint32_t)result[2]<<8)|
3791 // return isRoundtrip ? 4 : -4;
3797 * return no match because - match>1 && resultLength>4: result too long for simple conversion - match==1: no
3798 * match found, <subchar1> preferred - match==0: no match found in the first place - match<0: partial
3799 * match, not supported for simple conversion (and flush==TRUE)
// Writes the bytes for one extension match value to target and returns the CoderResult
// produced by fromUWriteBytes.
//   value    - packed match value; length and data are extracted with FROM_U_GET_LENGTH / FROM_U_GET_DATA
//   target   - output buffer, passed through to fromUWriteBytes
//   offsets  - optional offsets buffer, passed through to fromUWriteBytes
//   srcIndex - source index, passed through to fromUWriteBytes
3804 @SuppressWarnings("fallthrough")
3805 private CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) {
3806 ByteBuffer cx = sharedData.mbcs.extIndexes;
// Scratch buffer with one extra leading slot (index 0) that can hold a shift byte.
3808 byte bufferArray[] = new byte[1 + MAX_BYTES];
3809 int bufferArrayIndex = 0;
3811 int resultArrayIndex;
3812 int length, prevLength;
3814 length = FROM_U_GET_LENGTH(value);
3815 value = FROM_U_GET_DATA(value);
3817 /* output the result */
// Short results are stored directly in value; longer ones are read from the extension bytes array.
3818 if (length <= FROM_U_MAX_DIRECT_LENGTH) {
3820 * Generate a byte array and then write it below. This is not the fastest possible way, but it should be
3821 * ok for extension mappings, and it is much simpler. Offset and overflow handling are only done once
3824 int p = bufferArrayIndex + 1; /* reserve buffer[0] for shiftByte below */
// Bytes are unpacked most-significant first.
3827 bufferArray[p++] = (byte) (value >>> 16);
3829 bufferArray[p++] = (byte) (value >>> 8);
3831 bufferArray[p++] = (byte) value;
3833 break; /* will never occur */
3835 resultArray = bufferArray;
3836 resultArrayIndex = bufferArrayIndex + 1;
// Long result: copy `length` bytes out of the EXT_FROM_U_BYTES_INDEX array into a fresh array.
3838 byte[] slice = new byte[length];
3840 ByteBuffer bb = ((ByteBuffer) ARRAY(cx, EXT_FROM_U_BYTES_INDEX, byte.class));
3842 bb.get(slice, 0, slice.length);
3844 resultArray = slice;
3845 resultArrayIndex = 0;
3848 /* with correct data we have length>0 */
// fromUnicodeStatus is used as the previous output length; a non-zero value enters the
// SI/SO handling below.
3850 if ((prevLength = fromUnicodeStatus) != 0) {
3851 /* handle SI/SO stateful output */
3854 if (prevLength > 1 && length == 1) {
3855 /* change from double-byte mode to single-byte */
3856 shiftByte = (byte) UConverterConstants.SI;
3857 fromUnicodeStatus = 1;
3858 } else if (prevLength == 1 && length > 1) {
3859 /* change from single-byte mode to double-byte */
3860 shiftByte = (byte) UConverterConstants.SO;
3861 fromUnicodeStatus = 2;
3866 if (shiftByte != 0) {
3867 /* prepend the shift byte to the result bytes */
3868 bufferArray[0] = shiftByte;
// The copy is skipped only when the result bytes already sit in bufferArray right after slot 0.
3869 if (resultArray != bufferArray || resultArrayIndex != bufferArrayIndex + 1) {
3870 System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex + 1, length);
3872 resultArray = bufferArray;
3873 resultArrayIndex = bufferArrayIndex;
3878 return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex);
3882 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 else return 0 after output has been written
// Tries the secondary mappings for a code point: first the extension tables via initialMatchFromU,
// then, when the MBCS_OPTION_GB18030 option bit is set, the gb18030Ranges table.
// Results are reported through cr[0]; the last visible assignment is unmappableForLength(length).
3885 private int fromU(int cp_, CharBuffer source, ByteBuffer target, IntBuffer offsets, int sourceIndex,
3886 int length, boolean flush, CoderResult[] cr) {
// Mask with UNSIGNED_INT_MASK into a long so later range comparisons are done on a long value.
3888 long cp = cp_ & UConverterConstants.UNSIGNED_INT_MASK;
3890 useSubChar1 = false;
3892 if (sharedData.mbcs.extIndexes != null
3893 && initialMatchFromU((int) cp, source, target, offsets, sourceIndex, flush, cr)) {
3894 return 0; /* an extension mapping handled the input */
3898 if ((options & MBCS_OPTION_GB18030) != 0) {
// Linear scan over the ranges; the code uses range[0] and range[1] as inclusive code point
// bounds and range[2] as the code value for the start of the range.
3902 for (i = 0; i < gb18030Ranges.length; ++i) {
3903 range = gb18030Ranges[i];
3904 if (range[0] <= cp && cp <= range[1]) {
3905 /* found the Unicode code point, output the four-byte sequence for it */
3907 byte bytes[] = new byte[4];
3909 /* get the linear value of the first GB 18030 code in this range */
3910 linear = range[2] - LINEAR_18030_BASE;
3912 /* add the offset from the beginning of the range */
3913 linear += (cp - range[0]);
// The four bytes are filled from the last to the first: offsets 0x30 (mod 10), 0x81 (mod 126),
// 0x30 (mod 10), and 0x81 plus whatever remains of linear.
3915 bytes[3] = (byte) (0x30 + linear % 10);
3917 bytes[2] = (byte) (0x81 + linear % 126);
3919 bytes[1] = (byte) (0x30 + linear % 10);
3921 bytes[0] = (byte) (0x81 + linear);
3923 /* output this sequence */
3924 cr[0] = fromUWriteBytes(this, bytes, 0, 4, target, offsets, sourceIndex);
// The unmappable result uses the caller-supplied length.
3931 cr[0] = CoderResult.unmappableForLength(length);
3936 * target<targetLimit; set error code for overflow
// First attempt at an extension match for code point cp, with source as the continuation input.
// On a usable match the consumed input is skipped and the result is written via writeFromU, whose
// CoderResult is stored in cr[0]. On a partial match the state is saved in preFromUFirstCP /
// preFromUArray / preFromULength.
3938 private boolean initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets,
3939 int srcIndex, boolean flush, CoderResult[] cr) {
3940 int[] value = new int[1];
// No pre-buffer on the initial attempt (null, 0, 0).
3944 match = matchFromU(cp, null, 0, 0, source, value, useFallback, flush);
3946 /* reject a match if the result is a single byte for DBCS-only */
3948 && !(FROM_U_GET_LENGTH(value[0]) == 1 && sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY)) {
3949 /* advance src pointer for the consumed input */
3950 source.position(source.position() + match - 2); /* remove 2 for the initial code point */
3952 /* write result to target */
3953 cr[0] = writeFromU(value[0], target, offsets, srcIndex);
3955 } else if (match < 0) {
3956 /* save state for partial match */
3960 /* copy the first code point */
3961 preFromUFirstCP = cp;
3963 /* now copy the newly consumed input */
3964 sArrayIndex = source.position();
3965 match = -match - 2; /* remove 2 for the initial code point */
// Buffer the consumed units starting at preFromUArray[0].
3966 for (j = 0; j < match; ++j) {
3967 preFromUArray[j] = source.get(sArrayIndex++);
3969 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
3970 preFromULength = (byte) match;
3972 } else if (match == 1) {
3973 /* matched, no mapping but request for <subchar1> */
3976 } else /* match==0 no match */{
// Thin entry point that forwards all four arguments unchanged to encodeLoop and returns its result.
3981 CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
3982 // Just call encodeLoop to remove duplicate code.
3983 return encodeLoop(source, target, offsets, flush);
3987 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages that map only to and from the
3988 * BMP. In addition to single-byte/state optimizations, the offset calculations become much easier.
// Single-byte, BMP-only encode path. Each assigned char produces exactly one output byte, so a
// single counter (targetCapacity, the smaller of the remaining input and output) drives the main
// loop. Characters without a direct mapping are handed to fromU (extension mappings).
// source's position is written back from sourceArrayIndex at the end.
3990 private CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets,
3993 CoderResult[] cr = { CoderResult.UNDERFLOW };
3995 int sourceArrayIndex, lastSource;
3996 int targetCapacity, length;
4001 char value, minValue;
4003 /* set up the local pointers */
4004 sourceArrayIndex = source.position();
4005 targetCapacity = target.remaining();
4006 table = sharedData.mbcs.fromUnicodeTable;
// Choose the result table: the LF/NL-swapped variant when OPTION_SWAP_LFNL is set in options,
// otherwise the regular fromUnicodeBytes.
4008 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
4009 results = sharedData.mbcs.swapLFNLFromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes
4010 // be a ByteBuffer so results can be a 16-bit view
4013 results = sharedData.mbcs.fromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes be a
4014 // ByteBuffer so results can be a 16-bit view of it?
4018 /* use all roundtrip and fallback results */
4021 /* use only roundtrips and fallbacks from private-use characters */
4025 /* get the converter state from UConverter */
4028 /* sourceIndex=-1 if the current character began in the previous buffer */
4029 sourceIndex = c == 0 ? 0 : -1;
// lastSource marks the source index up to which offsets have been accounted for.
4030 lastSource = sourceArrayIndex;
4033 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
4034 * sourceLength and targetCapacity
4036 length = source.limit() - sourceArrayIndex;
4037 if (length < targetCapacity) {
4038 targetCapacity = length;
4041 boolean doloop = true;
// A non-zero c means a code unit is pending; it is passed to getTrailSingleBMP before the main loop.
4042 if (c != 0 && targetCapacity > 0) {
4043 SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
4044 doloop = getTrailSingleBMP(source, x, cr);
// Copy the helper's updated index back into the local.
4046 sourceArrayIndex = x.sourceArrayIndex;
4050 while (targetCapacity > 0) {
4052 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate pair
4053 * for a "supplementary code point".
// Absolute read; the buffer position itself is only updated explicitly via source.position(...).
4055 c = source.get(sourceArrayIndex++);
4057 * Do not immediately check for single surrogates: Assume that they are unassigned and check for
4058 * them in that case. This speeds up the conversion of assigned characters.
4060 /* convert the Unicode code point in c into codepage bytes */
4061 value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4063 /* is this code point assigned, or do we use fallbacks? */
4064 if (value >= minValue) {
4065 /* assigned, write the output character bytes from value and length */
4067 /* this is easy because we know that there is enough space */
// Only the low byte of the 16-bit result is output.
4068 target.put((byte) value);
4071 /* normal end of conversion: prepare for a new character */
4074 } else if (!UTF16.isSurrogate((char) c)) {
4075 /* normal, unassigned BMP character */
4076 } else if (UTF16.isLeadSurrogate((char) c)) {
4078 SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
4079 doloop = getTrailSingleBMP(source, x, cr);
4081 sourceArrayIndex = x.sourceArrayIndex;
4085 /* this is an unmatched trail code unit (2nd surrogate) */
4086 /* callback(illegal) */
4087 cr[0] = CoderResult.malformedForLength(1);
4091 /* c does not have a mapping */
4093 /* get the number of code units for c to correctly advance sourceIndex */
4094 length = UTF16.getCharCount(c);
4096 /* set offsets since the start or the last extension */
4097 if (offsets != null) {
4098 int count = sourceArrayIndex - lastSource;
4100 /* do not set the offset for this character */
4104 offsets.put(sourceIndex++);
4107 /* offsets and sourceIndex are now set for the current character */
4110 /* try an extension mapping */
// fromU works on the buffer's own position, so the position is synchronised with the local
// index before the call and the local index is re-read afterwards.
4111 lastSource = sourceArrayIndex;
4112 source.position(sourceArrayIndex);
4113 c = fromU(c, source, target, offsets, sourceIndex, length, flush, cr);
4114 sourceArrayIndex = source.position();
// Advance by the units of c plus whatever additional input fromU consumed.
4115 sourceIndex += length + (sourceArrayIndex - lastSource);
4116 lastSource = sourceArrayIndex;
4118 if (cr[0].isError()) {
4119 /* not mappable or buffer overflow */
4122 /* a mapping was written to the target, continue */
4124 /* recalculate the targetCapacity after an extension mapping */
4125 targetCapacity = target.remaining();
4126 length = source.limit() - sourceArrayIndex;
4127 if (length < targetCapacity) {
4128 targetCapacity = length;
// Input remains but the target has no room left: report OVERFLOW.
4134 if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
4135 /* target is full */
4136 cr[0] = CoderResult.OVERFLOW;
4139 /* set offsets since the start or the last callback */
4140 if (offsets != null) {
4141 int count = sourceArrayIndex - lastSource;
4143 offsets.put(sourceIndex++);
4148 /* set the converter state back into UConverter */
4151 /* write back the updated pointers */
4152 source.position(sourceArrayIndex);
4157 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
/**
 * Encodes UTF-16 code units from {@code source} into single bytes in {@code target}.
 *
 * Visible behavior:
 * - the result bytes come from swapLFNLFromUnicodeBytes (selected when OPTION_SWAP_LFNL is set in
 *   options) or from fromUnicodeBytes;
 * - input is read with absolute gets at a local index; that index is written back to
 *   source.position() at the end;
 * - a lead surrogate is handed to getTrailDouble(); an unmatched trail surrogate sets
 *   CoderResult.malformedForLength(1);
 * - a lookup result >= minValue is written as one byte (plus one offset when offsets is non-null);
 *   anything else is handed to unassignedDouble() to try an extension mapping;
 * - a full target sets CoderResult.OVERFLOW.
 *
 * NOTE(review): the declarations of c/table/uniMask, the assignment of minValue and the return
 * statement are not visible in this excerpt; confirm them against the full method.
 *
 * @param source  UTF-16 input, consumed starting at its current position
 * @param target  receives the encoded bytes
 * @param offsets if non-null, receives the source index for each byte written here
 * @param flush   forwarded unchanged to getTrailDouble()/unassignedDouble()
 * @return a CoderResult; presumably the value left in cr[0] -- TODO confirm
 */
4158 private CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target,
4159 IntBuffer offsets, boolean flush) {
// one-element array so that the helper methods can replace the result in place
4161 CoderResult[] cr = { CoderResult.UNDERFLOW };
4163 int sourceArrayIndex;
4166 byte[] results; // agljport:comment results is used to get 16-bit values out of byte[] array
4169 int sourceIndex, nextSourceIndex;
4171 char value, minValue;
4173 /* set up the local pointers */
4175 sourceArrayIndex = source.position();
4177 table = sharedData.mbcs.fromUnicodeTable;
// pick the result bytes: the LF/NL-swapped table is used when the option bit is set
4179 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
4180 results = sharedData.mbcs.swapLFNLFromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes
4181 // be a ByteBuffer so results can be a 16-bit view
4184 results = sharedData.mbcs.fromUnicodeBytes; // agljport:comment should fromUnicodeBytes be a
4185 // ByteBuffer so results can be a 16-bit view of it?
4189 /* use all roundtrip and fallback results */
4192 /* use only roundtrips and fallbacks from private-use characters */
4195 // agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation
4196 uniMask = sharedData.mbcs.unicodeMask;
4198 /* get the converter state from UConverter */
4201 /* sourceIndex=-1 if the current character began in the previous buffer */
4202 sourceIndex = c == 0 ? 0 : -1;
4203 nextSourceIndex = 0;
4205 boolean doloop = true;
4206 boolean doread = true;
// a non-zero c is a code unit carried over from the previous buffer; handle it before reading more
4207 if (c != 0 && target.hasRemaining()) {
4208 if (UTF16.isLeadSurrogate((char) c)) {
4209 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
4210 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
// copy the values the helper updated back into the locals
4213 sourceArrayIndex = x.sourceArrayIndex;
4214 sourceIndex = x.sourceIndex;
4215 nextSourceIndex = x.nextSourceIndex;
// loops while input remains; doread == false lets one more pass run even at the source limit
4222 while (!doread || sourceArrayIndex < source.limit()) {
4224 * This following test is to see if available input would overflow the output. It does not catch
4225 * output of more than one byte that overflows as a result of a multi-byte character or callback
4226 * output from the last source character. Therefore, those situations also test for overflows and
4227 * will then break the loop, too.
4229 if (target.hasRemaining()) {
4231 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate
4232 * pair for a "supplementary code point".
// absolute read: the buffer's own position is not advanced here
4236 c = source.get(sourceArrayIndex++);
4238 if (UTF16.isSurrogate((char) c)) {
4239 if (UTF16.isLeadSurrogate((char) c)) {
4241 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
4243 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
4245 sourceArrayIndex = x.sourceArrayIndex;
4246 sourceIndex = x.sourceIndex;
4247 nextSourceIndex = x.nextSourceIndex;
4255 /* this is an unmatched trail code unit (2nd surrogate) */
4256 /* callback(illegal) */
4257 cr[0] = CoderResult.malformedForLength(1);
4265 /* convert the Unicode code point in c into codepage bytes */
4266 value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);
4268 /* is this code point assigned, or do we use fallbacks? */
4269 if (value >= minValue) {
4270 /* assigned, write the output character bytes from value and length */
4272 /* this is easy because we know that there is enough space */
// only the low 8 bits of the 16-bit lookup result are emitted
4273 target.put((byte) value);
4274 if (offsets != null) {
4275 offsets.put(sourceIndex);
4278 /* normal end of conversion: prepare for a new character */
4280 sourceIndex = nextSourceIndex;
4281 } else { /* unassigned */
4282 /* try an extension mapping */
4283 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
4285 doloop = unassignedDouble(source, target, x, flush, cr);
4287 sourceArrayIndex = x.sourceArrayIndex;
4288 sourceIndex = x.sourceIndex;
4289 nextSourceIndex = x.nextSourceIndex;
4294 /* target is full */
4295 cr[0] = CoderResult.OVERFLOW;
4301 /* set the converter state back into UConverter */
4304 /* write back the updated pointers */
4305 source.position(sourceArrayIndex);
4310 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
/**
 * Encodes UTF-16 code units from {@code source} into one- or two-byte sequences in {@code target}.
 *
 * Visible behavior:
 * - the result bytes come from swapLFNLFromUnicodeBytes (selected when OPTION_SWAP_LFNL is set in
 *   options) or from fromUnicodeBytes;
 * - input is read with absolute gets at a local index; that index is written back to
 *   source.position() at the end;
 * - surrogates are only paired when the codepage's unicodeMask lacks HAS_SURROGATES; a lead
 *   surrogate is handed to getTrailDouble(), an unmatched trail surrogate sets
 *   CoderResult.malformedForLength(1);
 * - a code point that is neither a roundtrip nor an allowed non-zero fallback is handed to
 *   unassignedDouble() to try an extension mapping;
 * - a two-byte result whose low byte does not fit is split: the high byte goes to target, the
 *   low byte to errorBuffer, and CoderResult.OVERFLOW is set.
 *
 * NOTE(review): several declarations and the return statement are not visible in this excerpt;
 * confirm them against the full method.
 *
 * @param source  UTF-16 input, consumed starting at its current position
 * @param target  receives the encoded bytes
 * @param offsets if non-null, receives one source index per byte written here
 * @param flush   forwarded unchanged to getTrailDouble()/unassignedDouble()
 * @return a CoderResult; presumably the value left in cr[0] -- TODO confirm
 */
4311 private CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target,
4312 IntBuffer offsets, boolean flush) {
// one-element array so that the helper methods can replace the result in place
4313 CoderResult[] cr = { CoderResult.UNDERFLOW };
4315 int sourceArrayIndex;
4320 int c, sourceIndex, nextSourceIndex;
4327 /* use optimized function if possible */
4328 uniMask = sharedData.mbcs.unicodeMask;
4330 /* set up the local pointers */
4331 sourceArrayIndex = source.position();
4333 table = sharedData.mbcs.fromUnicodeTable;
// pick the result bytes: the LF/NL-swapped table is used when the option bit is set
4335 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
4336 bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes;
4338 bytes = sharedData.mbcs.fromUnicodeBytes;
4341 /* get the converter state from UConverter */
4344 /* sourceIndex=-1 if the current character began in the previous buffer */
4345 sourceIndex = c == 0 ? 0 : -1;
4346 nextSourceIndex = 0;
4348 /* conversion loop */
4349 boolean doloop = true;
4350 boolean doread = true;
// a non-zero c is a code unit carried over from the previous buffer; handle it before reading more
4351 if (c != 0 && target.hasRemaining()) {
4352 if (UTF16.isLeadSurrogate((char) c)) {
4353 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
4354 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
// copy the values the helper updated back into the locals
4357 sourceArrayIndex = x.sourceArrayIndex;
4358 sourceIndex = x.sourceIndex;
4359 nextSourceIndex = x.nextSourceIndex;
// loops while input remains; doread == false lets one more pass run even at the source limit
4366 while (!doread || sourceArrayIndex < source.limit()) {
4368 * This following test is to see if available input would overflow the output. It does not catch
4369 * output of more than one byte that overflows as a result of a multi-byte character or callback
4370 * output from the last source character. Therefore, those situations also test for overflows and
4371 * will then break the loop, too.
4373 if (target.hasRemaining()) {
4376 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched
4377 * surrogate pair for a "supplementary code point".
4379 c = source.get(sourceArrayIndex++);
4382 * This also tests if the codepage maps single surrogates. If it does, then surrogates are
4383 * not paired but mapped separately. Note that in this case unmatched surrogates are not
4386 if (UTF16.isSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
4387 if (UTF16.isLeadSurrogate((char) c)) {
4389 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
4391 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
4393 sourceArrayIndex = x.sourceArrayIndex;
4394 sourceIndex = x.sourceIndex;
4395 nextSourceIndex = x.nextSourceIndex;
4404 /* this is an unmatched trail code unit (2nd surrogate) */
4405 /* callback(illegal) */
4406 cr[0] = CoderResult.malformedForLength(1);
4414 /* convert the Unicode code point in c into codepage bytes */
4415 stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
4417 /* get the bytes and the length for the output */
4419 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
// results up to 0xff are single-byte output, larger ones take the two-byte path below
4420 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
4426 /* is this code point assigned, or do we use fallbacks? */
4427 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0))) {
4429 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way
4430 * with this data structure for fallback output to be a zero byte.
4434 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
4437 doloop = unassignedDouble(source, target, x, flush, cr);
4439 sourceArrayIndex = x.sourceArrayIndex;
4440 sourceIndex = x.sourceIndex;
4441 nextSourceIndex = x.nextSourceIndex;
4448 /* write the output character bytes from value and length */
4449 /* from the first if in the loop we know that targetCapacity>0 */
4451 /* this is easy because we know that there is enough space */
4452 target.put((byte) value);
4453 if (offsets != null) {
4454 offsets.put(sourceIndex);
4456 } else /* length==2 */{
4457 target.put((byte) (value >>> 8));
// The high byte has already been written, so a single free byte is enough for the low byte.
// The earlier test (2 <= target.remaining()) ran after that put and therefore required one byte
// more than needed: with exactly two free bytes it diverted the low byte to errorBuffer and
// reported OVERFLOW although the pair fit.
4458 if (target.hasRemaining()) {
4459 target.put((byte) value);
4460 if (offsets != null) {
// one offset entry per output byte; both bytes stem from the same source index
4461 offsets.put(sourceIndex);
4462 offsets.put(sourceIndex);
4465 if (offsets != null) {
4466 offsets.put(sourceIndex);
// no room for the low byte: park it in errorBuffer and report the overflow
4468 errorBuffer[0] = (byte) value;
4469 errorBufferLength = 1;
4471 /* target overflow */
4472 cr[0] = CoderResult.OVERFLOW;
4478 /* normal end of conversion: prepare for a new character */
4480 sourceIndex = nextSourceIndex;
4483 /* target is full */
4484 cr[0] = CoderResult.OVERFLOW;
4490 /* set the converter state back into UConverter */
4493 /* write back the updated pointers */
4494 source.position(sourceArrayIndex);
// Mutable holder that lets getTrailSingleBMP() hand updated values back to its caller:
// c is the current code unit / code point, sourceArrayIndex the read index into the source buffer.
4499 private final class SideEffectsSingleBMP {
4500 int c, sourceArrayIndex;
// NOTE(review): only the sourceArrayIndex assignment is visible here; c is presumably set from c_.
4502 SideEffectsSingleBMP(int c_, int sourceArrayIndex_) {
4504 sourceArrayIndex = sourceArrayIndex_;
4508 // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets
4509 // assumes input c is lead surrogate
// Looks at the code unit following a lead surrogate held in x.c.
// - trail surrogate found: x.sourceArrayIndex is advanced, x.c becomes the combined code point and
//   cr[0] is set to unmappableForLength(2);
// - otherwise cr[0] is set to malformedForLength(1).
// NOTE(review): the branch taken at the source limit and the boolean return values are not
// visible in this excerpt; confirm them against the full method.
4510 private final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr) {
4511 if (x.sourceArrayIndex < source.limit()) {
4512 /* test the following code unit */
// absolute read: peeks without moving the buffer position
4513 char trail = source.get(x.sourceArrayIndex);
4514 if (UTF16.isTrailSurrogate(trail)) {
4515 ++x.sourceArrayIndex;
4516 x.c = UCharacter.getCodePoint((char) x.c, trail);
4517 /* this codepage does not map supplementary code points */
4518 /* callback(unassigned) */
4519 cr[0] = CoderResult.unmappableForLength(2);
4522 /* this is an unmatched lead code unit (1st surrogate) */
4523 /* callback(illegal) */
4524 cr[0] = CoderResult.malformedForLength(1);
// Mutable holder for the loop state that getTrail() and unassigned() read and update on behalf of
// their caller: the current code point, the read index into the source, the offset bookkeeping
// indexes and the previous output length.
4534 private final class SideEffects {
4535 int c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength;
4536 boolean doread = true;
// NOTE(review): the end of the parameter list and the assignment of c are not visible here.
4538 SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_,
4541 sourceArrayIndex = sourceArrayIndex_;
4542 sourceIndex = sourceIndex_;
4543 nextSourceIndex = nextSourceIndex_;
4544 prevSourceIndex = prevSourceIndex_;
4545 prevLength = prevLength_;
4549 // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets
4550 // assumes input c is lead surrogate
// Looks at the code unit following a lead surrogate held in x.c.
// - trail surrogate found: x.sourceArrayIndex and x.nextSourceIndex are advanced and x.c becomes
//   the combined code point; if the codepage's uniMask lacks HAS_SUPPLEMENTARY, x.prevLength is
//   saved into fromUnicodeStatus and the result of unassigned() is returned;
// - otherwise cr[0] is set to malformedForLength(1).
// NOTE(review): the branch taken at the source limit and the remaining return values are not
// visible in this excerpt; confirm them against the full method.
4551 private final boolean getTrail(CharBuffer source, ByteBuffer target, int uniMask, SideEffects x,
4552 boolean flush, CoderResult[] cr) {
4553 if (x.sourceArrayIndex < source.limit()) {
4554 /* test the following code unit */
// absolute read: peeks without moving the buffer position
4555 char trail = source.get(x.sourceArrayIndex);
4556 if (UTF16.isTrailSurrogate(trail)) {
4557 ++x.sourceArrayIndex;
4558 ++x.nextSourceIndex;
4559 /* convert this supplementary code point */
4560 x.c = UCharacter.getCodePoint((char) x.c, trail);
4561 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
4562 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4563 fromUnicodeStatus = x.prevLength; /* save the old state */
4564 /* callback(unassigned) */
// note: no offsets buffer is passed on (third argument is null)
4566 return unassigned(source, target, null, x, flush, cr);
4572 /* this is an unmatched lead code unit (1st surrogate) */
4573 /* callback(illegal) */
4574 cr[0] = CoderResult.malformedForLength(1);
4583 // function made out of block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets
// Tries an extension mapping for x.c through fromU() and folds the outcome back into x:
// x.c takes fromU()'s return value, x.sourceArrayIndex the new source position,
// x.nextSourceIndex grows by the number of code units fromU() consumed, and x.prevLength is
// reloaded from fromUnicodeStatus.
// The offsets argument is only tested for null here; fromU() itself is called with null offsets.
// NOTE(review): the body of the error branch and the return values are not visible in this
// excerpt; confirm them against the full method.
4584 private final boolean unassigned(CharBuffer source, ByteBuffer target, IntBuffer offsets, SideEffects x,
4585 boolean flush, CoderResult[] cr) {
4586 /* try an extension mapping */
// remember where we started so the number of consumed code units can be computed afterwards
4587 int sourceBegin = x.sourceArrayIndex;
// fromU() works on the buffer position, so publish the local index first
4588 source.position(x.sourceArrayIndex);
4589 x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr);
4590 x.sourceArrayIndex = source.position();
4591 x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
4592 x.prevLength = fromUnicodeStatus;
4594 if (cr[0].isError()) {
4595 /* not mappable or buffer overflow */
4598 /* a mapping was written to the target, continue */
4600 /* recalculate the targetCapacity after an extension mapping */
4601 // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex;
4602 /* normal end of conversion: prepare for a new character */
4603 if (offsets != null) {
4604 x.prevSourceIndex = x.sourceIndex;
4605 x.sourceIndex = x.nextSourceIndex;
// Mutable holder for the loop state that getTrailDouble() and unassignedDouble() read and update
// on behalf of the single-byte and double-byte conversion loops: the current code point, the read
// index into the source and the two offset bookkeeping indexes.
4611 private final class SideEffectsDouble {
4612 int c, sourceArrayIndex, sourceIndex, nextSourceIndex;
4613 boolean doread = true;
// NOTE(review): the assignment of c is not visible here; presumably it is set from c_.
4615 SideEffectsDouble(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_) {
4617 sourceArrayIndex = sourceArrayIndex_;
4618 sourceIndex = sourceIndex_;
4619 nextSourceIndex = nextSourceIndex_;
4623 // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets
4624 // assumes input c is lead surrogate
// Looks at the code unit following a lead surrogate held in x.c.
// - trail surrogate found: x.sourceArrayIndex and x.nextSourceIndex are advanced and x.c becomes
//   the combined code point; if the codepage's uniMask lacks HAS_SUPPLEMENTARY the result of
//   unassignedDouble() is returned;
// - otherwise cr[0] is set to malformedForLength(1).
// NOTE(review): the branch taken at the source limit and the remaining return values are not
// visible in this excerpt; confirm them against the full method.
4625 private final boolean getTrailDouble(CharBuffer source, ByteBuffer target, int uniMask,
4626 SideEffectsDouble x, boolean flush, CoderResult[] cr) {
4627 if (x.sourceArrayIndex < source.limit()) {
4628 /* test the following code unit */
// absolute read: peeks without moving the buffer position
4629 char trail = source.get(x.sourceArrayIndex);
4630 if (UTF16.isTrailSurrogate(trail)) {
4631 ++x.sourceArrayIndex;
4632 ++x.nextSourceIndex;
4633 /* convert this supplementary code point */
4634 x.c = UCharacter.getCodePoint((char) x.c, trail);
4635 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
4636 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
4637 /* callback(unassigned) */
4639 return unassignedDouble(source, target, x, flush, cr);
4645 /* this is an unmatched lead code unit (1st surrogate) */
4646 /* callback(illegal) */
4647 cr[0] = CoderResult.malformedForLength(1);
4656 // function made out of block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets
// Tries an extension mapping for x.c through fromU() and folds the outcome back into x:
// x.c takes fromU()'s return value, x.sourceArrayIndex the new source position and
// x.nextSourceIndex grows by the number of code units fromU() consumed; x.sourceIndex is then
// moved up to x.nextSourceIndex for the next character.
// fromU() is called with null offsets.
// NOTE(review): the body of the error branch and the return values are not visible in this
// excerpt; confirm them against the full method.
4657 private final boolean unassignedDouble(CharBuffer source, ByteBuffer target, SideEffectsDouble x,
4658 boolean flush, CoderResult[] cr) {
4659 /* try an extension mapping */
// remember where we started so the number of consumed code units can be computed afterwards
4660 int sourceBegin = x.sourceArrayIndex;
// fromU() works on the buffer position, so publish the local index first
4661 source.position(x.sourceArrayIndex);
4662 x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr);
4663 x.sourceArrayIndex = source.position();
4664 x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
4666 if (cr[0].isError()) {
4667 /* not mappable or buffer overflow */
4670 /* a mapping was written to the target, continue */
4672 /* recalculate the targetCapacity after an extension mapping */
4673 // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex;
4674 /* normal end of conversion: prepare for a new character */
4675 x.sourceIndex = x.nextSourceIndex;
4681 * Overrides super class method
// Writes the substitution byte sequence for an unmappable code point.
// - subChar1 is chosen when it is non-zero and either (extension data present) the encoder's
//   useSubChar1 flag is set, or (no extension data) the first invalid UChar is <= 0xff;
//   otherwise subChar/subCharLen of the charset are used;
// - encoder.useSubChar1 is cleared afterwards;
// - for MBCS_OUTPUT_2_SISO charsets a small buffer is filled with an SI or SO byte, when
//   encoder.fromUnicodeStatus calls for a mode switch, followed by the substitution byte(s);
// - the bytes are emitted through CharsetEncoderICU.fromUWriteBytes() using source.position() as
//   the source index.
// NOTE(review): the declarations of subchar/length/i, the switch on the length and the hand-over
// of buffer to subchar are not visible in this excerpt; confirm them against the full method.
4689 protected CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target,
4690 IntBuffer offsets) {
4691 CharsetMBCS cs = (CharsetMBCS) encoder.charset();
4695 if (cs.subChar1 != 0
4696 && (cs.sharedData.mbcs.extIndexes != null ? encoder.useSubChar1
4697 : (encoder.invalidUCharBuffer[0] <= 0xff))) {
4699 * select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS
4702 subchar = new byte[] { cs.subChar1 };
4705 /* select subChar in all other cases */
4706 subchar = cs.subChar;
4707 length = cs.subCharLen;
4710 /* reset the selector for the next code point */
4711 encoder.useSubChar1 = false;
4713 if (cs.sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) {
// room for one shift byte plus the substitution bytes
4714 byte[] buffer = new byte[4];
4717 /* fromUnicodeStatus contains prevLength */
4720 if (encoder.fromUnicodeStatus == 2) {
4721 /* DBCS mode and SBCS sub char: change to SBCS */
4722 encoder.fromUnicodeStatus = 1;
4723 buffer[i++] = UConverterConstants.SI;
4725 buffer[i++] = subchar[0];
4728 if (encoder.fromUnicodeStatus <= 1) {
4729 /* SBCS mode and DBCS sub char: change to DBCS */
4730 encoder.fromUnicodeStatus = 2;
4731 buffer[i++] = UConverterConstants.SO;
4733 buffer[i++] = subchar[0];
4734 buffer[i++] = subchar[1];
4737 throw new IllegalArgumentException();
4743 return CharsetEncoderICU.fromUWriteBytes(encoder, subchar, 0, length, target, offsets, source.position());
4747 * Gets called whenever CharsetEncoder.replaceWith gets called. allowReplacementChanges only allows subChar and
4748 * subChar1 to be modified outside construction (since replaceWith is called once during construction).
4750 * @param replacement
4751 * The replacement for subchar.
// When allowReplacementChanges is set, copies the replacement bytes into the charset's subChar
// array and records their count in subCharLen.
// NOTE(review): the copy writes into the CharsetMBCS object returned by charset(), not into
// per-encoder state, and no check that replacement.length fits cs.subChar is visible in this
// excerpt -- confirm both are intended.
4753 protected void implReplaceWith(byte[] replacement) {
4754 if (allowReplacementChanges) {
4755 CharsetMBCS cs = (CharsetMBCS) this.charset();
4757 System.arraycopy(replacement, 0, cs.subChar, 0, replacement.length);
// narrowing cast: the length is stored as a byte
4758 cs.subCharLen = (byte) replacement.length;
// Creates a new CharsetDecoderMBCS bound to this charset.
4764 public CharsetDecoder newDecoder() {
4765 return new CharsetDecoderMBCS(this);
// Creates a new CharsetEncoderMBCS bound to this charset.
4768 public CharsetEncoder newEncoder() {
4769 return new CharsetEncoderMBCS(this);
/**
 * Enumerates the from-Unicode stage tables of {@code data} and collects into {@code setFillIn}
 * the code points whose mapping passes {@code filter}, then lets extGetUnicodeSet() add the
 * mappings from the extension table.
 *
 * The running code point c advances by one per stage 3 entry, by 16 for a skipped stage 3
 * block and by 1024 for a skipped stage 2 block. Single-byte tables (MBCS_OUTPUT_1) are read as
 * 16-bit results and compared against minValue; other output types go through the filter
 * switch, where useFallBack is true when which == ROUNDTRIP_AND_FALLBACK_SET.
 *
 * NOTE(review): the setFillIn.add(...) calls, several declarations and the branch/else lines
 * are not visible in this excerpt; confirm them against the full method.
 * NOTE(review): the filter loops call ByteBuffer.wrap(bytes) for every code point; a single
 * wrapper hoisted out of the loops would presumably behave the same -- confirm before changing.
 * NOTE(review): in the st3Multiplier switch, getChar() reads two bytes at a byte index while the
 * index advances by one (and stage3 / stage3+1 overlap); check whether single-byte reads were
 * intended.
 *
 * @param data      shared converter data whose mbcs tables are enumerated
 * @param setFillIn set that receives the results
 * @param which     ROUNDTRIP_SET or ROUNDTRIP_AND_FALLBACK_SET
 * @param filter    one of the UCNV_SET_FILTER_* constants
 */
4772 @SuppressWarnings("fallthrough")
4773 void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){
4774 UConverterMBCSTable mbcsTable;
4776 char st1,maxStage1, st2;
4780 mbcsTable = data.mbcs;
4781 table = mbcsTable.fromUnicodeTable;
4782 if((mbcsTable.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY)!=0){
4788 c=0; /* keep track of current code point while enumerating */
4790 if(mbcsTable.outputType==MBCS_OUTPUT_1){
4791 char stage2, stage3;
// view the result bytes as 16-bit values (big-endian, the ByteBuffer default)
4794 results = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();
4796 if(which==ROUNDTRIP_SET) {
4797 /* use only roundtrips */
4800 /* use all roundtrip and fallback results */
4803 for(st1=0;st1<maxStage1;++st1){
4807 for(st2=0; st2<64; ++st2){
4808 st3 = table[stage2 + st2];
4810 /*read the stage 3 block */
4813 if(results.get(stage3++)>=minValue){
// a stage 3 block covers 16 code points; the loop ends when c reaches the next multiple of 16
4817 }while((++c&0xf) !=0);
4819 c+= 16; /*empty stage 3 block */
4823 c+=1024; /* empty stage 2 block */
4831 boolean useFallBack;
4832 bytes = mbcsTable.fromUnicodeBytes;
4833 useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET);
4834 switch(mbcsTable.outputType) {
4836 case MBCS_OUTPUT_4_EUC:
4846 //ByteBuffer buffer = (ByteBuffer)charTobyte(table);
4848 for(st1=0;st1<maxStage1;++st1){
4850 if(st2>(maxStage1>>1)){
4852 for(st2=0;st2<128;++st2){
4853 /*read the stage 3 block */
// a stage 2 entry is assembled from two consecutive 16-bit table values (high half first)
4854 st3 = table[stage2*2 + st2]<<16;
4855 st3+=table[stage2*2 + ++st2];
4857 //if((st3=table[stage2+st2])!=0){
4858 stage3 = st3Multiplier*16*(st3&UConverterConstants.UNSIGNED_SHORT_MASK);
4860 /* get the roundtrip flags for the stage 3 block */
4862 st3 &= UConverterConstants.UNSIGNED_SHORT_MASK;
4864 case UCNV_SET_FILTER_NONE:
4869 stage3+=st3Multiplier;
4870 }else if (useFallBack) {
4873 switch(st3Multiplier) {
4876 b|= ByteBuffer.wrap(bytes).getChar(stage3++);
4880 b|= ByteBuffer.wrap(bytes).getChar(stage3++);
4884 b|= ByteBuffer.wrap(bytes).getChar(stage3) | ByteBuffer.wrap(bytes).getChar(stage3+1);
4894 }while((++c&0xf)!=0);
4896 case UCNV_SET_FILTER_DBCS_ONLY:
4897 /* Ignore single bytes results (<0x100). */
4899 if(((st3&1) != 0 || useFallBack) &&
4900 (UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))) >= 0x100){
4905 }while((++c&0xf) != 0);
4907 case UCNV_SET_FILTER_2022_CN :
4908 /* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2022-CN. */
4910 if(((st3&1) != 0 || useFallBack) &&
4911 ((value= (UConverterConstants.UNSIGNED_BYTE_MASK & (ByteBuffer.wrap(bytes).get(stage3))))==0x81 || value==0x82) ){
4916 }while((++c&0xf)!=0);
4918 case UCNV_SET_FILTER_SJIS:
4919 /* only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
4922 if(((st3&1) != 0 || useFallBack) && (value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))>=0x8140 && value<=0xeffc){
4927 }while((++c&0xf)!=0);
4929 case UCNV_SET_FILTER_GR94DBCS:
4930 /* only add code points that map to ISO 2022 GR 94 DBCS codes*/
4932 if(((st3&1) != 0 || useFallBack) &&
4933 (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))- 0xa1a1))<=(0xfefe - 0xa1a1) &&
4934 (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
4939 }while((++c&0xf)!=0);
4941 case UCNV_SET_FILTER_HZ:
4942 /*Only add code points that are suitable for HZ DBCS*/
4944 if( ((st3&1) != 0 || useFallBack) &&
4945 (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))-0xa1a1))<=(0xfdfe - 0xa1a1) &&
4946 (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
4951 }while((++c&0xf) != 0);
4957 c+=16; /* empty stage 3 block */
4961 c+=1024; /*empty stage2 block */
// finally add what the extension table maps
4965 extGetUnicodeSet(setFillIn, which, filter, data);
/**
 * Walks one section of the extension from-Unicode string table and adds the mapped strings to
 * {@code setFillIn}, recursing into partial matches.
 *
 * The first pair of the section holds the entry count and the value for the prefix collected so
 * far in {@code s[0..length)}. Each following entry contributes one more code unit, stored at
 * {@code s[length]}: a partial value recurses with length+1 and U_SENTINEL as code point, a
 * complete value that passes the roundtrip/fallback and minLength tests adds the string
 * {@code s[0..length]}.
 *
 * NOTE(review): the handling of c >= 0 and the value == 0 test are not visible in this excerpt;
 * confirm them against the full method.
 *
 * @param cx           extension data buffer
 * @param setFillIn    set that receives the results
 * @param useFallback  whether fallback mappings are accepted in addition to roundtrips
 * @param minLength    minimum output length a mapping must have to be added
 * @param c            initial code point, or a negative value when the prefix is a string
 * @param s            scratch array holding the code units matched so far; written at s[length]
 * @param length       number of valid code units in s
 * @param sectionIndex index of the section, relative to the start of the section arrays
 */
4968 static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback,
4969 int minLength, int c, char s[],int length,int sectionIndex){
4970 CharBuffer fromUSectionUChar;
4971 IntBuffer fromUSectionValues;
4972 fromUSectionUChar = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX,char.class );
4973 fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class );
// absolute indexes of this section within the two parallel arrays
4974 int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex;
4975 int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex;
4976 int value, i, count;
4978 /* read first pair of the section */
4979 count = fromUSectionUChar.get(fromUSectionUCharIndex++);
4980 value = fromUSectionValues.get(fromUSectionValuesIndex++);
4981 if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) {
4985 StringBuilder normalizedStringBuilder = new StringBuilder();
4986 for(int j=0; j<length;j++){
4987 normalizedStringBuilder.append(s[j]);
4989 String normalizedString = normalizedStringBuilder.toString();
// NOTE(review): this adds the same string once per code unit; a single add looks sufficient
// (it would differ only for length == 0) -- confirm before simplifying.
4990 for(int j=0;j<length;j++){
4991 setFillIn.add(normalizedString);
4996 for(i=0; i<count; ++i){
// extend the prefix by this entry's code unit
4997 s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i);
4998 value = fromUSectionValues.get(fromUSectionValuesIndex + i);
5001 /* no mapping, do nothing */
5002 } else if (FROM_U_IS_PARTIAL(value)) {
5003 extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, UConverterConstants.U_SENTINEL, s, length+1,
5004 FROM_U_GET_PARTIAL_INDEX(value));
5005 } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG))
5006 && FROM_U_GET_LENGTH(value)>=minLength) {
5007 StringBuilder normalizedStringBuilder = new StringBuilder(); // String for composite characters
5008 for(int j=0; j<(length+1);j++){
5009 normalizedStringBuilder.append(s[j]);
5011 setFillIn.add(normalizedStringBuilder.toString());
/**
 * Enumerates the extension from-Unicode tables of {@code Data} and adds the code points (and,
 * through extGetUnicodeSetString(), the strings) whose mapping passes {@code filter} and the
 * minimum-length test to {@code setFillIn}.
 *
 * The running code point c advances by one per stage 3 entry, by 16 for a skipped stage 3
 * block and by 1024 for a skipped stage 2 block.
 *
 * NOTE(review): the assignments of minLength, the null check of the extension buffer, the
 * switch header and the setFillIn.add(...) call are not visible in this excerpt; confirm them
 * against the full method.
 *
 * @param setFillIn set that receives the results
 * @param which     ROUNDTRIP_SET or ROUNDTRIP_AND_FALLBACK_SET
 * @param filter    one of the UCNV_SET_FILTER_* constants
 * @param Data      shared converter data whose mbcs.extIndexes buffer is enumerated
 */
5018 static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){
5019 int st1, stage1Length, st2, st3, minLength;
5022 CharBuffer stage12, stage3;
5025 boolean useFallback;
// scratch buffer for the code units of a multi-character match
5026 char s[] = new char[MAX_UCHARS];
5028 ByteBuffer cx = Data.mbcs.extIndexes;
5032 stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class );
5033 stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class );
5034 stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class );
5036 stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH);
5037 useFallback = (which==ROUNDTRIP_AND_FALLBACK_SET);
5040 if(filter == UCNV_SET_FILTER_2022_CN) {
5042 } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) {
5043 /* DBCS-only, ignore single-byte results */
5049 for(st1=0; st1< stage1Length; ++st1){
5050 st2 = stage12.get(st1);
5051 if(st2>stage1Length) {
5053 for(st2=0;st2<64;++st2){
5054 st3=((int) stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT;
// stage 3 holds 16-bit indexes into stage3b, which holds the mapping values
5058 value = stage3b.get(UConverterConstants.UNSIGNED_SHORT_MASK&stage3.get(ps3++));
5060 /* no mapping do nothing */
5061 }else if (FROM_U_IS_PARTIAL(value)){
// c starts a multi-character match: store it in s and let the string walker continue
5063 length=UTF16.append(s, length, c);
5064 extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,FROM_U_GET_PARTIAL_INDEX(value));
5065 } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0 :((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))== FROM_U_ROUNDTRIP_FLAG)) &&
5066 FROM_U_GET_LENGTH(value)>=minLength){
5069 case UCNV_SET_FILTER_2022_CN:
5070 if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){
5074 case UCNV_SET_FILTER_SJIS:
5075 if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){
5079 case UCNV_SET_FILTER_GR94DBCS:
5080 if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfefe - 0xa1a1)
5081 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
5086 case UCNV_SET_FILTER_HZ:
5087 if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfdfe - 0xa1a1)
5088 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
5094 * UCNV_SET_FILTER_NONE,
5095 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
5102 }while((++c&0xf) != 0);
5105 c+=16; /* empty stage 3 block */
5109 c+=1024; /* empty stage 2 block*/
// Convenience wrapper around MBCSGetFilteredUnicodeSetForUnicode(): uses
// UCNV_SET_FILTER_DBCS_ONLY for DBCS-only tables and UCNV_SET_FILTER_NONE otherwise.
// NOTE(review): the output type is read from this.sharedData rather than from the data
// parameter; the two only agree when callers pass this charset's own shared data -- confirm.
5114 void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){
5115 MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which,
5116 this.sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? UCNV_SET_FILTER_DBCS_ONLY : UCNV_SET_FILTER_NONE );
// Fills setFillIn with the code points this charset can encode.
// When the MBCS_OPTION_GB18030 bit is set, the ranges U+0000..U+D7FF and U+E000..U+10FFFF
// (everything except the surrogates) are added directly; the table enumeration is delegated to
// MBCSGetUnicodeSetForUnicode().
// NOTE(review): whether the delegation is an else branch or always runs is not visible in this
// excerpt; confirm against the full method.
5119 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
5120 if((options & MBCS_OPTION_GB18030)!=0){
5121 setFillIn.add(0, 0xd7ff);
5122 setFillIn.add(0xe000, 0x10ffff);
5125 this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which);