2 *******************************************************************************
\r
3 * Copyright (C) 2006-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 *******************************************************************************
\r
9 package com.ibm.icu.charset;
\r
11 import java.io.BufferedInputStream;
\r
12 import java.io.IOException;
\r
13 import java.io.InputStream;
\r
14 import java.nio.Buffer;
\r
15 import java.nio.BufferOverflowException;
\r
16 import java.nio.ByteBuffer;
\r
17 import java.nio.CharBuffer;
\r
18 import java.nio.IntBuffer;
\r
19 import java.nio.charset.CharsetDecoder;
\r
20 import java.nio.charset.CharsetEncoder;
\r
21 import java.nio.charset.CoderResult;
\r
23 import com.ibm.icu.charset.UConverterSharedData.UConverterType;
\r
24 import com.ibm.icu.impl.ICUData;
\r
25 import com.ibm.icu.impl.ICUResourceBundle;
\r
26 import com.ibm.icu.impl.InvalidFormatException;
\r
27 import com.ibm.icu.lang.UCharacter;
\r
28 import com.ibm.icu.text.UTF16;
\r
29 import com.ibm.icu.text.UnicodeSet;
\r
30 import com.ibm.icu.charset.UConverterConstants;
\r
32 class CharsetMBCS extends CharsetICU {
\r
34 private byte[] fromUSubstitution = null;
\r
35 UConverterSharedData sharedData = null;
\r
36 private static final int MAX_VERSION_LENGTH = 4;
\r
38 // these variables are used in getUnicodeSet() and may be changed in future
\r
39 // typedef enum UConverterSetFilter {
\r
40 static final int UCNV_SET_FILTER_NONE = 1;
\r
41 static final int UCNV_SET_FILTER_DBCS_ONLY = 2;
\r
42 static final int UCNV_SET_FILTER_2022_CN = 3;
\r
43 static final int UCNV_SET_FILTER_SJIS= 4 ;
\r
44 static final int UCNV_SET_FILTER_GR94DBCS = 5;
\r
45 static final int UCNV_SET_FILTER_HZ = 6;
\r
46 static final int UCNV_SET_FILTER_COUNT = 7;
\r
47 // } UConverterSetFilter;
\r
50 * Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of
\r
51 * this type. They are sorted by offset.
\r
53 final class MBCSToUFallback {
\r
59 * This is the MBCS part of the UConverterTable union (a runtime data structure). It keeps all the per-converter
\r
60 * data and points into the loaded mapping tables.
\r
62 static final class UConverterMBCSTable {
\r
66 boolean stateTableOwned;
\r
67 int countToUFallbacks;
\r
69 int stateTable[/* countStates */][/* 256 */];
\r
70 int swapLFNLStateTable[/* countStates */][/* 256 */]; /* for swaplfnl */
\r
71 char unicodeCodeUnits[/* countUnicodeResults */];
\r
72 MBCSToUFallback toUFallbacks[/* countToUFallbacks */];
\r
75 char fromUnicodeTable[];
\r
76 byte fromUnicodeBytes[];
\r
77 byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */
\r
78 int fromUBytesLength;
\r
79 short outputType, unicodeMask;
\r
81 /* converter name for swaplfnl */
\r
82 String swapLFNLName;
\r
84 /* extension data */
\r
85 UConverterSharedData baseSharedData;
\r
86 // int extIndexes[];
\r
87 ByteBuffer extIndexes; // create int[] view etc. as needed
\r
89 CharBuffer mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */
\r
90 char sbcsIndex[/* SBCS_FAST_LIMIT>>6 */]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */
\r
91 boolean utf8Friendly; /* for utf8Friendly data */
\r
92 char maxFastUChar; /* for utf8Friendly data */
\r
95 long asciiRoundtrips;
\r
97 UConverterMBCSTable() {
\r
98 utf8Friendly = false;
\r
100 sbcsIndex = new char[SBCS_FAST_LIMIT>>6];
\r
104 * UConverterMBCSTable(UConverterMBCSTable t) { countStates = t.countStates; dbcsOnlyState = t.dbcsOnlyState;
\r
105 * stateTableOwned = t.stateTableOwned; countToUFallbacks = t.countToUFallbacks; stateTable = t.stateTable;
\r
106 * swapLFNLStateTable = t.swapLFNLStateTable; unicodeCodeUnits = t.unicodeCodeUnits; toUFallbacks =
\r
107 * t.toUFallbacks; fromUnicodeTable = t.fromUnicodeTable; fromUnicodeBytes = t.fromUnicodeBytes;
\r
108 * swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes; fromUBytesLength = t.fromUBytesLength; outputType =
\r
109 * t.outputType; unicodeMask = t.unicodeMask; swapLFNLName = t.swapLFNLName; baseSharedData = t.baseSharedData;
\r
110 * extIndexes = t.extIndexes; }
\r
114 /* Constants used in MBCS data header */
\r
116 static final int MBCS_OPT_LENGTH_MASK=0x3f;
\r
117 static final int MBCS_OPT_NO_FROM_U=0x40;
\r
119 * If any of the following option bits are set,
\r
120 * then the file must be rejected.
\r
122 static final int MBCS_OPT_INCOMPATIBLE_MASK=0xffc0;
\r
124 * Remove bits from this mask as more options are recognized
\r
125 * by all implementations that use this constant.
\r
127 static final int MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80;
\r
129 /* Constants for fast and UTF-8-friendly conversion. */
\r
131 static final int SBCS_FAST_MAX=0x0fff; /* maximum code point with UTF-8-friendly SBCS runtime code, see makeconv SBCS_UTF8_MAX */
\r
132 static final int SBCS_FAST_LIMIT=SBCS_FAST_MAX+1; /* =0x1000 */
\r
133 static final int MBCS_FAST_MAX=0xd7ff; /* maximum code point with UTF-8-friendly MBCS runtime code, see makeconv MBCS_UTF8_MAX */
\r
134 static final int MBCS_FAST_LIMIT=MBCS_FAST_MAX+1; /* =0xd800 */
\r
137 * MBCS data header. See data format description above.
\r
139 final class MBCSHeader {
\r
140 byte version[/* U_MAX_VERSION_LENGTH */];
\r
141 int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes;
\r
143 int fromUBytesLength;
\r
145 /* new and required in version 5 */
\r
148 /* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */
\r
149 int fullStage2Length; /* number of 32-bit units */
\r
152 version = new byte[MAX_VERSION_LENGTH];
\r
156 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath,
\r
157 ClassLoader loader) throws InvalidFormatException {
\r
158 super(icuCanonicalName, javaCanonicalName, aliases);
\r
160 /* See if the icuCanonicalName contains certain option information. */
\r
161 if (icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING) > -1) {
\r
162 options = UConverterConstants.OPTION_SWAP_LFNL;
\r
163 icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING));
\r
164 super.icuCanonicalName = icuCanonicalName;
\r
167 // now try to load the data
\r
168 sharedData = loadConverter(1, icuCanonicalName, classPath, loader);
\r
170 maxBytesPerChar = sharedData.staticData.maxBytesPerChar;
\r
171 minBytesPerChar = sharedData.staticData.minBytesPerChar;
\r
172 maxCharsPerByte = 1;
\r
173 fromUSubstitution = sharedData.staticData.subChar;
\r
174 subChar = sharedData.staticData.subChar;
\r
175 subCharLen = sharedData.staticData.subCharLen;
\r
176 subChar1 = sharedData.staticData.subChar1;
\r
177 fromUSubstitution = new byte[sharedData.staticData.subCharLen];
\r
178 System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen);
\r
180 initializeConverter(options);
\r
183 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases)
\r
184 throws InvalidFormatException {
\r
185 this(icuCanonicalName, javaCanonicalName, aliases, ICUResourceBundle.ICU_BUNDLE, null);
\r
188 private UConverterSharedData loadConverter(int nestedLoads, String myName, String classPath, ClassLoader loader)
\r
189 throws InvalidFormatException {
\r
190 boolean noFromU = false;
\r
191 // Read converter data from file
\r
192 UConverterStaticData staticData = new UConverterStaticData();
\r
193 UConverterDataReader reader = null;
\r
195 String resourceName = classPath + "/" + myName + "." + UConverterSharedData.DATA_TYPE;
\r
198 if (loader != null) {
\r
199 i = ICUData.getRequiredStream(loader, resourceName);
\r
201 i = ICUData.getRequiredStream(resourceName);
\r
203 BufferedInputStream b = new BufferedInputStream(i, UConverterConstants.CNV_DATA_BUFFER_SIZE);
\r
204 reader = new UConverterDataReader(b);
\r
205 reader.readStaticData(staticData);
\r
206 } catch (IOException e) {
\r
207 throw new InvalidFormatException();
\r
208 } catch (Exception e) {
\r
209 throw new InvalidFormatException();
\r
212 UConverterSharedData data = null;
\r
213 int type = staticData.conversionType;
\r
215 if (type != UConverterSharedData.UConverterType.MBCS
\r
216 || staticData.structSize != UConverterStaticData.SIZE_OF_UCONVERTER_STATIC_DATA) {
\r
217 throw new InvalidFormatException();
\r
220 data = new UConverterSharedData(1, null, false, 0);
\r
221 data.dataReader = reader;
\r
222 data.staticData = staticData;
\r
223 data.sharedDataCached = false;
\r
226 UConverterMBCSTable mbcsTable = data.mbcs;
\r
227 MBCSHeader header = new MBCSHeader();
\r
229 reader.readMBCSHeader(header);
\r
230 } catch (IOException e) {
\r
231 throw new InvalidFormatException();
\r
235 // int[] extIndexesArray = null;
\r
236 String baseNameString = null;
\r
237 int[][] stateTableArray = null;
\r
238 MBCSToUFallback[] toUFallbacksArray = null;
\r
239 char[] unicodeCodeUnitsArray = null;
\r
240 char[] fromUnicodeTableArray = null;
\r
241 byte[] fromUnicodeBytesArray = null;
\r
243 if (header.version[0] == 5 && header.version[1] >= 3 && (header.options & MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK) == 0) {
\r
244 noFromU = ((header.options & MBCS_OPT_NO_FROM_U) != 0);
\r
245 } else if (header.version[0] != 4) {
\r
246 throw new InvalidFormatException();
\r
249 mbcsTable.outputType = (byte) header.flags;
\r
251 /* extension data, header version 4.2 and higher */
\r
252 offset = header.flags >>> 8;
\r
253 // if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
\r
254 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
\r
256 baseNameString = reader.readBaseTableName();
\r
258 // agljport:comment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null
\r
259 // terminator byte all already read;
\r
260 mbcsTable.extIndexes = reader.readExtIndexes(offset
\r
261 - (reader.bytesRead - reader.staticDataBytesRead));
\r
263 } catch (IOException e) {
\r
264 throw new InvalidFormatException();
\r
268 // agljport:add this would be unnecessary if extIndexes were memory mapped
\r
270 * if(mbcsTable.extIndexes != null) {
\r
272 * try { //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 +
\r
273 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 +
\r
274 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 +
\r
275 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] +
\r
276 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 +
\r
277 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 +
\r
278 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; //int nbytes =
\r
279 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] //byte[] extTables = dataReader.readExtTables(nbytes);
\r
280 * //mbcsTable.extTables = ByteBuffer.wrap(extTables); } catch(IOException e) { System.err.println("Caught
\r
281 * IOException: " + e.getMessage()); pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; return; } }
\r
283 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) {
\r
284 UConverterSharedData baseSharedData = null;
\r
285 ByteBuffer extIndexes;
\r
288 /* extension-only file, load the base table and set values appropriately */
\r
289 extIndexes = mbcsTable.extIndexes;
\r
290 if (extIndexes == null) {
\r
291 /* extension-only file without extension */
\r
292 throw new InvalidFormatException();
\r
295 if (nestedLoads != 1) {
\r
296 /* an extension table must not be loaded as a base table */
\r
297 throw new InvalidFormatException();
\r
300 /* load the base table */
\r
301 baseName = baseNameString;
\r
302 if (baseName.equals(staticData.name)) {
\r
303 /* forbid loading this same extension-only file */
\r
304 throw new InvalidFormatException();
\r
307 // agljport:fix args.size=sizeof(UConverterLoadArgs);
\r
308 baseSharedData = loadConverter(2, baseName, classPath, loader);
\r
310 if (baseSharedData.staticData.conversionType != UConverterType.MBCS
\r
311 || baseSharedData.mbcs.baseSharedData != null) {
\r
312 // agljport:fix ucnv_unload(baseSharedData);
\r
313 throw new InvalidFormatException();
\r
316 /* copy the base table data */
\r
317 // agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't
\r
318 // need the deep copy so can just make sure mbcs and its local reference both refer to the same new object
\r
319 mbcsTable = data.mbcs = baseSharedData.mbcs;
\r
321 /* overwrite values with relevant ones for the extension converter */
\r
322 mbcsTable.baseSharedData = baseSharedData;
\r
323 mbcsTable.extIndexes = extIndexes;
\r
326 * It would be possible to share the swapLFNL data with a base converter, but the generated name would have
\r
327 * to be different, and the memory would have to be free'd only once. It is easier to just create the data
\r
328 * for the extension converter separately when it is requested.
\r
330 mbcsTable.swapLFNLStateTable = null;
\r
331 mbcsTable.swapLFNLFromUnicodeBytes = null;
\r
332 mbcsTable.swapLFNLName = null;
\r
335 * Set a special, runtime-only outputType if the extension converter is a DBCS version of a base converter
\r
336 * that also maps single bytes.
\r
338 if (staticData.conversionType == UConverterType.DBCS
\r
339 || (staticData.conversionType == UConverterType.MBCS && staticData.minBytesPerChar >= 2)) {
\r
341 if (baseSharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) {
\r
342 /* the base converter is SI/SO-stateful */
\r
345 /* get the dbcs state from the state table entry for SO=0x0e */
\r
346 entry = mbcsTable.stateTable[0][0xe];
\r
347 if (MBCS_ENTRY_IS_FINAL(entry) && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_CHANGE_ONLY
\r
348 && MBCS_ENTRY_FINAL_STATE(entry) != 0) {
\r
349 mbcsTable.dbcsOnlyState = (byte) MBCS_ENTRY_FINAL_STATE(entry);
\r
351 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY;
\r
353 } else if (baseSharedData.staticData.conversionType == UConverterType.MBCS
\r
354 && baseSharedData.staticData.minBytesPerChar == 1
\r
355 && baseSharedData.staticData.maxBytesPerChar == 2 && mbcsTable.countStates <= 127) {
\r
357 /* non-stateful base converter, need to modify the state table */
\r
358 int newStateTable[][/* 256 */];
\r
359 int state[]; // this works because java 2-D array is array of references and we can have state =
\r
360 // newStateTable[i];
\r
363 /* allocate a new state table and copy the base state table contents */
\r
364 count = mbcsTable.countStates;
\r
365 newStateTable = new int[(count + 1) * 1024][256];
\r
367 for (i = 0; i < mbcsTable.stateTable.length; ++i)
\r
368 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0,
\r
369 mbcsTable.stateTable[i].length);
\r
371 /* change all final single-byte entries to go to a new all-illegal state */
\r
372 state = newStateTable[0];
\r
373 for (i = 0; i < 256; ++i) {
\r
374 if (MBCS_ENTRY_IS_FINAL(state[i])) {
\r
375 state[i] = MBCS_ENTRY_TRANSITION(count, 0);
\r
379 /* build the new all-illegal state */
\r
380 state = newStateTable[count];
\r
381 for (i = 0; i < 256; ++i) {
\r
382 state[i] = MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
\r
384 mbcsTable.stateTable = newStateTable;
\r
385 mbcsTable.countStates = (byte) (count + 1);
\r
386 mbcsTable.stateTableOwned = true;
\r
388 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY;
\r
393 * unlike below for files with base tables, do not get the unicodeMask from the sharedData; instead, use the
\r
394 * base table's unicodeMask, which we copied in the memcpy above; this is necessary because the static data
\r
395 * unicodeMask, especially the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data
\r
398 /* conversion file with a base table; an additional extension table is optional */
\r
399 /* make sure that the output type is known */
\r
400 switch (mbcsTable.outputType) {
\r
401 case MBCS_OUTPUT_1:
\r
402 case MBCS_OUTPUT_2:
\r
403 case MBCS_OUTPUT_3:
\r
404 case MBCS_OUTPUT_4:
\r
405 case MBCS_OUTPUT_3_EUC:
\r
406 case MBCS_OUTPUT_4_EUC:
\r
407 case MBCS_OUTPUT_2_SISO:
\r
411 throw new InvalidFormatException();
\r
414 stateTableArray = new int[header.countStates][256];
\r
415 toUFallbacksArray = new MBCSToUFallback[header.countToUFallbacks];
\r
416 for (int i = 0; i < toUFallbacksArray.length; ++i)
\r
417 toUFallbacksArray[i] = new MBCSToUFallback();
\r
418 unicodeCodeUnitsArray = new char[(header.offsetFromUTable - header.offsetToUCodeUnits) / 2];
\r
419 fromUnicodeTableArray = new char[(header.offsetFromUBytes - header.offsetFromUTable) / 2];
\r
420 fromUnicodeBytesArray = new byte[header.fromUBytesLength];
\r
422 reader.readMBCSTable(stateTableArray, toUFallbacksArray, unicodeCodeUnitsArray, fromUnicodeTableArray,
\r
423 fromUnicodeBytesArray);
\r
424 } catch (IOException e) {
\r
425 throw new InvalidFormatException();
\r
428 mbcsTable.countStates = (byte) header.countStates;
\r
429 mbcsTable.countToUFallbacks = header.countToUFallbacks;
\r
430 mbcsTable.stateTable = stateTableArray;
\r
431 mbcsTable.toUFallbacks = toUFallbacksArray;
\r
432 mbcsTable.unicodeCodeUnits = unicodeCodeUnitsArray;
\r
434 mbcsTable.fromUnicodeTable = fromUnicodeTableArray;
\r
435 mbcsTable.fromUnicodeBytes = fromUnicodeBytesArray;
\r
436 mbcsTable.fromUBytesLength = header.fromUBytesLength;
\r
439 * converter versions 6.1 and up contain a unicodeMask that is used here to select the most efficient
\r
440 * function implementations
\r
442 // agljport:fix info.size=sizeof(UDataInfo);
\r
443 // agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
\r
444 // agljport:fix if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) {
\r
445 /* mask off possible future extensions to be safe */
\r
446 mbcsTable.unicodeMask = (short) (staticData.unicodeMask & 3);
\r
447 // agljport:fix } else {
\r
448 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
\r
449 // agljport:fix mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
\r
453 // agljport:comment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null
\r
454 // terminator byte all already read;
\r
455 // int namelen = baseNameString != null? baseNameString.length() + 1: 0;
\r
456 mbcsTable.extIndexes = reader.readExtIndexes(offset
\r
457 - (reader.bytesRead - reader.staticDataBytesRead));
\r
458 } catch (IOException e) {
\r
459 throw new InvalidFormatException();
\r
463 if (header.version[1] >= 3 && (mbcsTable.unicodeMask & UConverterConstants.HAS_SURROGATES) == 0 &&
\r
464 (mbcsTable.countStates == 1 ? ((char)header.version[2] >= (SBCS_FAST_MAX>>8)) : ((char)header.version[2] >= (MBCS_FAST_MAX>>8)))) {
\r
465 mbcsTable.utf8Friendly = true;
\r
467 if (mbcsTable.countStates == 1) {
\r
469 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher.
\r
470 * Build a table with indexes to each block, to be used instead of
\r
471 * the regular stage 1/2 table.
\r
473 for (int i = 0; i < (SBCS_FAST_LIMIT>>6); ++i) {
\r
474 mbcsTable.sbcsIndex[i] = mbcsTable.fromUnicodeTable[mbcsTable.fromUnicodeTable[i>>4]+((i<<2)&0x3c)];
\r
476 /* set maxFastUChar to SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header.version[2]>(SBCS_FAST_MAX>>8) */
\r
477 mbcsTable.maxFastUChar = SBCS_FAST_MAX;
\r
480 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher.
\r
481 * The .cnv file is prebuilt with an additional stage table with indexes to each block.
\r
484 mbcsTable.mbcsIndex = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();
\r
486 mbcsTable.maxFastUChar = (char)((header.version[2]<<8) | 0xff);
\r
489 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */
\r
491 long asciiRoundtrips = 0xffffffff;
\r
492 for (int i = 0; i < 0x80; ++i) {
\r
493 if (mbcsTable.stateTable[0][i] != MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) {
\r
494 asciiRoundtrips&=~((long)1<<(i>>2))&UConverterConstants.UNSIGNED_INT_MASK;
\r
497 mbcsTable.asciiRoundtrips = asciiRoundtrips&UConverterConstants.UNSIGNED_INT_MASK;
\r
501 int stage1Length = (mbcsTable.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) != 0 ? 0x440 : 0x40;
\r
502 int stage2Length = (header.offsetFromUBytes - header.offsetFromUTable)/4 - stage1Length/2;
\r
503 reconstituteData(mbcsTable, stage1Length, stage2Length, header.fullStage2Length);
\r
505 if (mbcsTable.outputType == MBCS_OUTPUT_DBCS_ONLY || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) {
\r
507 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip.
\r
508 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly.
\r
510 mbcsTable.asciiRoundtrips = 0;
\r
516 private static boolean writeStage3Roundtrip(UConverterMBCSTable mbcsTable, long value, int codePoints[]) {
\r
525 table = mbcsTable.fromUnicodeTable;
\r
526 bytes = mbcsTable.fromUnicodeBytes;
\r
528 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
\r
529 switch(mbcsTable.outputType) {
\r
530 case MBCS_OUTPUT_3_EUC:
\r
531 if(value<=0xffff) {
\r
532 /* short sequences are stored directly */
\r
533 /* code set 0 or 1 */
\r
534 } else if(value<=0x8effff) {
\r
537 } else /* first byte is 0x8f */ {
\r
542 case MBCS_OUTPUT_4_EUC:
\r
543 if(value<=0xffffff) {
\r
544 /* short sequences are stored directly */
\r
545 /* code set 0 or 1 */
\r
546 } else if(value<=0x8effffff) {
\r
549 } else /* first byte is 0x8f */ {
\r
558 for(i=0; i<=0x1f; ++value, ++i) {
\r
564 /* locate the stage 2 & 3 data */
\r
565 stage2 = table[c>>10] + ((c>>4)&0x3f);
\r
566 st3 = table[stage2*2]<<16|table[stage2*2 + 1];
\r
567 st3 = (int)(char)(st3 * 16 + (c&0xf));
\r
569 /* write the codepage bytes into stage 3 */
\r
570 switch(mbcsTable.outputType) {
\r
571 case MBCS_OUTPUT_3:
\r
572 case MBCS_OUTPUT_4_EUC:
\r
574 bytes[p] = (byte)(value>>16);
\r
575 bytes[p+1] = (byte)(value>>8);
\r
576 bytes[p+2] = (byte)value;
\r
578 case MBCS_OUTPUT_4:
\r
579 bytes[st3*4] = (byte)(value >> 24);
\r
580 bytes[st3*4 + 1] = (byte)(value >> 16);
\r
581 bytes[st3*4 + 2] = (byte)(value >> 8);
\r
582 bytes[st3*4 + 3] = (byte)value;
\r
585 /* 2 bytes per character */
\r
586 bytes[st3*2] = (byte)(value >> 8);
\r
587 bytes[st3*2 + 1] = (byte)value;
\r
591 /* set the roundtrip flag */
\r
592 temp = (1L<<(16+(c&0xf)));
\r
593 table[stage2*2] |= (char)(temp>>16);
\r
594 table[stage2*2 + 1] |= (char)temp;
\r
599 private static void reconstituteData(UConverterMBCSTable mbcsTable, int stage1Length, int stage2Length, int fullStage2Length) {
\r
600 int datalength = stage1Length*2+fullStage2Length*4+mbcsTable.fromUBytesLength;
\r
602 byte[] stage = new byte[datalength];
\r
604 for (int i = 0; i < stage1Length; ++i) {
\r
605 stage[i*2] = (byte)(mbcsTable.fromUnicodeTable[i]>>8);
\r
606 stage[i*2+1] = (byte)(mbcsTable.fromUnicodeTable[i]);
\r
609 offset = ((fullStage2Length - stage2Length) * 4) + (stage1Length * 2);
\r
610 for (int i = 0; i < stage2Length; ++i) {
\r
611 stage[offset + i*4] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2]>>8);
\r
612 stage[offset + i*4+1] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2]);
\r
613 stage[offset + i*4+2] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2+1]>>8);
\r
614 stage[offset + i*4+3] = (byte)(mbcsTable.fromUnicodeTable[stage1Length + i*2+1]);
\r
617 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
\r
619 /* reconstitute the initial part of stage 2 from the mbcsIndex */
\r
621 int stageUTF8Length=((int)(mbcsTable.maxFastUChar+1))>>6;
\r
622 int stageUTF8Index=0;
\r
623 int st1, st2, st3, i;
\r
625 for (st1 = 0; stageUTF8Index < stageUTF8Length; ++st1) {
\r
626 st2 = ((char)stage[2*st1]<<8) | stage[2*st1+1];
\r
627 if (st2 != stage1Length/2) {
\r
628 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
\r
629 for (i = 0; i < 16; ++i) {
\r
630 st3 = mbcsTable.mbcsIndex.get(stageUTF8Index++);
\r
632 /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
\r
635 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
\r
636 * allocated together as a single 64-block for access from the mbcsIndex
\r
638 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++;
\r
639 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++;
\r
640 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3); st2++; st3++;
\r
641 stage[4*st2] = (byte)(st3>>24); stage[4*st2+1] = (byte)(st3>>16); stage[4*st2+2] = (byte)(st3>>8); stage[4*st2+3] = (byte)(st3);
\r
643 /* no stage 3 block, skip */
\r
648 /* no stage 2 block, skip */
\r
649 stageUTF8Index+=16;
\r
654 char[] stage1 = new char[stage.length/2];
\r
655 for (int i = 0; i < stage1.length; ++i) {
\r
656 stage1[i] = (char)(((stage[i*2])<<8)|(stage[i*2+1] & UConverterConstants.UNSIGNED_BYTE_MASK));
\r
658 byte[] stage2 = new byte[stage.length - ((stage1Length * 2) + (fullStage2Length * 4))];
\r
659 System.arraycopy(stage, ((stage1Length * 2) + (fullStage2Length * 4)), stage2, 0, stage2.length);
\r
661 mbcsTable.fromUnicodeTable = stage1;
\r
662 mbcsTable.fromUnicodeBytes = stage2;
\r
664 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
\r
665 MBCSEnumToUnicode(mbcsTable);
\r
669 * Internal function enumerating the toUnicode data of an MBCS converter.
\r
670 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
\r
671 * table, but could also be used for a future getUnicodeSet() option
\r
672 * that includes reverse fallbacks (after updating this function's implementation).
\r
673 * Currently only handles roundtrip mappings.
\r
674 * Does not currently handle extensions.
\r
676 private static void MBCSEnumToUnicode(UConverterMBCSTable mbcsTable) {
\r
678 * Properties for each state, to speed up the enumeration.
\r
679 * Ignorable actions are unassigned/illegal/state-change-only:
\r
680 * They do not lead to mappings.
\r
683 * 1 direct/initial state (stateful converters have multiple)
\r
684 * 0 non-initial state with transitions or with nonignorable result actions
\r
685 * -1 final state with only ignorable actions
\r
688 * The lowest byte value with non-ignorable actions is
\r
689 * value<<5 (rounded down).
\r
692 * The highest byte value with non-ignorable actions is
\r
693 * (value<<5)|0x1f (rounded up).
\r
695 byte stateProps[] = new byte[MBCS_MAX_STATE_COUNT];
\r
698 /* recurse from state 0 and set all stateProps */
\r
699 getStateProp(mbcsTable.stateTable, stateProps, 0);
\r
701 for (state = 0; state < mbcsTable.countStates; ++state) {
\r
702 if (stateProps[state] >= 0x40) {
\r
703 /* start from each direct state */
\r
704 enumToU(mbcsTable, stateProps, state, 0, 0);
\r
711 private static boolean enumToU(UConverterMBCSTable mbcsTable, byte stateProps[], int state, int offset, int value) {
\r
712 int[] codePoints = new int[32];
\r
714 char[] unicodeCodeUnits;
\r
718 row = mbcsTable.stateTable[state];
\r
719 unicodeCodeUnits = mbcsTable.unicodeCodeUnits;
\r
722 anyCodePoints = -1; /* becomes non-negative if there is a mapping */
\r
724 b = (stateProps[state]&0x38)<<2;
\r
725 if (b == 0 && stateProps[state] >= 0x40) {
\r
726 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
\r
727 codePoints[0] = UConverterConstants.U_SENTINEL;
\r
730 limit = ((stateProps[state]&7)+1)<<5;
\r
731 while (b < limit) {
\r
732 int entry = row[b];
\r
733 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
734 int nextState = MBCS_ENTRY_TRANSITION_STATE(entry);
\r
735 if (stateProps[nextState] >= 0) {
\r
736 /* recurse to a state with non-ignorable actions */
\r
737 if (!enumToU(mbcsTable, stateProps, nextState, offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), value|b)) {
\r
741 codePoints[b&0x1f] = UConverterConstants.U_SENTINEL;
\r
747 * An if-else-if chain provides more reliable performance for
\r
748 * the most common cases compared to a switch.
\r
750 action = MBCS_ENTRY_FINAL_ACTION(entry);
\r
751 if (action == MBCS_STATE_VALID_DIRECT_16) {
\r
752 /* output BMP code point */
\r
753 c = (char)MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
754 } else if (action == MBCS_STATE_VALID_16) {
\r
755 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
756 c = unicodeCodeUnits[finalOffset];
\r
758 /* output BMP code point */
\r
760 c = UConverterConstants.U_SENTINEL;
\r
762 } else if (action == MBCS_STATE_VALID_16_PAIR) {
\r
763 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
764 c = unicodeCodeUnits[finalOffset++];
\r
766 /* output BMP code point below 0xd800 */
\r
767 } else if (c <= 0xdbff) {
\r
768 /* output roundtrip or fallback supplementary code point */
\r
769 c = ((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
\r
770 } else if (c == 0xe000) {
\r
771 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
\r
772 c = unicodeCodeUnits[finalOffset];
\r
774 c = UConverterConstants.U_SENTINEL;
\r
776 } else if (action == MBCS_STATE_VALID_DIRECT_20) {
\r
777 /* output supplementary code point */
\r
778 c = (int)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
\r
780 c = UConverterConstants.U_SENTINEL;
\r
783 codePoints[b&0x1f] = c;
\r
786 if (((++b)&0x1f) == 0) {
\r
787 if(anyCodePoints>=0) {
\r
788 if(!writeStage3Roundtrip(mbcsTable, value|(b-0x20)&UConverterConstants.UNSIGNED_INT_MASK, codePoints)) {
\r
800 * Only called if stateProps[state]==-1.
\r
801 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an
\r
802 * MBCS_STATE_CHANGE_ONLY.
\r
804 private static byte getStateProp(int stateTable[][], byte stateProps[], int state) {
\r
806 int min, max, entry, nextState;
\r
808 row = stateTable[state];
\r
809 stateProps[state] = 0;
\r
811 /* find first non-ignorable state */
\r
812 for (min = 0;;++min) {
\r
814 nextState = MBCS_ENTRY_STATE(entry);
\r
815 if (stateProps[nextState] == -1) {
\r
816 getStateProp(stateTable, stateProps, nextState);
\r
818 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
819 if (stateProps[nextState] >- 0) {
\r
822 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) {
\r
826 stateProps[state] = -0x40; /* (byte)0xc0 */
\r
827 return stateProps[state];
\r
830 stateProps[state]|=(byte)((min>>5)<<3);
\r
832 /* find last non-ignorable state */
\r
833 for (max = 0xff; min < max; --max) {
\r
835 nextState = MBCS_ENTRY_STATE(entry);
\r
836 if (stateProps[nextState] == -1) {
\r
837 getStateProp(stateTable, stateProps, nextState);
\r
839 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
840 if (stateProps[nextState] >- 0) {
\r
843 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) {
\r
847 stateProps[state]|=(byte)(max>>5);
\r
849 /* recurse further and collect direct-state information */
\r
850 while (min <= max) {
\r
852 nextState = MBCS_ENTRY_STATE(entry);
\r
853 if (stateProps[nextState] == -1) {
\r
854 getStateProp(stateTable, stateProps, nextState);
\r
856 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
857 stateProps[nextState]|=0x40;
\r
858 if (MBCS_ENTRY_FINAL_ACTION(entry) <= MBCS_STATE_FALLBACK_DIRECT_20) {
\r
859 stateProps[state]|=0x40;
\r
864 return stateProps[state];
\r
867 protected void initializeConverter(int myOptions) {
\r
868 UConverterMBCSTable mbcsTable;
\r
869 ByteBuffer extIndexes;
\r
871 byte maxBytesPerUChar;
\r
873 mbcsTable = sharedData.mbcs;
\r
874 outputType = mbcsTable.outputType;
\r
876 if (outputType == MBCS_OUTPUT_DBCS_ONLY) {
\r
877 /* the swaplfnl option does not apply, remove it */
\r
878 this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL;
\r
881 if ((myOptions & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
882 /* do this because double-checked locking is broken */
\r
885 // agljport:todo umtx_lock(NULL);
\r
886 isCached = mbcsTable.swapLFNLStateTable != null;
\r
887 // agljport:todo umtx_unlock(NULL);
\r
891 if (!EBCDICSwapLFNL()) {
\r
892 /* this option does not apply, remove it */
\r
893 this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL;
\r
895 } catch (Exception e) {
\r
896 /* something went wrong. */
\r
902 if (icuCanonicalName.toLowerCase().indexOf("gb18030") >= 0) {
\r
903 /* set a flag for GB 18030 mode, which changes the callback behavior */
\r
904 this.options |= MBCS_OPTION_GB18030;
\r
907 /* fix maxBytesPerUChar depending on outputType and options etc. */
\r
908 if (outputType == MBCS_OUTPUT_2_SISO) {
\r
909 maxBytesPerChar = 3; /* SO+DBCS */
\r
912 extIndexes = mbcsTable.extIndexes;
\r
913 if (extIndexes != null) {
\r
914 maxBytesPerUChar = (byte) GET_MAX_BYTES_PER_UCHAR(extIndexes);
\r
915 if (outputType == MBCS_OUTPUT_2_SISO) {
\r
916 ++maxBytesPerUChar; /* SO + multiple DBCS */
\r
919 if (maxBytesPerUChar > maxBytesPerChar) {
\r
920 maxBytesPerChar = maxBytesPerUChar;
\r
924 /* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/
\r
926 * This code modifies a standard EBCDIC<->Unicode mapping table for
\r
927 * OS/390 (z/OS) Unix System Services (Open Edition).
\r
928 * The difference is in the mapping of Line Feed and New Line control codes:
\r
929 * Standard EBCDIC maps
\r
934 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
\r
940 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
\r
941 * by copying it into allocated memory and swapping the LF and NL values.
\r
942 * It allows supporting the same EBCDIC charset in both versions without
\r
943 * duplicating the entire installed table.
\r
945 /* standard EBCDIC codes */
\r
946 private static final short EBCDIC_LF = 0x0025;
\r
947 private static final short EBCDIC_NL = 0x0015;
\r
949 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
\r
950 private static final short EBCDIC_RT_LF = 0x0f25;
\r
951 private static final short EBCDIC_RT_NL = 0x0f15;
\r
953 /* Unicode code points */
\r
954 private static final short U_LF = 0x000A;
\r
955 private static final short U_NL = 0x0085;
\r
957 private boolean EBCDICSwapLFNL() throws Exception {
\r
958 UConverterMBCSTable mbcsTable;
\r
964 int[][] newStateTable;
\r
970 int sizeofFromUBytes;
\r
972 mbcsTable = sharedData.mbcs;
\r
974 table = mbcsTable.fromUnicodeTable;
\r
975 bytes = mbcsTable.fromUnicodeBytes;
\r
979 * Check that this is an EBCDIC table with SBCS portion -
\r
980 * SBCS or EBCDIC with standard EBCDIC LF and NL mappings.
\r
982 * If not, ignore the option. Options are always ignored if they do not apply.
\r
984 if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) &&
\r
985 mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
\r
986 mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) {
\r
990 if (mbcsTable.outputType == MBCS_OUTPUT_1) {
\r
991 if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
\r
992 EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) {
\r
995 } else /* MBCS_OUTPUT_2_SISO */ {
\r
996 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF);
\r
997 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) &&
\r
998 EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF))) {
\r
1002 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL);
\r
1003 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) &&
\r
1004 EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL))) {
\r
1009 if (mbcsTable.fromUBytesLength > 0) {
\r
1011 * We _know_ the number of bytes in the fromUnicodeBytes array
\r
1012 * starting with header.version 4.1.
\r
1014 sizeofFromUBytes = mbcsTable.fromUBytesLength;
\r
1018 * There used to be code to enumerate the fromUnicode
\r
1019 * trie and find the highest entry, but it was removed in ICU 3.2
\r
1020 * because it was not tested and caused a low code coverage number.
\r
1022 throw new Exception("U_INVALID_FORMAT_ERROR");
\r
1026 * The table has an appropriate format.
\r
1027 * Allocate and build
\r
1028 * - a modified to-Unicode state table
\r
1029 * - a modified from-Unicode output array
\r
1030 * - a converter name string with the swap option appended
\r
1032 // size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20;
\r
1034 /* copy and modify the to-Unicode state table */
\r
1035 newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length];
\r
1036 for (int i = 0; i < newStateTable.length; i++) {
\r
1037 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length);
\r
1040 newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
\r
1041 newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
\r
1043 /* copy and modify the from-Unicode result table */
\r
1044 newResults = new byte[sizeofFromUBytes];
\r
1045 System.arraycopy(bytes, 0, newResults, 0, sizeofFromUBytes);
\r
1046 /* conveniently, the table access macros work on the left side of expressions */
\r
1047 if (mbcsTable.outputType == MBCS_OUTPUT_1) {
\r
1048 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL);
\r
1049 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF);
\r
1050 } else /* MBCS_OUTPUT_2_SISO */ {
\r
1051 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF);
\r
1052 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL);
\r
1054 stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL);
\r
1055 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF);
\r
1058 /* set the canonical converter name */
\r
1059 newName = new String(icuCanonicalName);
\r
1060 newName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING);
\r
1062 if (mbcsTable.swapLFNLStateTable == null) {
\r
1063 mbcsTable.swapLFNLStateTable = newStateTable;
\r
1064 mbcsTable.swapLFNLFromUnicodeBytes = newResults;
\r
1065 mbcsTable.swapLFNLName = newName;
\r
1071 * MBCS output types for conversions from Unicode. These per-converter types determine the storage method in stage 3
\r
1072 * of the lookup table, mostly how many bytes are stored per entry.
\r
1074 static final int MBCS_OUTPUT_1 = 0; /* 0 */
\r
1075 static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */
\r
1076 static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */
\r
1077 static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */
\r
1078 static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */
\r
1079 static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */
\r
1080 static final int MBCS_OUTPUT_2_SISO = 12; /* c */
\r
1081 static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */
\r
1082 static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */
\r
1083 // static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1;
\r
1084 static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */
\r
1086 /* GB 18030 data ------------------------------------------------------------ */
\r
1088 /* helper macros for linear values for GB 18030 four-byte sequences */
\r
1089 private static long LINEAR_18030(long a, long b, long c, long d) {
\r
1090 return ((((a & 0xff) * 10 + (b & 0xff)) * 126L + (c & 0xff)) * 10L + (d & 0xff));
\r
1093 private static long LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30);
\r
1095 private static long LINEAR(long x) {
\r
1096 return LINEAR_18030(x >>> 24, (x >>> 16) & 0xff, (x >>> 8) & 0xff, x & 0xff);
\r
1100 * Some ranges of GB 18030 where both the Unicode code points and the GB four-byte sequences are contiguous and are
\r
1101 * handled algorithmically by the special callback functions below. The values are start & end of Unicode & GB
\r
1104 * Note that single surrogates are not mapped by GB 18030 as of the re-released mapping tables from 2000-nov-30.
\r
1106 private static final long gb18030Ranges[][] = new long[/* 13 */][/* 4 */] {
\r
1107 { 0x10000L, 0x10FFFFL, LINEAR(0x90308130L), LINEAR(0xE3329A35L) },
\r
1108 { 0x9FA6L, 0xD7FFL, LINEAR(0x82358F33L), LINEAR(0x8336C738L) },
\r
1109 { 0x0452L, 0x200FL, LINEAR(0x8130D330L), LINEAR(0x8136A531L) },
\r
1110 { 0xE865L, 0xF92BL, LINEAR(0x8336D030L), LINEAR(0x84308534L) },
\r
1111 { 0x2643L, 0x2E80L, LINEAR(0x8137A839L), LINEAR(0x8138FD38L) },
\r
1112 { 0xFA2AL, 0xFE2FL, LINEAR(0x84309C38L), LINEAR(0x84318537L) },
\r
1113 { 0x3CE1L, 0x4055L, LINEAR(0x8231D438L), LINEAR(0x8232AF32L) },
\r
1114 { 0x361BL, 0x3917L, LINEAR(0x8230A633L), LINEAR(0x8230F237L) },
\r
1115 { 0x49B8L, 0x4C76L, LINEAR(0x8234A131L), LINEAR(0x8234E733L) },
\r
1116 { 0x4160L, 0x4336L, LINEAR(0x8232C937L), LINEAR(0x8232F837L) },
\r
1117 { 0x478EL, 0x4946L, LINEAR(0x8233E838L), LINEAR(0x82349638L) },
\r
1118 { 0x44D7L, 0x464BL, LINEAR(0x8233A339L), LINEAR(0x8233C931L) },
\r
1119 { 0xFFE6L, 0xFFFFL, LINEAR(0x8431A234L), LINEAR(0x8431A439L) } };
\r
1121 /* bit flag for UConverter.options indicating GB 18030 special handling */
\r
1122 private static final int MBCS_OPTION_GB18030 = 0x8000;
\r
1125 static final int MBCS_MAX_STATE_COUNT = 128;
\r
1128 * MBCS action codes for conversions to Unicode. These values are in bits 23..20 of the state table entries.
\r
1130 static final int MBCS_STATE_VALID_DIRECT_16 = 0;
\r
1131 static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1;
\r
1132 static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1;
\r
1133 static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1;
\r
1134 static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1;
\r
1135 static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1;
\r
1136 static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1;
\r
1137 static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1;
\r
1138 static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1;
\r
1140 static int MBCS_ENTRY_SET_STATE(int entry, int state) {
\r
1141 return (int)(((entry)&0x80ffffff)|((int)(state)<<24L));
\r
1144 static int MBCS_ENTRY_STATE(int entry) {
\r
1145 return (((entry)>>24)&0x7f);
\r
1148 /* Methods for state table entries */
\r
1149 static int MBCS_ENTRY_TRANSITION(int state, int offset) {
\r
1150 return (state << 24L) | offset;
\r
1153 static int MBCS_ENTRY_FINAL(int state, int action, int value) {
\r
1154 return (int) (0x80000000 | ((int) (state) << 24L) | ((action) << 20L) | (value));
\r
1157 static boolean MBCS_ENTRY_IS_TRANSITION(int entry) {
\r
1158 return (entry) >= 0;
\r
1161 static boolean MBCS_ENTRY_IS_FINAL(int entry) {
\r
1162 return (entry) < 0;
\r
1165 static int MBCS_ENTRY_TRANSITION_STATE(int entry) {
\r
1166 return ((entry) >>> 24);
\r
1169 static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) {
\r
1170 return ((entry) & 0xffffff);
\r
1173 static int MBCS_ENTRY_FINAL_STATE(int entry) {
\r
1174 return ((entry) >>> 24) & 0x7f;
\r
1177 static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) {
\r
1178 return ((entry) < 0x80100000);
\r
1181 static int MBCS_ENTRY_FINAL_ACTION(int entry) {
\r
1182 return ((entry) >>> 20) & 0xf;
\r
1185 static int MBCS_ENTRY_FINAL_VALUE(int entry) {
\r
1186 return ((entry) & 0xfffff);
\r
1189 static char MBCS_ENTRY_FINAL_VALUE_16(int entry) {
\r
1190 return (char) (entry);
\r
1193 static boolean MBCS_IS_ASCII_ROUNDTRIP(int b, long asciiRoundtrips) {
\r
1194 return (((asciiRoundtrips) & (1<<((b)>>2)))!=0);
\r
1198 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. It works for single-byte,
\r
1199 * single-state codepages that only map to and from BMP code points, and it always returns fallback values.
\r
1201 static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) {
\r
1202 return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b & UConverterConstants.UNSIGNED_BYTE_MASK]);
\r
1205 /* single-byte fromUnicode: get the 16-bit result word */
\r
1206 static char MBCS_SINGLE_RESULT_FROM_U(char[] table, byte[] results, int c) {
\r
1207 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f);
\r
1208 int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
\r
1209 return (char) (((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (results[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
\r
1212 /* single-byte fromUnicode: set the 16-bit result word with newValue*/
\r
1213 static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, byte[] results, int c, int newValue) {
\r
1214 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f);
\r
1215 int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
\r
1216 results[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1217 results[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1220 /* multi-byte fromUnicode: get the 32-bit stage 2 entry */
\r
1221 static int MBCS_STAGE_2_FROM_U(char[] table, int c) {
\r
1222 int i = 2 * (table[(c) >>> 10] + ((c >>> 4) & 0x3f)); // 2x because used as index into char[] array treated as
\r
1224 return ((table[i] & UConverterConstants.UNSIGNED_SHORT_MASK) << 16)
\r
1225 | (table[i + 1] & UConverterConstants.UNSIGNED_SHORT_MASK);
\r
1228 private static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) {
\r
1229 return (((stage2Entry) & (1 << (16 + ((c) & 0xf)))) != 0);
\r
1232 static char MBCS_VALUE_2_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
\r
1233 int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
\r
1234 return (char) (((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
\r
1237 static void MBCS_VALUE_2_FROM_STAGE_2_SET(byte[] bytes, int stage2Entry, int c, int newValue) {
\r
1238 int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
\r
1239 bytes[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1240 bytes[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1243 private static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
\r
1244 int i = 4 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
\r
1245 return ((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 24)
\r
1246 | ((bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
\r
1247 | ((bytes[i + 2] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
\r
1248 | (bytes[i + 3] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1251 static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
\r
1252 return ((16 * ((char) (stage2Entry) & UConverterConstants.UNSIGNED_SHORT_MASK) + ((c) & 0xf)) * 3);
\r
1255 // ------------UConverterExt-------------------------------------------------------
\r
1257 static final int EXT_INDEXES_LENGTH = 0; /* 0 */
\r
1259 static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */
\r
1260 static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1;
\r
1261 static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1;
\r
1262 static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1;
\r
1264 static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */
\r
1265 static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1;
\r
1266 static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1;
\r
1267 static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1;
\r
1268 static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1;
\r
1270 static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */
\r
1271 static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1;
\r
1272 static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1;
\r
1273 static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1;
\r
1274 static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1;
\r
1275 static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1;
\r
1276 static final int EXT_FROM_U_STAGE_3B_LENGTH = EXT_FROM_U_STAGE_3B_INDEX + 1;
\r
1278 private static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */
\r
1279 // private static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1;
\r
1280 // private static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1;
\r
1282 // private static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */
\r
1284 // private static final int EXT_SIZE=31;
\r
1285 // private static final int EXT_INDEXES_MIN_LENGTH=32;
\r
1287 static final int EXT_FROM_U_MAX_DIRECT_LENGTH = 3;
\r
1289 /* toUnicode helpers -------------------------------------------------------- */
\r
1291 private static final int TO_U_BYTE_SHIFT = 24;
\r
1292 private static final int TO_U_VALUE_MASK = 0xffffff;
\r
1293 private static final int TO_U_MIN_CODE_POINT = 0x1f0000;
\r
1294 private static final int TO_U_MAX_CODE_POINT = 0x2fffff;
\r
1295 private static final int TO_U_ROUNDTRIP_FLAG = (1 << 23);
\r
1296 private static final int TO_U_INDEX_MASK = 0x3ffff;
\r
1297 private static final int TO_U_LENGTH_SHIFT = 18;
\r
1298 private static final int TO_U_LENGTH_OFFSET = 12;
\r
1300 /* maximum number of indexed UChars */
\r
1301 static final int MAX_UCHARS = 19;
\r
1303 static int TO_U_GET_BYTE(int word) {
\r
1304 return word >>> TO_U_BYTE_SHIFT;
\r
1307 static int TO_U_GET_VALUE(int word) {
\r
1308 return word & TO_U_VALUE_MASK;
\r
1311 static boolean TO_U_IS_ROUNDTRIP(int value) {
\r
1312 return (value & TO_U_ROUNDTRIP_FLAG) != 0;
\r
1315 static boolean TO_U_IS_PARTIAL(int value) {
\r
1316 return (value & UConverterConstants.UNSIGNED_INT_MASK) < TO_U_MIN_CODE_POINT;
\r
1319 static int TO_U_GET_PARTIAL_INDEX(int value) {
\r
1323 static int TO_U_MASK_ROUNDTRIP(int value) {
\r
1324 return value & ~TO_U_ROUNDTRIP_FLAG;
\r
1327 private static int TO_U_MAKE_WORD(byte b, int value) {
\r
1328 return ((b & UConverterConstants.UNSIGNED_BYTE_MASK) << TO_U_BYTE_SHIFT) | value;
\r
1331 /* use after masking off the roundtrip flag */
\r
1332 static boolean TO_U_IS_CODE_POINT(int value) {
\r
1333 return (value & UConverterConstants.UNSIGNED_INT_MASK) <= TO_U_MAX_CODE_POINT;
\r
1336 static int TO_U_GET_CODE_POINT(int value) {
\r
1337 return (int) ((value & UConverterConstants.UNSIGNED_INT_MASK) - TO_U_MIN_CODE_POINT);
\r
1340 private static int TO_U_GET_INDEX(int value) {
\r
1341 return value & TO_U_INDEX_MASK;
\r
1344 private static int TO_U_GET_LENGTH(int value) {
\r
1345 return (value >>> TO_U_LENGTH_SHIFT) - TO_U_LENGTH_OFFSET;
\r
1348 /* fromUnicode helpers ------------------------------------------------------ */
\r
1350 /* most trie constants are shared with ucnvmbcs.h */
\r
1351 private static final int STAGE_2_LEFT_SHIFT = 2;
\r
1353 // private static final int STAGE_3_GRANULARITY = 4;
\r
1355 /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
\r
1356 static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) {
\r
1357 return stage3.get(((int) stage12.get((stage12.get(s1Index) + ((c >>> 4) & 0x3f))) << STAGE_2_LEFT_SHIFT)
\r
1361 private static final int FROM_U_LENGTH_SHIFT = 24;
\r
1362 private static final int FROM_U_ROUNDTRIP_FLAG = 1 << 31;
\r
1363 static final int FROM_U_RESERVED_MASK = 0x60000000;
\r
1364 private static final int FROM_U_DATA_MASK = 0xffffff;
\r
1366 /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
\r
1367 static final int FROM_U_SUBCHAR1 = 0x80000001;
\r
1369 /* at most 3 bytes in the lower part of the value */
\r
1370 private static final int FROM_U_MAX_DIRECT_LENGTH = 3;
\r
1372 /* maximum number of indexed bytes */
\r
1373 static final int MAX_BYTES = 0x1f;
\r
1375 static boolean FROM_U_IS_PARTIAL(int value) {
\r
1376 return (value >>> FROM_U_LENGTH_SHIFT) == 0;
\r
1379 static int FROM_U_GET_PARTIAL_INDEX(int value) {
\r
1383 static boolean FROM_U_IS_ROUNDTRIP(int value) {
\r
1384 return (value & FROM_U_ROUNDTRIP_FLAG) != 0;
\r
1387 private static int FROM_U_MASK_ROUNDTRIP(int value) {
\r
1388 return value & ~FROM_U_ROUNDTRIP_FLAG;
\r
1391 /* use after masking off the roundtrip flag */
\r
1392 static int FROM_U_GET_LENGTH(int value) {
\r
1393 return (value >>> FROM_U_LENGTH_SHIFT) & MAX_BYTES;
\r
1396 /* get bytes or bytes index */
\r
1397 static int FROM_U_GET_DATA(int value) {
\r
1398 return value & FROM_U_DATA_MASK;
\r
1401 /* get the pointer to an extension array from indexes[index] */
\r
1402 static Buffer ARRAY(ByteBuffer indexes, int index, Class itemType) {
\r
1403 int oldpos = indexes.position();
\r
1406 indexes.position(indexes.getInt(index << 2));
\r
1407 if (itemType == int.class)
\r
1408 b = indexes.asIntBuffer();
\r
1409 else if (itemType == char.class)
\r
1410 b = indexes.asCharBuffer();
\r
1411 else if (itemType == short.class)
\r
1412 b = indexes.asShortBuffer();
\r
1414 // default or (itemType == byte.class)
\r
1415 b = indexes.slice();
\r
1416 indexes.position(oldpos);
\r
1420 private static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) {
\r
1421 indexes.position(0);
\r
1422 return indexes.getInt(EXT_COUNT_BYTES) & 0xff;
\r
1426 * @return index of the UChar, if found; else <0
\r
1428 static int findFromU(CharBuffer fromUSection, int length, char u) {
\r
1429 int i, start, limit;
\r
1431 /* binary search */
\r
1435 i = limit - start;
\r
1439 /* start<limit-1 */
\r
1442 /* linear search for the last part */
\r
1443 if (u <= fromUSection.get(fromUSection.position() + start)) {
\r
1446 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) {
\r
1449 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) {
\r
1452 /* always break at start==limit-1 */
\r
1457 i = (start + limit) / 2;
\r
1458 if (u < fromUSection.get(fromUSection.position() + i)) {
\r
1465 /* did we really find it? */
\r
1466 if (start < limit && u == fromUSection.get(fromUSection.position() + start)) {
\r
1469 return -1; /* not found */
\r
1474 * @return lookup value for the byte, if found; else 0
\r
1476 static int findToU(IntBuffer toUSection, int length, short byt) {
\r
1478 int i, start, limit;
\r
1480 /* check the input byte against the lowest and highest section bytes */
\r
1481 // agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position
\r
1483 start = TO_U_GET_BYTE(toUSection.get(toUSection.position()));
\r
1484 limit = TO_U_GET_BYTE(toUSection.get(toUSection.position() + length - 1));
\r
1485 if (byt < start || limit < byt) {
\r
1486 return 0; /* the byte is out of range */
\r
1489 if (length == ((limit - start) + 1)) {
\r
1490 /* direct access on a linear array */
\r
1491 return TO_U_GET_VALUE(toUSection.get(toUSection.position() + byt - start)); /* could be 0 */
\r
1494 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
\r
1495 word0 = TO_U_MAKE_WORD((byte) byt, 0) & UConverterConstants.UNSIGNED_INT_MASK;
\r
1498 * Shift byte once instead of each section word and add 0xffffff. We will compare the shifted/added byte
\r
1499 * (bbffffff) against section words which have byte values in the same bit position. If and only if byte bb <
\r
1500 * section byte ss then bbffffff<ssvvvvvv for all v=0..f so we need not mask off the lower 24 bits of each
\r
1503 word = word0 | TO_U_VALUE_MASK;
\r
1505 /* binary search */
\r
1509 i = limit - start;
\r
1513 /* start<limit-1 */
\r
1516 /* linear search for the last part */
\r
1517 if (word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
\r
1520 if (++start < limit
\r
1521 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
\r
1524 if (++start < limit
\r
1525 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) {
\r
1528 /* always break at start==limit-1 */
\r
1533 i = (start + limit) / 2;
\r
1534 if (word < (toUSection.get(toUSection.position() + i) & UConverterConstants.UNSIGNED_INT_MASK)) {
\r
1541 /* did we really find it? */
\r
1542 if (start < limit) {
\r
1543 word = (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK);
\r
1544 if (byt == TO_U_GET_BYTE((int)word)) {
\r
1545 return TO_U_GET_VALUE((int) word); /* never 0 */
\r
1548 return 0; /* not found */
\r
1552 * TRUE if not an SI/SO stateful converter, or if the match length fits with the current converter state
\r
1554 static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState, int match) {
\r
1555 return sisoState < 0 || (sisoState == 0) == (match == 1);
\r
1559 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), or 1 for DBCS-only, or -1 if the converter is not
\r
1562 * Note: For SI/SO stateful converters getting here, cnv->mode==0 is equivalent to firstLength==1.
\r
1564 private static int SISO_STATE(UConverterSharedData sharedData, int mode) {
\r
1565 return sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO ? (byte) mode
\r
1566 : sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1 : -1;
\r
1569 class CharsetDecoderMBCS extends CharsetDecoderICU {
\r
1571 CharsetDecoderMBCS(CharsetICU cs) {
\r
1575 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1576 /* Just call cnvMBCSToUnicodeWithOffsets() to remove duplicate code. */
\r
1577 return cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush);
\r
1581 * continue partial match with new input never called for simple, single-character conversion
\r
1583 private CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex,
\r
1585 CoderResult cr = CoderResult.UNDERFLOW;
\r
1587 int[] value = new int[1];
\r
1588 int match, length;
\r
1590 match = matchToU((byte) SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source,
\r
1591 value, isToUUseFallback(), flush);
\r
1594 if (match >= preToULength) {
\r
1595 /* advance src pointer for the consumed input */
\r
1596 source.position(source.position() + match - preToULength);
\r
1599 /* the match did not use all of preToU[] - keep the rest for replay */
\r
1600 length = preToULength - match;
\r
1601 System.arraycopy(preToUArray, preToUBegin + match, preToUArray, preToUBegin, length);
\r
1602 preToULength = (byte) -length;
\r
1605 /* write result */
\r
1606 cr = writeToU(value[0], target, offsets, srcIndex);
\r
1607 } else if (match < 0) {
\r
1608 /* save state for partial match */
\r
1609 int j, sArrayIndex;
\r
1611 /* just _append_ the newly consumed input to preToU[] */
\r
1612 sArrayIndex = source.position();
\r
1614 for (j = preToULength; j < match; ++j) {
\r
1615 preToUArray[j] = source.get(sArrayIndex++);
\r
1617 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
\r
1618 preToULength = (byte) match;
\r
1619 } else /* match==0 */{
\r
1623 * We need to split the previous input into two parts:
\r
1625 * 1. The first codepage character is unmappable - that's how we got into trying the extension data in
\r
1626 * the first place. We need to move it from the preToU buffer to the error buffer, set an error code,
\r
1627 * and prepare the rest of the previous input for 2.
\r
1629 * 2. The rest of the previous input must be converted once we come back from the callback for the first
\r
1630 * character. At that time, we have to try again from scratch to convert these input characters. The
\r
1631 * replay will be handled by the ucnv.c conversion code.
\r
1634 /* move the first codepage character to the error field */
\r
1635 System.arraycopy(preToUArray, preToUBegin, toUBytesArray, toUBytesBegin, preToUFirstLength);
\r
1636 toULength = preToUFirstLength;
\r
1638 /* move the rest up inside the buffer */
\r
1639 length = preToULength - preToUFirstLength;
\r
1641 System.arraycopy(preToUArray, preToUBegin + preToUFirstLength, preToUArray, preToUBegin, length);
\r
1644 /* mark preToU for replay */
\r
1645 preToULength = (byte) -length;
\r
1647 /* set the error code for unassigned */
\r
1648 cr = CoderResult.unmappableForLength(preToUFirstLength);
\r
1654 * this works like matchFromU() except - the first character is in pre - no trie is used - the returned
\r
1655 * matchLength is not offset by 2
\r
1657 private int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source,
\r
1658 int[] pMatchValue, boolean isUseFallback, boolean flush) {
\r
1659 ByteBuffer cx = sharedData.mbcs.extIndexes;
\r
1660 IntBuffer toUTable, toUSection;
\r
1662 int value, matchValue, srcLength = 0;
\r
1663 int i, j, index, length, matchLength;
\r
1666 if (cx == null || cx.asIntBuffer().get(EXT_TO_U_LENGTH) <= 0) {
\r
1667 return 0; /* no extension data, no match */
\r
1671 toUTable = (IntBuffer) ARRAY(cx, EXT_TO_U_INDEX, int.class);
\r
1675 i = j = matchLength = 0;
\r
1676 if (source != null) {
\r
1677 srcLength = source.remaining();
\r
1680 if (sisoState == 0) {
\r
1681 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
\r
1682 if (preLength > 1) {
\r
1683 return 0; /* no match of a DBCS sequence in SBCS mode */
\r
1684 } else if (preLength == 1) {
\r
1686 } else /* preLength==0 */{
\r
1687 if (srcLength > 1) {
\r
1694 /* we must not remember fallback matches when not using fallbacks */
\r
1696 /* match input units until there is a full match or the input is consumed */
\r
1698 /* go to the next section */
\r
1699 int oldpos = toUTable.position();
\r
1700 toUSection = ((IntBuffer) toUTable.position(index)).slice();
\r
1701 toUTable.position(oldpos);
\r
1703 /* read first pair of the section */
\r
1704 value = toUSection.get();
\r
1705 length = TO_U_GET_BYTE(value);
\r
1706 value = TO_U_GET_VALUE(value);
\r
1707 if (value != 0 && (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback))
\r
1708 && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
\r
1709 /* remember longest match so far */
\r
1710 matchValue = value;
\r
1711 matchLength = i + j;
\r
1714 /* match pre[] then src[] */
\r
1715 if (i < preLength) {
\r
1716 b = (short) (preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1717 } else if (j < srcLength) {
\r
1718 b = (short) (source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
1720 /* all input consumed, partial match */
\r
1721 if (flush || (length = (i + j)) > MAX_BYTES) {
\r
1723 * end of the entire input stream, stop with the longest match so far or: partial match must not
\r
1724 * be longer than UCNV_EXT_MAX_BYTES because it must fit into state buffers
\r
1728 /* continue with more input next time */
\r
1733 /* search for the current UChar */
\r
1734 value = findToU(toUSection, length, b);
\r
1736 /* no match here, stop with the longest match so far */
\r
1739 if (TO_U_IS_PARTIAL(value)) {
\r
1740 /* partial match, continue */
\r
1741 index = TO_U_GET_PARTIAL_INDEX(value);
\r
1743 if ((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) {
\r
1744 /* full match, stop with result */
\r
1745 matchValue = value;
\r
1746 matchLength = i + j;
\r
1748 /* full match on fallback not taken, stop with the longest match so far */
\r
1755 if (matchLength == 0) {
\r
1756 /* no match at all */
\r
1760 /* return result */
\r
1761 pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue);
\r
1762 return matchLength;
\r
1765 private CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex) {
\r
1766 ByteBuffer cx = sharedData.mbcs.extIndexes;
\r
1767 /* output the result */
\r
1768 if (TO_U_IS_CODE_POINT(value)) {
\r
1769 /* output a single code point */
\r
1770 return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex);
\r
1772 /* output a string - with correct data we have resultLength>0 */
\r
1774 char[] a = new char[TO_U_GET_LENGTH(value)];
\r
1775 CharBuffer cb = ((CharBuffer) ARRAY(cx, EXT_TO_U_UCHARS_INDEX, char.class));
\r
1776 cb.position(TO_U_GET_INDEX(value));
\r
1777 cb.get(a, 0, a.length);
\r
1778 return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex);
\r
1782 private CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) {
\r
1783 CoderResult cr = CoderResult.UNDERFLOW;
\r
1784 int tBeginIndex = target.position();
\r
1786 if (target.hasRemaining()) {
\r
1787 if (c <= 0xffff) {
\r
1788 target.put((char) c);
\r
1789 c = UConverterConstants.U_SENTINEL;
\r
1790 } else /* c is a supplementary code point */{
\r
1791 target.put(UTF16.getLeadSurrogate(c));
\r
1792 c = UTF16.getTrailSurrogate(c);
\r
1793 if (target.hasRemaining()) {
\r
1794 target.put((char) c);
\r
1795 c = UConverterConstants.U_SENTINEL;
\r
1799 /* write offsets */
\r
1800 if (offsets != null) {
\r
1801 offsets.put(sourceIndex);
\r
1802 if ((tBeginIndex + 1) < target.position()) {
\r
1803 offsets.put(sourceIndex);
\r
1808 /* write overflow from c */
\r
1810 charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c);
\r
1811 cr = CoderResult.OVERFLOW;
\r
1818 * Input sequence: cnv->toUBytes[0..length[ @return if(U_FAILURE) return the length (toULength, byteIndex) for
\r
1819 * the input else return 0 after output has been written to the target
\r
1821 private int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex,
\r
1822 boolean flush, CoderResult[] cr) {
\r
1825 if (sharedData.mbcs.extIndexes != null
\r
1826 && initialMatchToU(length, source, target, offsets, sourceIndex, flush, cr)) {
\r
1827 return 0; /* an extension mapping handled the input */
\r
1831 if (length == 4 && (options & MBCS_OPTION_GB18030) != 0) {
\r
1836 linear = LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]);
\r
1837 for (i = 0; i < gb18030Ranges.length; ++i) {
\r
1838 range = gb18030Ranges[i];
\r
1839 if (range[2] <= linear && linear <= range[3]) {
\r
1840 /* found the sequence, output the Unicode code point for it */
\r
1841 cr[0] = CoderResult.UNDERFLOW;
\r
1843 /* add the linear difference between the input and start sequences to the start code point */
\r
1844 linear = range[0] + (linear - range[2]);
\r
1846 /* output this code point */
\r
1847 cr[0] = toUWriteCodePoint((int) linear, target, offsets, sourceIndex);
\r
1855 cr[0] = CoderResult.unmappableForLength(length);
\r
1860 * target<targetLimit; set error code for overflow
\r
1862 private boolean initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets,
\r
1863 int srcIndex, boolean flush, CoderResult[] cr) {
\r
1864 int[] value = new int[1];
\r
1867 /* try to match */
\r
1868 match = matchToU((byte) SISO_STATE(sharedData, mode), toUBytesArray, toUBytesBegin, firstLength, source,
\r
1869 value, isToUUseFallback(), flush);
\r
1871 /* advance src pointer for the consumed input */
\r
1872 source.position(source.position() + match - firstLength);
\r
1874 /* write result to target */
\r
1875 cr[0] = writeToU(value[0], target, offsets, srcIndex);
\r
1877 } else if (match < 0) {
\r
1878 /* save state for partial match */
\r
1883 /* copy the first code point */
\r
1884 sArray = toUBytesArray;
\r
1885 sArrayIndex = toUBytesBegin;
\r
1886 preToUFirstLength = (byte) firstLength;
\r
1887 for (j = 0; j < firstLength; ++j) {
\r
1888 preToUArray[j] = sArray[sArrayIndex++];
\r
1891 /* now copy the newly consumed input */
\r
1892 sArrayIndex = source.position();
\r
1894 for (; j < match; ++j) {
\r
1895 preToUArray[j] = source.get(sArrayIndex++);
\r
1897 source.position(sArrayIndex);
\r
1898 preToULength = (byte) match;
\r
1900 } else /* match==0 no match */{
\r
1905 private int simpleMatchToU(ByteBuffer source, boolean useFallback) {
\r
1906 int[] value = new int[1];
\r
1909 if (source.remaining() <= 0) {
\r
1913 /* try to match */
\r
1914 match = matchToU((byte) -1, source.array(), source.position(), source.limit(), null, value, useFallback, true);
\r
1916 if (match == (source.limit() - source.position())) {
\r
1917 /* write result for simple, single-character conversion */
\r
1918 if (TO_U_IS_CODE_POINT(value[0])) {
\r
1919 return TO_U_GET_CODE_POINT(value[0]);
\r
1924 * return no match because - match>0 && value points to string: simple conversion cannot handle multiple
\r
1925 * code points - match>0 && match!=length: not all input consumed, forbidden for this function - match==0:
\r
1926 * no match found in the first place - match<0: partial match, not supported for simple conversion (and
\r
// Converts bytes from `source` to UTF-16 code units in `target` by walking the MBCS state table,
// optionally recording one source offset per output code unit in `offsets` (skipped when null).
// The status is accumulated in cr[0]; the converter state (offset, byteIndex) is written back to
// toUnicodeStatus/toULength and the source position is updated at the end.
// NOTE(review): the embedded line numbers in this listing skip, so some statements (closing
// braces, else/break/continue lines, a few declarations and the final return) are not visible here.
1932 CoderResult cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
1933 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
1935 int sourceArrayIndex, sourceArrayIndexStart;
\r
1936 int stateTable[][/* 256 */];
\r
1937 char[] unicodeCodeUnits;
\r
1944 int sourceIndex, nextSourceIndex;
\r
// A pending partial extension match from an earlier call is continued first.
1950 if (preToULength > 0) {
\r
1952 * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change with
\r
1953 * continuous offsets
\r
1955 cr[0] = continueMatchToU(source, target, offsets, -1, flush);
\r
1957 if (cr[0].isError() || preToULength < 0) {
\r
// Single-state tables are delegated to the single-byte implementations (BMP-only or full Unicode).
1962 if (sharedData.mbcs.countStates == 1) {
\r
1963 if ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
\r
1964 cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush);
\r
1966 cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush);
\r
1971 /* set up the local pointers */
\r
1972 sourceArrayIndex = sourceArrayIndexStart = source.position();
\r
1974 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
1975 stateTable = sharedData.mbcs.swapLFNLStateTable;
\r
1977 stateTable = sharedData.mbcs.stateTable;
\r
1979 unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
\r
1981 /* get the converter state from UConverter */
\r
1982 offset = (int)toUnicodeStatus;
\r
1983 byteIndex = toULength;
\r
1984 bytes = toUBytesArray;
\r
1987 * if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data
\r
1988 * (dbcsOnlyState==0 if it is not a DBCS-only converter)
\r
1990 state = (byte)mode;
\r
1992 state = sharedData.mbcs.dbcsOnlyState;
\r
1995 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
1996 sourceIndex = byteIndex == 0 ? 0 : -1;
\r
1997 nextSourceIndex = 0;
\r
1999 /* conversion loop */
\r
2000 while (sourceArrayIndex < source.limit()) {
\r
2002 * This following test is to see if available input would overflow the output. It does not catch output
\r
2003 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the
\r
2004 * last source byte. Therefore, those situations also test for overflows and will then break the loop,
\r
2007 if (!target.hasRemaining()) {
\r
2008 /* target is full */
\r
2009 cr[0] = CoderResult.OVERFLOW;
\r
2013 if (byteIndex == 0) {
\r
2014 /* optimized loop for 1/2-byte input and BMP output */
\r
2015 // agljport:todo see ucnvmbcs.c for deleted block
\r
2017 entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK];
\r
2018 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
2019 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry);
\r
2020 offset = MBCS_ENTRY_TRANSITION_OFFSET(entry);
\r
2021 ++sourceArrayIndex;
\r
// Two-byte fast path: the next byte must exist and yield a final VALID_16 entry whose code unit is below 0xfffe.
2022 if (sourceArrayIndex < source.limit()
\r
2023 && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK])
\r
2024 && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16
\r
2025 && (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) {
\r
2026 ++sourceArrayIndex;
\r
2028 if (offsets != null) {
\r
2029 offsets.put(sourceIndex);
\r
2030 sourceIndex = (nextSourceIndex += 2);
\r
2032 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
\r
2035 /* set the state and leave the optimized loop */
\r
2036 ++nextSourceIndex;
\r
2037 bytes[0] = source.get(sourceArrayIndex - 1);
\r
2042 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
\r
2043 /* output BMP code point */
\r
2044 ++sourceArrayIndex;
\r
2045 target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2046 if (offsets != null) {
\r
2047 offsets.put(sourceIndex);
\r
2048 sourceIndex = ++nextSourceIndex;
\r
2050 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
\r
2052 /* leave the optimized loop */
\r
2056 } while (sourceArrayIndex < source.limit() && target.hasRemaining());
\r
2058 * these tests and break statements could be put inside the loop if C had "break outerLoop" like
\r
2061 if (sourceArrayIndex >= source.limit()) {
\r
2064 if (!target.hasRemaining()) {
\r
2065 /* target is full */
\r
2066 cr[0] = CoderResult.OVERFLOW;
\r
2070 ++nextSourceIndex;
\r
2071 bytes[byteIndex++] = source.get(sourceArrayIndex++);
\r
2072 } else /* byteIndex>0 */{
\r
2073 ++nextSourceIndex;
\r
// The consumed byte is both stored in bytes[] and used as the column index into the state row.
2074 entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++))
\r
2075 & UConverterConstants.UNSIGNED_BYTE_MASK];
\r
2078 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
2079 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry);
\r
2080 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
\r
2084 /* save the previous state for proper extension mapping with SI/SO-stateful converters */
\r
2087 /* set the next state early so that we can reuse the entry variable */
\r
2088 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
\r
2091 * An if-else-if chain provides more reliable performance for the most common cases compared to a
\r
2094 action = (byte)MBCS_ENTRY_FINAL_ACTION(entry);
\r
2095 if (action == MBCS_STATE_VALID_16) {
\r
2096 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
2097 c = unicodeCodeUnits[offset];
\r
2099 /* output BMP code point */
\r
2101 if (offsets != null) {
\r
2102 offsets.put(sourceIndex);
\r
2105 } else if (c == 0xfffe) {
\r
// `entry` is reused here to hold the fallback code point returned by getFallback().
2106 if (isFallbackUsed() && (entry = (int)getFallback(sharedData.mbcs, offset)) != 0xfffe) {
\r
2107 /* output fallback BMP code point */
\r
2108 target.put((char)entry);
\r
2109 if (offsets != null) {
\r
2110 offsets.put(sourceIndex);
\r
2115 /* callback(illegal) */
\r
2116 cr[0] = CoderResult.malformedForLength(byteIndex);
\r
2118 } else if (action == MBCS_STATE_VALID_DIRECT_16) {
\r
2119 /* output BMP code point */
\r
2120 target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2121 if (offsets != null) {
\r
2122 offsets.put(sourceIndex);
\r
2125 } else if (action == MBCS_STATE_VALID_16_PAIR) {
\r
2126 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
2127 c = unicodeCodeUnits[offset++];
\r
2129 /* output BMP code point below 0xd800 */
\r
2131 if (offsets != null) {
\r
2132 offsets.put(sourceIndex);
\r
2135 } else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) {
\r
2136 /* output roundtrip or fallback surrogate pair */
\r
2137 target.put((char)(c & 0xdbff));
\r
2138 if (offsets != null) {
\r
2139 offsets.put(sourceIndex);
\r
2142 if (target.hasRemaining()) {
\r
2143 target.put(unicodeCodeUnits[offset]);
\r
2144 if (offsets != null) {
\r
2145 offsets.put(sourceIndex);
\r
2148 /* target overflow */
\r
2149 charErrorBufferArray[0] = unicodeCodeUnits[offset];
\r
2150 charErrorBufferLength = 1;
\r
2151 cr[0] = CoderResult.OVERFLOW;
\r
2156 } else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
\r
2157 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
\r
2158 target.put(unicodeCodeUnits[offset]);
\r
2159 if (offsets != null) {
\r
2160 offsets.put(sourceIndex);
\r
2163 } else if (c == 0xffff) {
\r
2164 /* callback(illegal) */
\r
2165 cr[0] = CoderResult.malformedForLength(byteIndex);
\r
2167 } else if (action == MBCS_STATE_VALID_DIRECT_20
\r
2168 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {
\r
2169 entry = MBCS_ENTRY_FINAL_VALUE(entry);
\r
2170 /* output surrogate pair */
\r
// NOTE(review): signed >> is used here while cnvMBCSSingleToUnicodeWithOffsets uses >>> for the
// same lead-surrogate computation - confirm the value can never be negative at this point.
2171 target.put((char)(0xd800 | (char)(entry >> 10)));
\r
2172 if (offsets != null) {
\r
2173 offsets.put(sourceIndex);
\r
2176 c = (char)(0xdc00 | (char)(entry & 0x3ff));
\r
2177 if (target.hasRemaining()) {
\r
2179 if (offsets != null) {
\r
2180 offsets.put(sourceIndex);
\r
2183 /* target overflow */
\r
2184 charErrorBufferArray[0] = c;
\r
2185 charErrorBufferLength = 1;
\r
2186 cr[0] = CoderResult.OVERFLOW;
\r
2191 } else if (action == MBCS_STATE_CHANGE_ONLY) {
\r
2193 * This serves as a state change without any output. It is useful for reading simple stateful
\r
2194 * encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used
\r
2195 * for more sophisticated state transitions.
\r
2197 if (sharedData.mbcs.dbcsOnlyState == 0) {
\r
2200 /* SI/SO are illegal for DBCS-only conversion */
\r
2201 state = (byte)(mode); /* restore the previous state */
\r
2203 /* callback(illegal) */
\r
2204 cr[0] = CoderResult.malformedForLength(byteIndex);
\r
2206 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
\r
2207 if (isFallbackUsed()) {
\r
2208 /* output BMP code point */
\r
2209 target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2210 if (offsets != null) {
\r
2211 offsets.put(sourceIndex);
\r
2215 } else if (action == MBCS_STATE_UNASSIGNED) {
\r
2216 /* just fall through */
\r
2217 } else if (action == MBCS_STATE_ILLEGAL) {
\r
2218 /* callback(illegal) */
\r
2219 cr[0] = CoderResult.malformedForLength(byteIndex);
\r
2221 /* reserved, must never occur */
\r
2225 /* end of action codes: prepare for a new character */
\r
2228 if (byteIndex == 0) {
\r
2229 sourceIndex = nextSourceIndex;
\r
2230 } else if (cr[0].isError()) {
\r
2231 /* callback(illegal) */
\r
2232 if (byteIndex > 1) {
\r
2234 * Ticket 5691: consistent illegal sequences:
\r
2235 * - We include at least the first byte in the illegal sequence.
\r
2236 * - If any of the non-initial bytes could be the start of a character,
\r
2237 * we stop the illegal sequence before the first one of those.
\r
2239 boolean isDBCSOnly = (sharedData.mbcs.dbcsOnlyState != 0);
\r
// Empty-bodied scan: i stops at the first non-initial byte that isSingleOrLead() accepts.
2241 for (i = 1; i < byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, (short)(bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK)); i++) {}
\r
2242 if (i < byteIndex) {
\r
2243 byte backOutDistance = (byte)(byteIndex - i);
\r
2244 int bytesFromThisBuffer = sourceArrayIndex - sourceArrayIndexStart;
\r
2245 byteIndex = i; /* length of reported illegal byte sequence */
\r
2246 if (backOutDistance <= bytesFromThisBuffer) {
\r
2247 sourceArrayIndex -= backOutDistance;
\r
2249 /* Back out bytes from the previous buffer: Need to replay them. */
\r
// The negative length stored below is the count of bytes copied into preToUArray by the loop that follows.
2250 this.preToULength = (byte)(bytesFromThisBuffer - backOutDistance);
\r
2251 /* preToULength is negative! */
\r
2252 for (int n = 0; n < -this.preToULength; n++) {
\r
2253 this.preToUArray[n] = bytes[i+n];
\r
2255 sourceArrayIndex = sourceArrayIndexStart;
\r
2260 } else /* unassigned sequences indicated with byteIndex>0 */{
\r
2261 /* try an extension mapping */
\r
2262 int sourceBeginIndex = sourceArrayIndex;
\r
// toU() works on the buffer's own position, so it is synced before the call and read back after.
2263 source.position(sourceArrayIndex);
\r
2264 byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr);
\r
2265 sourceArrayIndex = source.position();
\r
2266 sourceIndex = nextSourceIndex += (int)(sourceArrayIndex - sourceBeginIndex);
\r
2268 if (cr[0].isError() || cr[0].isOverflow()) {
\r
2269 /* not mappable or buffer overflow */
\r
2275 /* set the converter state back into UConverter */
\r
2276 toUnicodeStatus = offset;
\r
2278 toULength = byteIndex;
\r
2280 /* write back the updated pointers */
\r
2281 source.position(sourceArrayIndex);
\r
2286 * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages that
\r
2287 * only map to and from the BMP. In addition to single-byte optimizations, the offset calculations become much
\r
// Single-byte, single-state conversion to BMP code units: every input byte is looked up in row 0
// of the state table. Offsets are written in batches rather than per character. The status is
// accumulated in cr[0] and the source position is written back at the end.
// NOTE(review): the embedded line numbers in this listing skip, so some statements (closing
// braces, else/break/continue lines, a few declarations and the final return) are not visible here.
2290 private CoderResult cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets,
\r
2292 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
2294 int sourceArrayIndex, lastSource;
\r
2295 int targetCapacity, length;
\r
2296 int[][] stateTable;
\r
2303 /* set up the local pointers */
\r
2304 sourceArrayIndex = source.position();
\r
2305 targetCapacity = target.remaining();
\r
2307 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
2308 stateTable = sharedData.mbcs.swapLFNLStateTable;
\r
2310 stateTable = sharedData.mbcs.stateTable;
\r
2313 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
// lastSource marks where the not-yet-recorded run of offsets begins.
2315 lastSource = sourceArrayIndex;
\r
2318 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
\r
2319 * sourceLength and targetCapacity
\r
2321 length = source.remaining();
\r
2322 if (length < targetCapacity) {
\r
2323 targetCapacity = length;
\r
2326 /* conversion loop */
\r
2327 while (targetCapacity > 0) {
\r
2328 entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK];
\r
2329 /* MBCS_ENTRY_IS_FINAL(entry) */
\r
2331 /* test the most common case first */
\r
2332 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
\r
2333 /* output BMP code point */
\r
2334 target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2340 * An if-else-if chain provides more reliable performance for the most common cases compared to a
\r
2343 action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
\r
2344 if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
\r
2345 if (isFallbackUsed()) {
\r
2346 /* output BMP code point */
\r
2347 target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2351 } else if (action == MBCS_STATE_UNASSIGNED) {
\r
2352 /* just fall through */
\r
2353 } else if (action == MBCS_STATE_ILLEGAL) {
\r
2354 /* callback(illegal) */
\r
2355 cr[0] = CoderResult.malformedForLength(sourceArrayIndex - lastSource);
\r
2357 /* reserved, must never occur */
\r
2361 /* set offsets since the start or the last extension */
\r
2362 if (offsets != null) {
\r
2363 int count = sourceArrayIndex - lastSource;
\r
2365 /* predecrement: do not set the offset for the callback-causing character */
\r
2366 while (--count > 0) {
\r
2367 offsets.put(sourceIndex++);
\r
2369 /* offset and sourceIndex are now set for the current character */
\r
2372 if (cr[0].isError()) {
\r
2373 /* callback(illegal) */
\r
2375 } else /* unassigned sequences indicated with byteIndex>0 */{
\r
2376 /* try an extension mapping */
\r
2377 lastSource = sourceArrayIndex;
\r
2378 toUBytesArray[0] = source.get(sourceArrayIndex - 1);
\r
// toU() works on the buffer's own position, so it is synced before the call and read back after.
2379 source.position(sourceArrayIndex);
\r
2380 toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr);
\r
2381 sourceArrayIndex = source.position();
\r
2382 sourceIndex += 1 + (int) (sourceArrayIndex - lastSource);
\r
// NOTE(review): the comment below mentions buffer overflow, but only isError() is tested here;
// cnvMBCSToUnicodeWithOffsets also tests isOverflow() at the equivalent point - confirm intent.
2384 if (cr[0].isError()) {
\r
2385 /* not mappable or buffer overflow */
\r
2389 /* recalculate the targetCapacity after an extension mapping */
\r
2390 targetCapacity = target.remaining();
\r
2391 length = source.remaining();
\r
2392 if (length < targetCapacity) {
\r
2393 targetCapacity = length;
\r
2398 if (!cr[0].isError() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
\r
2399 /* target is full */
\r
2400 cr[0] = CoderResult.OVERFLOW;
\r
2403 /* set offsets since the start or the last callback */
\r
2404 if (offsets != null) {
\r
2405 int count = sourceArrayIndex - lastSource;
\r
2406 while (count > 0) {
\r
2407 offsets.put(sourceIndex++);
\r
2412 /* write back the updated pointers */
\r
2413 source.position(sourceArrayIndex);
\r
2418 /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
\r
// Single-byte, single-state conversion that can also emit surrogate pairs: every input byte is
// looked up in row 0 of the state table. An offset is recorded per output code unit when
// `offsets` is non-null. The status is accumulated in cr[0] and the source position is written
// back at the end.
// NOTE(review): the embedded line numbers in this listing skip, so some statements (closing
// braces, else/break/continue lines, a few declarations and the final return) are not visible here.
2419 private CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets,
\r
2421 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
2423 int sourceArrayIndex;
\r
2424 int[][] stateTable;
\r
2432 /* set up the local pointers */
\r
2433 sourceArrayIndex = source.position();
\r
2435 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
2436 stateTable = sharedData.mbcs.swapLFNLStateTable;
\r
2438 stateTable = sharedData.mbcs.stateTable;
\r
2441 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
2444 /* conversion loop */
\r
2445 while (sourceArrayIndex < source.limit()) {
\r
2447 * This following test is to see if available input would overflow the output. It does not catch output
\r
2448 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the
\r
2449 * last source byte. Therefore, those situations also test for overflows and will then break the loop,
\r
2452 if (!target.hasRemaining()) {
\r
2453 /* target is full */
\r
2454 cr[0] = CoderResult.OVERFLOW;
\r
2458 entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK];
\r
2459 /* MBCS_ENTRY_IS_FINAL(entry) */
\r
2461 /* test the most common case first */
\r
2462 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
\r
2463 /* output BMP code point */
\r
2464 target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2465 if (offsets != null) {
\r
2466 offsets.put(sourceIndex);
\r
2469 /* normal end of action codes: prepare for a new character */
\r
2475 * An if-else-if chain provides more reliable performance for the most common cases compared to a
\r
2478 action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
\r
2479 if (action == MBCS_STATE_VALID_DIRECT_20
\r
2480 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {
\r
2482 entry = MBCS_ENTRY_FINAL_VALUE(entry);
\r
2483 /* output surrogate pair */
\r
2484 target.put((char) (0xd800 | (char) (entry >>> 10)));
\r
2485 if (offsets != null) {
\r
2486 offsets.put(sourceIndex);
\r
2488 c = (char) (0xdc00 | (char) (entry & 0x3ff));
\r
2489 if (target.hasRemaining()) {
\r
2491 if (offsets != null) {
\r
2492 offsets.put(sourceIndex);
\r
2495 /* target overflow */
\r
// The trail surrogate that did not fit is parked in the error buffer before OVERFLOW is reported.
2496 charErrorBufferArray[0] = c;
\r
2497 charErrorBufferLength = 1;
\r
2498 cr[0] = CoderResult.OVERFLOW;
\r
2504 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
\r
2505 if (isFallbackUsed()) {
\r
2506 /* output BMP code point */
\r
2507 target.put((char) MBCS_ENTRY_FINAL_VALUE_16(entry));
\r
2508 if (offsets != null) {
\r
2509 offsets.put(sourceIndex);
\r
2515 } else if (action == MBCS_STATE_UNASSIGNED) {
\r
2516 /* just fall through */
\r
2517 } else if (action == MBCS_STATE_ILLEGAL) {
\r
2518 /* callback(illegal) */
\r
2519 cr[0] = CoderResult.malformedForLength(1);
\r
2521 /* reserved, must never occur */
\r
2526 if (cr[0].isError()) {
\r
2527 /* callback(illegal) */
\r
2529 } else /* unassigned sequences indicated with byteIndex>0 */{
\r
2530 /* try an extension mapping */
\r
2531 int sourceBeginIndex = sourceArrayIndex;
\r
2532 toUBytesArray[0] = source.get(sourceArrayIndex - 1);
\r
// toU() works on the buffer's own position, so it is synced before the call and read back after.
2533 source.position(sourceArrayIndex);
\r
2534 toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr);
\r
2535 sourceArrayIndex = source.position();
\r
2536 sourceIndex += 1 + (int) (sourceArrayIndex - sourceBeginIndex);
\r
// NOTE(review): the comment below mentions buffer overflow, but only isError() is tested here;
// cnvMBCSToUnicodeWithOffsets also tests isOverflow() at the equivalent point - confirm intent.
2538 if (cr[0].isError()) {
\r
2539 /* not mappable or buffer overflow */
\r
2545 /* write back the updated pointers */
\r
2546 source.position(sourceArrayIndex);
\r
2551 private int getFallback(UConverterMBCSTable mbcsTable, int offset) {
\r
2552 MBCSToUFallback[] toUFallbacks;
\r
2553 int i, start, limit;
\r
2555 limit = mbcsTable.countToUFallbacks;
\r
2557 /* do a binary search for the fallback mapping */
\r
2558 toUFallbacks = mbcsTable.toUFallbacks;
\r
2560 while (start < limit - 1) {
\r
2561 i = (start + limit) / 2;
\r
2562 if (offset < toUFallbacks[i].offset) {
\r
2569 /* did we really find it? */
\r
2570 if (offset == toUFallbacks[start].offset) {
\r
2571 return toUFallbacks[start].codePoint;
\r
2579 * This is a simple version of _MBCSGetNextUChar() that is used by other converter implementations. It only
\r
2580 * returns an "assigned" result if it consumes the entire input. It does not use state from the converter, nor
\r
2581 * error codes. It does not handle the EBCDIC swaplfnl option (set in UConverter). It handles conversion
\r
2582 * extensions but not GB 18030.
\r
2584 * @return U+fffe unassigned U+ffff illegal otherwise the Unicode code point
\r
// Converts the bytes between position and limit of `source` to a single code point by walking the
// state table from dbcsOnlyState, falling back to simpleMatchToU() when the base table yields 0xfffe.
// NOTE(review): the embedded line numbers in this listing skip, so some statements (closing
// braces, else/return lines, a few declarations and the loop header) are not visible here.
2586 int simpleGetNextUChar(ByteBuffer source, boolean useFallback) {
\r
2590 // * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
\r
2591 // * TODO In future releases, verify that this function is never called for SBCS
\r
2592 // * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.
\r
2593 // * Removal improves code coverage.
\r
2595 // /* use optimized function if possible */
\r
2596 // if(sharedData->mbcs.countStates==1) {
\r
2597 // if(length==1) {
\r
2598 // return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
\r
2600 // return 0xffff; /* illegal: more than a single byte for an SBCS converter */
\r
2605 /* set up the local pointers */
\r
2606 int[][] stateTable = sharedData.mbcs.stateTable;
\r
2607 char[] unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits;
\r
2609 /* converter state */
\r
2611 int state = sharedData.mbcs.dbcsOnlyState;
\r
// Absolute reads via index i are used below, so the buffer's position is not advanced by the loop.
2616 int i = source.position();
\r
2617 int length = source.limit() - i;
\r
2619 /* conversion loop */
\r
2621 // entry=stateTable[state][(uint8_t)source[i++]];
\r
2622 entry = stateTable[state][source.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK];
\r
2624 if (MBCS_ENTRY_IS_TRANSITION(entry)) {
\r
2625 state = MBCS_ENTRY_TRANSITION_STATE(entry);
\r
2626 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry);
\r
2628 if (i == source.limit()) {
\r
2629 return 0xffff; /* truncated character */
\r
2633 * An if-else-if chain provides more reliable performance for the most common cases compared to a
\r
2636 action = MBCS_ENTRY_FINAL_ACTION(entry);
\r
2637 if (action == MBCS_STATE_VALID_16) {
\r
2638 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
2639 c = unicodeCodeUnits[offset];
\r
2640 if (c != 0xfffe) {
\r
// NOTE(review): this branch (and the VALID_16_PAIR tests below) calls isToUUseFallback() without
// the useFallback argument, unlike the FALLBACK_DIRECT_16/20 branches - confirm this is intended.
2642 } else if (isToUUseFallback()) {
\r
2643 c = getFallback(sharedData.mbcs, offset);
\r
2645 /* else done with 0xfffe */
\r
2646 } else if (action == MBCS_STATE_VALID_DIRECT_16) {
\r
2647 // /* output BMP code point */
\r
2648 c = MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
2649 } else if (action == MBCS_STATE_VALID_16_PAIR) {
\r
2650 offset += MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
2651 c = unicodeCodeUnits[offset++];
\r
2653 /* output BMP code point below 0xd800 */
\r
2654 } else if (isToUUseFallback() ? c <= 0xdfff : c <= 0xdbff) {
\r
2655 /* output roundtrip or fallback supplementary code point */
\r
// Combines the stored lead unit (low 10 bits) with the following trail unit into a supplementary code point.
2656 c = (((c & 0x3ff) << 10) + unicodeCodeUnits[offset] + (0x10000 - 0xdc00));
\r
2657 } else if (isToUUseFallback() ? (c & 0xfffe) == 0xe000 : c == 0xe000) {
\r
2658 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
\r
2659 c = unicodeCodeUnits[offset];
\r
2660 } else if (c == 0xffff) {
\r
2665 } else if (action == MBCS_STATE_VALID_DIRECT_20) {
\r
2666 /* output supplementary code point */
\r
2667 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry);
\r
2668 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
\r
2669 if (!isToUUseFallback(useFallback)) {
\r
2672 /* output BMP code point */
\r
2673 c = MBCS_ENTRY_FINAL_VALUE_16(entry);
\r
2675 } else if (action == MBCS_STATE_FALLBACK_DIRECT_20) {
\r
2676 if (!isToUUseFallback(useFallback)) {
\r
2679 /* output supplementary code point */
\r
2680 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry);
\r
2682 } else if (action == MBCS_STATE_UNASSIGNED) {
\r
2686 * forbid MBCS_STATE_CHANGE_ONLY for this function, and MBCS_STATE_ILLEGAL and reserved action
\r
2695 if (i != source.limit()) {
\r
2696 /* illegal for this function: not all input consumed */
\r
2700 if (c == 0xfffe) {
\r
2701 /* try an extension mapping */
\r
2702 if (sharedData.mbcs.extIndexes != null) {
\r
2703 /* Increase the limit for proper handling. Used in LMBCS. */
\r
// NOTE(review): the visible code only ever lowers the limit (to i + length), although the comment
// above says "increase"; with i already advanced this condition may never hold - verify.
2704 if (source.limit() > i + length) {
\r
2705 source.limit(i + length);
\r
2707 return simpleMatchToU(source, useFallback);
\r
2713 private boolean hasValidTrailBytes(int[][] stateTable, short state) {
\r
2714 int[] row = stateTable[state];
\r
2716 /* First test for final entries in this state for some commonly valid byte values. */
\r
2717 entry = row[0xa1];
\r
2718 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) {
\r
2721 entry = row[0x41];
\r
2722 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) {
\r
2725 /* Then test for final entries in this state. */
\r
2726 for (b = 0; b <= 0xff; b++) {
\r
2728 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) {
\r
2732 /* Then recurse for transition entries. */
\r
2733 for (b = 0; b <= 0xff; b++) {
\r
2735 if (MBCS_ENTRY_IS_TRANSITION(entry) &&
\r
2736 hasValidTrailBytes(stateTable, (short)(MBCS_ENTRY_TRANSITION_STATE(entry) & UConverterConstants.UNSIGNED_BYTE_MASK))) {
\r
2743 private boolean isSingleOrLead(int[][] stateTable, int state, boolean isDBCSOnly, int b) {
\r
2744 int[] row = stateTable[state];
\r
2745 int entry = row[b];
\r
2746 if (MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
\r
2747 return hasValidTrailBytes(stateTable, (short)(MBCS_ENTRY_TRANSITION_STATE(entry) & UConverterConstants.UNSIGNED_BYTE_MASK));
\r
2749 short action = (short)(MBCS_ENTRY_FINAL_ACTION(entry) & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
2750 if (action == MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
\r
2751 return false; /* SI/SO are illegal for DBCS-only conversion */
\r
2753 return (action != MBCS_STATE_ILLEGAL);
\r
2761 class CharsetEncoderMBCS extends CharsetEncoderICU {
\r
2762 private boolean allowReplacementChanges = false;
\r
// Creates the MBCS encoder for charset `cs`, passing the enclosing charset's fromUSubstitution
// bytes to the superclass constructor.
// NOTE(review): the embedded line numbers skip after 2766, so the remainder of the constructor
// body and its closing brace are not visible in this listing.
2764 CharsetEncoderMBCS(CharsetICU cs) {
\r
2765 super(cs, fromUSubstitution);
\r
// The flag is raised only after super(...) has returned.
2766 allowReplacementChanges = true; // allow changes in implReplaceWith
\r
2770 protected void implReset() {
\r
2771 super.implReset();
\r
2772 preFromUFirstCP = UConverterConstants.U_SENTINEL;
\r
2775 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
2777 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
2778 // if (!source.hasRemaining() && fromUChar32 == 0)
\r
2781 int sourceArrayIndex;
\r
2783 byte[] pArray, bytes;
\r
2784 int pArrayIndex, outputType, c;
\r
2785 int prevSourceIndex, sourceIndex, nextSourceIndex;
\r
2786 int stage2Entry = 0, value = 0, length = 0, prevLength;
\r
2788 // long asciiRoundtrips;
\r
2790 boolean gotoUnassigned = false;
\r
2794 if (!flush && preFromUFirstCP >= 0) {
\r
2796 * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change
\r
2797 * with continuous offsets
\r
2799 cr[0] = continueMatchFromU(source, target, offsets, flush, -1);
\r
2801 if (cr[0].isError() || preFromULength < 0) {
\r
2806 /* use optimized function if possible */
\r
2807 outputType = sharedData.mbcs.outputType;
\r
2808 uniMask = sharedData.mbcs.unicodeMask;
\r
2809 if (outputType == MBCS_OUTPUT_1 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
\r
2810 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
\r
2811 cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
\r
2813 cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
\r
2816 } else if (outputType == MBCS_OUTPUT_2) {
\r
2817 cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
\r
2821 table = sharedData.mbcs.fromUnicodeTable;
\r
2822 sourceArrayIndex = source.position();
\r
2824 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
2825 bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes;
\r
2827 bytes = sharedData.mbcs.fromUnicodeBytes;
\r
2830 // asciiRoundtrips = sharedData.mbcs.asciiRoundtrips;
\r
2832 /* get the converter state from UConverter */
\r
2835 if (outputType == MBCS_OUTPUT_2_SISO) {
\r
2836 prevLength = (int) fromUnicodeStatus;
\r
2837 if (prevLength == 0) {
\r
2838 /* set the real value */
\r
2842 /* prevent fromUnicodeStatus from being set to something non-0 */
\r
2846 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
2847 prevSourceIndex = -1;
\r
2848 sourceIndex = c == 0 ? 0 : -1;
\r
2849 nextSourceIndex = 0;
\r
2851 /* conversion loop */
\r
2853 * This is another piece of ugly code: A goto into the loop if the converter state contains a first
\r
2854 * surrogate from the previous function call. It saves me from performing in each loop iteration a check of
\r
2855 * if(c==0) and duplicating the trail-surrogate-handling code in the else branch of that check. I could
\r
2856 * not find any other way to get around this other than using a function call for the conversion and
\r
2857 * callback, which would be even more inefficient.
\r
2859 * Markus Scherer 2000-jul-19
\r
2861 boolean doloop = true;
\r
2862 boolean doread = true;
\r
2863 if (c != 0 && target.hasRemaining()) {
\r
2864 if (UTF16.isLeadSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
\r
2865 // c is a lead surrogate, read another input
\r
2866 SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
\r
2867 prevSourceIndex, prevLength);
\r
2868 doloop = getTrail(source, target, uniMask, x, flush, cr);
\r
2869 doread = x.doread;
\r
2871 sourceArrayIndex = x.sourceArrayIndex;
\r
2872 sourceIndex = x.sourceIndex;
\r
2873 nextSourceIndex = x.nextSourceIndex;
\r
2874 prevSourceIndex = x.prevSourceIndex;
\r
2875 prevLength = x.prevLength;
\r
2877 // c is not a lead surrogate, do not read another input
\r
2883 while (!doread || sourceArrayIndex < source.limit()) {
\r
2885 * This following test is to see if available input would overflow the output. It does not catch
\r
2886 * output of more than one byte that overflows as a result of a multi-byte character or callback
\r
2887 * output from the last source character. Therefore, those situations also test for overflows
\r
2888 * and will then break the loop, too.
\r
2890 if (target.hasRemaining()) {
\r
2892 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched
\r
2893 * surrogate pair for a "supplementary code point".
\r
2897 // doread might be false only on the first looping
\r
2899 c = source.get(sourceArrayIndex++);
\r
2900 ++nextSourceIndex;
\r
2903 * This also tests if the codepage maps single surrogates. If it does, then surrogates
\r
2904 * are not paired but mapped separately. Note that in this case unmatched surrogates are
\r
2907 if (UTF16.isSurrogate((char) c)
\r
2908 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
\r
2909 if (UTF16.isLeadSurrogate((char) c)) {
\r
2911 SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex,
\r
2912 nextSourceIndex, prevSourceIndex, prevLength);
\r
2913 doloop = getTrail(source, target, uniMask, x, flush, cr);
\r
2915 sourceArrayIndex = x.sourceArrayIndex;
\r
2916 sourceIndex = x.sourceIndex;
\r
2917 nextSourceIndex = x.nextSourceIndex;
\r
2918 prevSourceIndex = x.prevSourceIndex;
\r
2927 /* this is an unmatched trail code unit (2nd surrogate) */
\r
2928 /* callback(illegal) */
\r
2929 cr[0] = CoderResult.malformedForLength(1);
\r
2936 /* convert the Unicode code point in c into codepage bytes */
\r
2939 * The basic lookup is a triple-stage compact array (trie) lookup. For details see the
\r
2940 * beginning of this file.
\r
2942 * Single-byte codepages are handled with a different data structure by _MBCSSingle...
\r
2945 * The result consists of a 32-bit value from stage 2 and a pointer to as many bytes as are
\r
2946 * stored per character. The pointer points to the character's bytes in stage 3. Bits 15..0
\r
2947 * of the stage 2 entry contain the stage 3 index for that pointer, while bits 31..16 are
\r
2948 * flags for which of the 16 characters in the block are roundtrip-assigned.
\r
2950 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t respectively as
\r
2951 * uint32_t, in the platform encoding. For 3-byte codepages, the bytes are always stored in
\r
2952 * big-endian order.
\r
2954 * For EUC encodings that use only either 0x8e or 0x8f as the first byte of their longest
\r
2955 * byte sequences, the first two bytes in this third stage indicate with their 7th bits
\r
2956 * whether these bytes are to be written directly or actually need to be preceded by one of
\r
2957 * the two Single-Shift codes. With this, the third stage stores one byte fewer per
\r
2958 * character than the actual maximum length of EUC byte sequences.
\r
2960 * Other than that, leading zero bytes are removed and the other bytes output. A single zero
\r
2961 * byte may be output if the "assigned" bit in stage 2 was on. The data structure does not
\r
2962 * support zero byte output as a fallback, and also does not allow output of leading zeros.
\r
2964 stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
\r
2966 /* get the bytes and the length for the output */
\r
2967 switch (outputType) {
\r
2968 /* This is handled above with the method cnvMBCSDoubleFromUnicodeWithOffsets() */
\r
2969 /* case MBCS_OUTPUT_2:
\r
2970 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
\r
2971 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
2977 case MBCS_OUTPUT_2_SISO:
\r
2978 /* 1/2-byte stateful with Shift-In/Shift-Out */
\r
2980 * Save the old state in the converter object right here, then change the local
\r
2981 * prevLength state variable if necessary. Then, if this character turns out to be
\r
2982 * unassigned or a fallback that is not taken, the callback code must not save the new
\r
2983 * state in the converter because the new state is for a character that is not output.
\r
2984 * However, the callback must still restore the state from the converter in case the
\r
2985 * callback function changed it for its output.
\r
2987 fromUnicodeStatus = prevLength; /* save the old state */
\r
2988 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
\r
2989 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
2990 if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) == false) {
\r
2991 /* no mapping, leave value==0 */
\r
2993 } else if (prevLength <= 1) {
\r
2996 /* change from double-byte mode to single-byte */
\r
2997 value |= UConverterConstants.SI << 8;
\r
3002 if (prevLength == 2) {
\r
3005 /* change from single-byte mode to double-byte */
\r
3006 value |= UConverterConstants.SO << 16;
\r
3012 case MBCS_OUTPUT_DBCS_ONLY:
\r
3013 /* table with single-byte results, but only DBCS mappings used */
\r
3014 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
\r
3015 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
3016 /* no mapping or SBCS result, not taken for DBCS-only */
\r
3017 value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
\r
3023 case MBCS_OUTPUT_3:
\r
3025 pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
\r
3026 value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
\r
3027 | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
\r
3028 | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
3029 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
3031 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
\r
3037 case MBCS_OUTPUT_4:
\r
3038 value = MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);
\r
3039 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
3041 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
\r
3043 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffffff) {
\r
3049 case MBCS_OUTPUT_3_EUC:
\r
3050 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
\r
3051 /* EUC 16-bit fixed-length representation */
\r
3052 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
3054 } else if ((value & 0x8000) == 0) {
\r
3055 value |= 0x8e8000;
\r
3057 } else if ((value & 0x80) == 0) {
\r
3058 value |= 0x8f0080;
\r
3064 case MBCS_OUTPUT_4_EUC:
\r
3066 pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
\r
3067 value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
\r
3068 | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
\r
3069 | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
3070 /* EUC 16-bit fixed-length representation applied to the first two bytes */
\r
3071 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
3073 } else if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xffff) {
\r
3075 } else if ((value & 0x800000) == 0) {
\r
3076 value |= 0x8e800000;
\r
3078 } else if ((value & 0x8000) == 0) {
\r
3079 value |= 0x8f008000;
\r
3086 /* must not occur */
\r
3088 * To avoid compiler warnings that value & length may be used without having been
\r
3089 * initialized, we set them here. In reality, this is unreachable code. Not having a
\r
3090 * default branch also causes warnings with some compilers.
\r
3092 value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
\r
3097 /* is this code point assigned, or do we use fallbacks? */
\r
3098 if (gotoUnassigned || (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0)))) {
\r
3099 gotoUnassigned = false;
\r
3101 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way
\r
3102 * with this data structure for fallback output to be a zero byte.
\r
3106 SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
\r
3107 prevSourceIndex, prevLength);
\r
3108 doloop = unassigned(source, target, offsets, x, flush, cr);
\r
3110 sourceArrayIndex = x.sourceArrayIndex;
\r
3111 sourceIndex = x.sourceIndex;
\r
3112 nextSourceIndex = x.nextSourceIndex;
\r
3113 prevSourceIndex = x.prevSourceIndex;
\r
3114 prevLength = x.prevLength;
\r
3121 /* write the output character bytes from value and length */
\r
3122 /* from the first if in the loop we know that targetCapacity>0 */
\r
3123 if (length <= target.remaining()) {
\r
3125 /* each branch falls through to the next one */
\r
3127 target.put((byte) (value >>> 24));
\r
3128 if (offsets != null) {
\r
3129 offsets.put(sourceIndex);
\r
3132 target.put((byte) (value >>> 16));
\r
3133 if (offsets != null) {
\r
3134 offsets.put(sourceIndex);
\r
3137 target.put((byte) (value >>> 8));
\r
3138 if (offsets != null) {
\r
3139 offsets.put(sourceIndex);
\r
3142 target.put((byte) value);
\r
3143 if (offsets != null) {
\r
3144 offsets.put(sourceIndex);
\r
3147 /* will never occur */
\r
3151 int errorBufferArrayIndex;
\r
3154 * We actually do this backwards here: In order to save an intermediate variable, we
\r
3155 * output first to the overflow buffer what does not fit into the regular target.
\r
3157 /* we know that 1<=targetCapacity<length<=4 */
\r
3158 length -= target.remaining();
\r
3160 errorBufferArrayIndex = 0;
\r
3162 /* each branch falls through to the next one */
\r
3164 errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 16);
\r
3166 errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 8);
\r
3168 errorBuffer[errorBufferArrayIndex] = (byte) value;
\r
3170 /* will never occur */
\r
3173 errorBufferLength = (byte) length;
\r
3175 /* now output what fits into the regular target */
\r
3176 value >>>= 8 * length; /* length was reduced by targetCapacity */
\r
3177 switch (target.remaining()) {
\r
3178 /* each branch falls through to the next one */
\r
3180 target.put((byte) (value >>> 16));
\r
3181 if (offsets != null) {
\r
3182 offsets.put(sourceIndex);
\r
3185 target.put((byte) (value >>> 8));
\r
3186 if (offsets != null) {
\r
3187 offsets.put(sourceIndex);
\r
3190 target.put((byte) value);
\r
3191 if (offsets != null) {
\r
3192 offsets.put(sourceIndex);
\r
3195 /* will never occur */
\r
3199 /* target overflow */
\r
3200 cr[0] = CoderResult.OVERFLOW;
\r
3205 /* normal end of conversion: prepare for a new character */
\r
3207 if (offsets != null) {
\r
3208 prevSourceIndex = sourceIndex;
\r
3209 sourceIndex = nextSourceIndex;
\r
3213 /* target is full */
\r
3214 cr[0] = CoderResult.OVERFLOW;
\r
3221 * the end of the input stream and detection of truncated input are handled by the framework, but for
\r
3222 * EBCDIC_STATEFUL conversion we need to emit an SI at the very end
\r
3224 * conditions: successful, EBCDIC_STATEFUL in DBCS mode, end of input and no truncated input
\r
3226 if (outputType == MBCS_OUTPUT_2_SISO && prevLength == 2 && flush && sourceArrayIndex >= source.limit()
\r
3229 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
\r
3230 if (target.hasRemaining()) {
\r
3231 target.put((byte) UConverterConstants.SI);
\r
3232 if (offsets != null) {
\r
3233 /* set the last source character's index (sourceIndex points at sourceLimit now) */
\r
3234 offsets.put(prevSourceIndex);
\r
3237 /* target is full */
\r
3238 errorBuffer[0] = (byte) UConverterConstants.SI;
\r
3239 errorBufferLength = 1;
\r
3240 cr[0] = CoderResult.OVERFLOW;
\r
3242 prevLength = 1; /* we switched into SBCS */
\r
3245 /* set the converter state back into UConverter */
\r
3247 fromUnicodeStatus = prevLength;
\r
3249 source.position(sourceArrayIndex);
\r
3250 } catch (BufferOverflowException ex) {
\r
3251 cr[0] = CoderResult.OVERFLOW;
\r
3258 * This is another simple conversion function for internal use by other conversion implementations. It does not
\r
3259 * use the converter state nor call callbacks. It does not handle the EBCDIC swaplfnl option (set in
\r
3260 * UConverter). It handles conversion extensions but not GB 18030.
\r
3262 * It converts one single Unicode code point into codepage bytes, encoded as one 32-bit value. The function
\r
3263 * returns the number of bytes in *pValue: 1..4: the number of bytes in *pValue; 0: unassigned (*pValue undefined);
\r
3264 * -1: illegal (currently not used, *pValue undefined)
\r
3266 * *pValue will contain the resulting bytes with the last byte in bits 7..0, the second to last byte in bits
\r
3267 * 15..8, etc. Currently, the function assumes but does not check that 0<=c<=0x10ffff.
\r
3269 int fromUChar32(int c, int[] pValue, boolean isUseFallback) {
\r
3271 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
\r
3272 // const uint8_t *p;
\r
3281 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
\r
3282 if (c <= 0xffff || ((sharedData.mbcs.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0)) {
\r
3283 table = sharedData.mbcs.fromUnicodeTable;
\r
3285 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
\r
3286 if (sharedData.mbcs.outputType == MBCS_OUTPUT_1) {
\r
3287 value = MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeBytes, c);
\r
3288 /* is this code point assigned, or do we use fallbacks? */
\r
3289 if (isUseFallback ? value >= 0x800 : value >= 0xc00) {
\r
3290 pValue[0] = value & 0xff;
\r
3293 } else /* outputType!=MBCS_OUTPUT_1 */{
\r
3294 stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
\r
3296 /* get the bytes and the length for the output */
\r
3297 switch (sharedData.mbcs.outputType) {
\r
3298 case MBCS_OUTPUT_2:
\r
3299 value = MBCS_VALUE_2_FROM_STAGE_2(sharedData.mbcs.fromUnicodeBytes, stage2Entry, c);
\r
3300 if (value <= 0xff) {
\r
3307 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */
\r
3308 // case MBCS_OUTPUT_DBCS_ONLY:
\r
3309 // /* table with single-byte results, but only DBCS mappings used */
\r
3310 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
\r
3311 // if(value<=0xff) {
\r
3312 // /* no mapping or SBCS result, not taken for DBCS-only */
\r
3313 // value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */
\r
3319 case MBCS_OUTPUT_3:
\r
3320 byte[] bytes = sharedData.mbcs.fromUnicodeBytes;
\r
3321 p = CharsetMBCS.MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
\r
3322 value = ((bytes[p] & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) |
\r
3323 ((bytes[p+1] & UConverterConstants.UNSIGNED_BYTE_MASK)<<8) |
\r
3324 (bytes[p+2] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
3325 if (value <= 0xff) {
\r
3327 } else if (value <= 0xffff) {
\r
3333 // case MBCS_OUTPUT_4:
\r
3334 // value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
\r
3335 // if(value<=0xff) {
\r
3337 // } else if(value<=0xffff) {
\r
3339 // } else if(value<=0xffffff) {
\r
3345 // case MBCS_OUTPUT_3_EUC:
\r
3346 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
\r
3347 // /* EUC 16-bit fixed-length representation */
\r
3348 // if(value<=0xff) {
\r
3350 // } else if((value&0x8000)==0) {
\r
3351 // value|=0x8e8000;
\r
3353 // } else if((value&0x80)==0) {
\r
3354 // value|=0x8f0080;
\r
3360 // case MBCS_OUTPUT_4_EUC:
\r
3361 // p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
\r
3362 // value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
\r
3363 // /* EUC 16-bit fixed-length representation applied to the first two bytes */
\r
3364 // if(value<=0xff) {
\r
3366 // } else if(value<=0xffff) {
\r
3368 // } else if((value&0x800000)==0) {
\r
3369 // value|=0x8e800000;
\r
3371 // } else if((value&0x8000)==0) {
\r
3372 // value|=0x8f008000;
\r
3380 /* must not occur */
\r
3384 /* is this code point assigned, or do we use fallbacks? */
\r
3385 if (MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)
\r
3386 || (CharsetEncoderICU.isFromUUseFallback(isUseFallback, c) && value != 0)) {
\r
3388 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way with
\r
3389 * this data structure for fallback output to be a zero byte.
\r
3392 pValue[0] = value;
\r
3398 if (sharedData.mbcs.extIndexes != null) {
\r
3399 length = simpleMatchFromU(c, pValue, isUseFallback);
\r
3400 return length >= 0 ? length : -length; /* return abs(length); */
\r
3408 * continue partial match with new input; requires cnv->preFromUFirstCP>=0; never called for simple,
\r
3409 * single-character conversion
\r
3411 private CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush,
\r
3413 CoderResult cr = CoderResult.UNDERFLOW;
\r
3414 int[] value = new int[1];
\r
3417 match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, value, useFallback, flush);
\r
3419 match -= 2; /* remove 2 for the initial code point */
\r
3421 if (match >= preFromULength) {
\r
3422 /* advance src pointer for the consumed input */
\r
3423 source.position(source.position() + match - preFromULength);
\r
3424 preFromULength = 0;
\r
3426 /* the match did not use all of preFromU[] - keep the rest for replay */
\r
3427 int length = preFromULength - match;
\r
3428 System.arraycopy(preFromUArray, preFromUBegin + match, preFromUArray, preFromUBegin, length);
\r
3429 preFromULength = (byte) -length;
\r
3432 /* finish the partial match */
\r
3433 preFromUFirstCP = UConverterConstants.U_SENTINEL;
\r
3435 /* write result */
\r
3436 writeFromU(value[0], target, offsets, srcIndex);
\r
3437 } else if (match < 0) {
\r
3438 /* save state for partial match */
\r
3442 /* just _append_ the newly consumed input to preFromU[] */
\r
3443 sArrayIndex = source.position();
\r
3444 match = -match - 2; /* remove 2 for the initial code point */
\r
3445 for (j = preFromULength; j < match; ++j) {
\r
3446 preFromUArray[j] = source.get(sArrayIndex++);
\r
3448 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
\r
3449 preFromULength = (byte) match;
\r
3450 } else { /* match==0 or 1 */
\r
3454 * We need to split the previous input into two parts:
\r
3456 * 1. The first code point is unmappable - that's how we got into trying the extension data in the first
\r
3457 * place. We need to move it from the preFromU buffer to the error buffer, set an error code, and
\r
3458 * prepare the rest of the previous input for 2.
\r
3460 * 2. The rest of the previous input must be converted once we come back from the callback for the first
\r
3461 * code point. At that time, we have to try again from scratch to convert these input characters. The
\r
3462 * replay will be handled by the ucnv.c conversion code.
\r
3466 /* matched, no mapping but request for <subchar1> */
\r
3467 useSubChar1 = true;
\r
3470 /* move the first code point to the error field */
\r
3471 fromUChar32 = preFromUFirstCP;
\r
3472 preFromUFirstCP = UConverterConstants.U_SENTINEL;
\r
3474 /* mark preFromU for replay */
\r
3475 preFromULength = (byte) -preFromULength;
\r
3477 /* set the error code for unassigned */
\r
3478 // TODO: figure out what the unmappable length really should be
\r
3479 cr = CoderResult.unmappableForLength(1);
\r
3486 * pointer to extension data; if NULL, returns 0
\r
3488 * the first code point before all the other UChars
\r
3490 * UChars that must match; !initialMatch: partial match with them
\r
3491 * @param preLength
\r
3492 * length of pre, >=0
\r
3494 * UChars that can be used to complete a match
\r
3495 * @param srcLength
\r
3496 * length of src, >=0
\r
3497 * @param pMatchValue
\r
3498 * [out] output result value for the match from the data structure
\r
3499 * @param useFallback
\r
3500 * "use fallback" flag, usually from cnv->useFallback
\r
3502 * TRUE if the end of the input stream is reached
\r
3503 * @return >1: matched, return value=total match length (number of input units matched); 1: matched, no mapping
\r
3504 * but request for <subchar1> (only for the first code point) 0: no match <0: partial match, return
\r
3505 * value=negative total match length (partial matches are never returned for flush==TRUE) (partial
\r
3506 * matches are never returned as being longer than UCNV_EXT_MAX_UCHARS); the matchLength is 2 if only
\r
3507 * firstCP matched, and >2 if firstCP and further code units matched
\r
3509 // static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength,
\r
3510 // const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush)
\r
3511 private int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source,
\r
3512 int[] pMatchValue, boolean isUseFallback, boolean flush) {
\r
3513 ByteBuffer cx = sharedData.mbcs.extIndexes;
\r
3515 CharBuffer stage12, stage3;
\r
3516 IntBuffer stage3b;
\r
3518 CharBuffer fromUTableUChars, fromUSectionUChars;
\r
3519 IntBuffer fromUTableValues, fromUSectionValues;
\r
3521 int value, matchValue;
\r
3522 int i, j, index, length, matchLength;
\r
3526 return 0; /* no extension data, no match */
\r
3529 /* trie lookup of firstCP */
\r
3530 index = firstCP >>> 10; /* stage 1 index */
\r
3531 if (index >= cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH)) {
\r
3532 return 0; /* the first code point is outside the trie */
\r
3535 stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class);
\r
3536 stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class);
\r
3537 index = FROM_U(stage12, stage3, index, firstCP);
\r
3539 stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class);
\r
3540 value = stage3b.get(stage3b.position() + index);
\r
3545 if (TO_U_IS_PARTIAL(value)) {
\r
3546 /* partial match, enter the loop below */
\r
3547 index = FROM_U_GET_PARTIAL_INDEX(value);
\r
3550 fromUTableUChars = (CharBuffer) ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class);
\r
3551 fromUTableValues = (IntBuffer) ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class);
\r
3554 i = j = matchLength = 0;
\r
3556 /* we must not remember fallback matches when not using fallbacks */
\r
3558 /* match input units until there is a full match or the input is consumed */
\r
3560 /* go to the next section */
\r
3561 int oldpos = fromUTableUChars.position();
\r
3562 fromUSectionUChars = ((CharBuffer) fromUTableUChars.position(index)).slice();
\r
3563 fromUTableUChars.position(oldpos);
\r
3564 oldpos = fromUTableValues.position();
\r
3565 fromUSectionValues = ((IntBuffer) fromUTableValues.position(index)).slice();
\r
3566 fromUTableValues.position(oldpos);
\r
3568 /* read first pair of the section */
\r
3569 length = fromUSectionUChars.get();
\r
3570 value = fromUSectionValues.get();
\r
3571 if (value != 0 && (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP))) {
\r
3572 /* remember longest match so far */
\r
3573 matchValue = value;
\r
3574 matchLength = 2 + i + j;
\r
3577 /* match pre[] then src[] */
\r
3578 if (i < preLength) {
\r
3579 c = preArray[preArrayBegin + i++];
\r
3580 } else if (source != null && j < source.remaining()) {
\r
3581 c = source.get(source.position() + j++);
\r
3583 /* all input consumed, partial match */
\r
3584 if (flush || (length = (i + j)) > MAX_UCHARS) {
\r
3586 * end of the entire input stream, stop with the longest match so far; or: partial match must
\r
3587 * not be longer than UCNV_EXT_MAX_UCHARS because it must fit into state buffers
\r
3591 /* continue with more input next time */
\r
3592 return -(2 + length);
\r
3596 /* search for the current UChar */
\r
3597 index = findFromU(fromUSectionUChars, length, c);
\r
3599 /* no match here, stop with the longest match so far */
\r
3602 value = fromUSectionValues.get(fromUSectionValues.position() + index);
\r
3603 if (FROM_U_IS_PARTIAL(value)) {
\r
3604 /* partial match, continue */
\r
3605 index = FROM_U_GET_PARTIAL_INDEX(value);
\r
3607 if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
\r
3608 /* full match, stop with result */
\r
3609 matchValue = value;
\r
3610 matchLength = 2 + i + j;
\r
3612 /* full match on fallback not taken, stop with the longest match so far */
\r
3619 if (matchLength == 0) {
\r
3620 /* no match at all */
\r
3623 } else /* result from firstCP trie lookup */{
\r
3624 if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
\r
3625 /* full match, stop with result */
\r
3626 matchValue = value;
\r
3629 /* fallback not taken */
\r
3634 if ((matchValue & FROM_U_RESERVED_MASK) != 0) {
\r
3635 /* do not interpret values with reserved bits used, for forward compatibility */
\r
3639 /* return result */
\r
3640 if (matchValue == FROM_U_SUBCHAR1) {
\r
3641 return 1; /* assert matchLength==2 */
\r
3644 pMatchValue[0] = FROM_U_MASK_ROUNDTRIP(matchValue);
\r
3645 return matchLength;
\r
3648 private int simpleMatchFromU(int cp, int[] pValue, boolean isUseFallback) {
\r
3649 int[] value = new int[1];
\r
3650 int match; // signed
\r
3652 /* try to match */
\r
3653 match = matchFromU(cp, null, 0, 0, null, value, isUseFallback, true);
\r
3655 /* write result for simple, single-character conversion */
\r
3657 boolean isRoundtrip;
\r
3659 isRoundtrip = FROM_U_IS_ROUNDTRIP(value[0]);
\r
3660 length = FROM_U_GET_LENGTH(value[0]);
\r
3661 value[0] = FROM_U_GET_DATA(value[0]);
\r
3663 if (length <= EXT_FROM_U_MAX_DIRECT_LENGTH) {
\r
3664 pValue[0] = value[0];
\r
3665 return isRoundtrip ? length : -length;
\r
3666 // #if 0 /* not currently used */
\r
3667 // } else if(length==4) {
\r
3668 // /* de-serialize a 4-byte result */
\r
3669 // const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
\r
3671 // ((uint32_t)result[0]<<24)|
\r
3672 // ((uint32_t)result[1]<<16)|
\r
3673 // ((uint32_t)result[2]<<8)|
\r
3675 // return isRoundtrip ? 4 : -4;
\r
3681 * return no match because - match>1 && resultLength>4: result too long for simple conversion - match==1: no
\r
3682 * match found, <subchar1> preferred - match==0: no match found in the first place - match<0: partial
\r
3683 * match, not supported for simple conversion (and flush==TRUE)
\r
3688 private CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) {
\r
3689 ByteBuffer cx = sharedData.mbcs.extIndexes;
\r
3691 byte bufferArray[] = new byte[1 + MAX_BYTES];
\r
3692 int bufferArrayIndex = 0;
\r
3693 byte[] resultArray;
\r
3694 int resultArrayIndex;
\r
3695 int length, prevLength;
\r
3697 length = FROM_U_GET_LENGTH(value);
\r
3698 value = FROM_U_GET_DATA(value);
\r
3700 /* output the result */
\r
3701 if (length <= FROM_U_MAX_DIRECT_LENGTH) {
\r
3703 * Generate a byte array and then write it below. This is not the fastest possible way, but it should be
\r
3704 * ok for extension mappings, and it is much simpler. Offset and overflow handling are only done once
\r
3707 int p = bufferArrayIndex + 1; /* reserve buffer[0] for shiftByte below */
\r
3710 bufferArray[p++] = (byte) (value >>> 16);
\r
3712 bufferArray[p++] = (byte) (value >>> 8);
\r
3714 bufferArray[p++] = (byte) value;
\r
3716 break; /* will never occur */
\r
3718 resultArray = bufferArray;
\r
3719 resultArrayIndex = bufferArrayIndex + 1;
\r
3721 byte[] slice = new byte[length];
\r
3723 ByteBuffer bb = ((ByteBuffer) ARRAY(cx, EXT_FROM_U_BYTES_INDEX, byte.class));
\r
3724 bb.position(value);
\r
3725 bb.get(slice, 0, slice.length);
\r
3727 resultArray = slice;
\r
3728 resultArrayIndex = 0;
\r
3731 /* with correct data we have length>0 */
\r
3733 if ((prevLength = (int) fromUnicodeStatus) != 0) {
\r
3734 /* handle SI/SO stateful output */
\r
3737 if (prevLength > 1 && length == 1) {
\r
3738 /* change from double-byte mode to single-byte */
\r
3739 shiftByte = (byte) UConverterConstants.SI;
\r
3740 fromUnicodeStatus = 1;
\r
3741 } else if (prevLength == 1 && length > 1) {
\r
3742 /* change from single-byte mode to double-byte */
\r
3743 shiftByte = (byte) UConverterConstants.SO;
\r
3744 fromUnicodeStatus = 2;
\r
3749 if (shiftByte != 0) {
\r
3750 /* prepend the shift byte to the result bytes */
\r
3751 bufferArray[0] = shiftByte;
\r
3752 if (resultArray != bufferArray || resultArrayIndex != bufferArrayIndex + 1) {
\r
3753 System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex + 1, length);
\r
3755 resultArray = bufferArray;
\r
3756 resultArrayIndex = bufferArrayIndex;
\r
3761 return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex);
\r
3765 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 else return 0 after output has been written
\r
3768 private int fromU(int cp_, CharBuffer source, ByteBuffer target, IntBuffer offsets, int sourceIndex,
\r
3769 int length, boolean flush, CoderResult[] cr) {
\r
3771 long cp = cp_ & UConverterConstants.UNSIGNED_INT_MASK;
\r
3773 useSubChar1 = false;
\r
3775 if (sharedData.mbcs.extIndexes != null
\r
3776 && initialMatchFromU((int) cp, source, target, offsets, sourceIndex, flush, cr)) {
\r
3777 return 0; /* an extension mapping handled the input */
\r
3781 if ((options & MBCS_OPTION_GB18030) != 0) {
\r
3785 for (i = 0; i < gb18030Ranges.length; ++i) {
\r
3786 range = gb18030Ranges[i];
\r
3787 if (range[0] <= cp && cp <= range[1]) {
\r
3788 /* found the Unicode code point, output the four-byte sequence for it */
\r
3790 byte bytes[] = new byte[4];
\r
3792 /* get the linear value of the first GB 18030 code in this range */
\r
3793 linear = range[2] - LINEAR_18030_BASE;
\r
3795 /* add the offset from the beginning of the range */
\r
3796 linear += (cp - range[0]);
\r
3798 bytes[3] = (byte) (0x30 + linear % 10);
\r
3800 bytes[2] = (byte) (0x81 + linear % 126);
\r
3802 bytes[1] = (byte) (0x30 + linear % 10);
\r
3804 bytes[0] = (byte) (0x81 + linear);
\r
3806 /* output this sequence */
\r
3807 cr[0] = fromUWriteBytes(this, bytes, 0, 4, target, offsets, sourceIndex);
\r
3814 cr[0] = CoderResult.unmappableForLength(length);
\r
3819 * target<targetLimit; set error code for overflow
\r
3821 private boolean initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets,
\r
3822 int srcIndex, boolean flush, CoderResult[] cr) {
\r
3823 int[] value = new int[1];
\r
3826 /* try to match */
\r
3827 match = matchFromU(cp, null, 0, 0, source, value, useFallback, flush);
\r
3829 /* reject a match if the result is a single byte for DBCS-only */
\r
3831 && !(FROM_U_GET_LENGTH(value[0]) == 1 && sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY)) {
\r
3832 /* advance src pointer for the consumed input */
\r
3833 source.position(source.position() + match - 2); /* remove 2 for the initial code point */
\r
3835 /* write result to target */
\r
3836 cr[0] = writeFromU(value[0], target, offsets, srcIndex);
\r
3838 } else if (match < 0) {
\r
3839 /* save state for partial match */
\r
3843 /* copy the first code point */
\r
3844 preFromUFirstCP = cp;
\r
3846 /* now copy the newly consumed input */
\r
3847 sArrayIndex = source.position();
\r
3848 match = -match - 2; /* remove 2 for the initial code point */
\r
3849 for (j = 0; j < match; ++j) {
\r
3850 preFromUArray[j] = source.get(sArrayIndex++);
\r
3852 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */
\r
3853 preFromULength = (byte) match;
\r
3855 } else if (match == 1) {
\r
3856 /* matched, no mapping but request for <subchar1> */
\r
3857 useSubChar1 = true;
\r
3859 } else /* match==0 no match */{
\r
3864 CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
3865 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
3872 SideEffects x = new SideEffects(0, 0, 0, 0, 0, 0);
\r
3874 int targetCapacity = target.limit() - target.position();
\r
3876 int stage2Entry = 0;
\r
3877 //int asciiRoundtrips;
\r
3882 boolean doLoop = true;
\r
3883 boolean gotoGetTrail = false;
\r
3885 if (preFromUFirstCP >= 0) {
\r
3887 * pass sourceIndex=-1 because we continue from an earlier buffer
\r
3888 * in the future, this may change with continuous offsets.
\r
3890 cr[0] = continueMatchFromU(source, target, offsets, flush, -1);
\r
3891 if (cr[0].isError() || preFromULength < 0) {
\r
3896 /* use optimized function if possible */
\r
3897 outputType = sharedData.mbcs.outputType;
\r
3898 uniMask = sharedData.mbcs.unicodeMask;
\r
3899 if (outputType == MBCS_OUTPUT_1 && ((uniMask&UConverterConstants.HAS_SURROGATES) == 0)) {
\r
3900 if ((uniMask&UConverterConstants.HAS_SURROGATES) == 0) {
\r
3901 cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
\r
3903 cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
\r
3906 }/* else if (outputType == MBCS_OUTPUT_2 && mbcs.sharedData.mbcs.utf8Friendly) {
\r
3907 cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
\r
3911 table = sharedData.mbcs.fromUnicodeTable;
\r
3912 /* if (mbcs.sharedData.mbcs.utf8Friendly) {
\r
3913 mbcsIndex = mbcs.sharedData.mbcs.mbcsIndex;
\r
3918 if ((options&UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
3919 bytes = ByteBuffer.wrap(sharedData.mbcs.swapLFNLFromUnicodeBytes);
\r
3921 bytes = ByteBuffer.wrap(sharedData.mbcs.fromUnicodeBytes);
\r
3923 //asciiRoundtrips = mbcs.sharedData.mbcs.asciiRoundtrips;
\r
3925 /* get the converter state from UConverter */
\r
3926 x.c = fromUChar32;
\r
3927 if (outputType == MBCS_OUTPUT_2_SISO) {
\r
3928 x.prevLength = fromUnicodeStatus;
\r
3929 if (x.prevLength == 0) {
\r
3930 /* set the real value */
\r
3934 /* prevent fromUnicodeStatus from being set to something non-0 */
\r
3938 /* sourceIndex = -1 if the current character began in the previous buffer */
\r
3939 x.prevSourceIndex = -1;
\r
3940 x.sourceIndex = x.c==0 ? 0 : -1;
\r
3941 x.nextSourceIndex = 0;
\r
3943 /* conversion loop */
\r
3944 if (x.c != 0 && targetCapacity > 0) {
\r
3945 gotoGetTrail = true; // set gotoGetTrail flag and go to gotoGetTrail label
\r
3948 while (gotoGetTrail || source.hasRemaining()) {
\r
3950 * The following test checks whether the available input would overflow the output.
\r
3951 * It does not catch output of more than one byte that
\r
3952 * overflows as a result of a multi-byte character or callback output
\r
3953 * from the last source character.
\r
3954 * Therefore, those situations also test for overflows and will
\r
3955 * then break the loop, too.
\r
3957 if (gotoGetTrail || targetCapacity > 0) {
\r
3959 * Get a correct Unicode code point:
\r
3960 * a single UChar for a BMP code point or
\r
3961 * a matched surrogate pair for a "supplementary code point."
\r
3963 if (!gotoGetTrail) {
\r
3964 x.c = source.get();
\r
3965 ++x.nextSourceIndex;
\r
3966 /* This is commented out because of the fact that IS_ASCII_ROUNDTRIP is not
\r
3967 * being used in ICU4J.
\r
3969 /*if (x.c <= 0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
\r
3970 target.put((byte)x.c);
\r
3971 if (offsets != null) {
\r
3972 offsets.put(x.sourceIndex);
\r
3973 x.prevSourceIndex = x.sourceIndex;
\r
3974 x.sourceIndex = x.nextSourceIndex;
\r
3981 /* Code to use utf8friendly code was removed since it is not needed in Java. */
\r
3982 /* This also tests if the codepage maps single surrogates.
\r
3983 * If it does, then surrogates are not paired but mapped separately.
\r
3984 * Note that in this case unmatched surrogates are not detected.
\r
3986 if (gotoGetTrail || (UTF16.isSurrogate((char)x.c) && (uniMask&UConverterConstants.HAS_SURROGATES) == 0)) {
\r
3987 if (gotoGetTrail || (UTF16.isLeadSurrogate((char)x.c))) {
\r
3989 gotoGetTrail = false; // reset gotoGetTrail flag
\r
3991 x.sourceArrayIndex = source.position();
\r
3993 doLoop = getTrail(source, target, uniMask, x, flush, cr);
\r
3994 if (x.doread && doLoop) {
\r
3996 } else if (!x.doread && !doLoop) {
\r
3998 } else if (!doLoop) {
\r
4002 /* this is an unmatched trail code unit (2nd surrogate) */
\r
4003 /* callback(illegal) */
\r
4004 cr[0] = CoderResult.malformedForLength(1);
\r
4009 /* convert the Unicode point in c into codepage bytes */
\r
4011 * The basic lookup is a triple-stage compact array (trie) lookup.
\r
4013 * Single-byte codepages are handled with a different data structure
\r
4014 * by _MBCSSingle... functions.
\r
4016 * The result consists of a 32-bit value from stage 2 and
\r
4017 * a pointer to as many bytes as are stored per character.
\r
4018 * The pointer points to the character's bytes in stage 3.
\r
4019 * Bits 15..0 of the stage 2 entry contain the stage 3 index
\r
4020 * for that pointer, while bits 31..16 are flags for which of
\r
4021 * the 16 characters in the block are roundtrip-assigned.
\r
4023 * For 2-byte and 4 byte codepages, the bytes are stored as uint16_t
\r
4024 * respectively as uint32_t, in the platform encoding.
\r
4025 * For 3-byte codepages, the bytes are always stored in big-endian order.
\r
4027 * For EUC encodings that use only either 0x8e or 0x8f as the first
\r
4028 * byte of their longest byte sequences, the first two bytes in
\r
4029 * this third stage indicate with their 7th bits whether these bytes
\r
4030 * are to be written directly or actually need to be preceded by
\r
4031 * one of the two Single-Shift codes. With this, the third stage
\r
4032 * stores one byte fewer per character than the actual maximum length of
\r
4033 * EUC byte sequences.
\r
4035 * Other than that, leading zero bytes are removed and the other
\r
4036 * bytes output. A single zero byte may be output if the "assigned"
\r
4037 * bit in stage 2 was on.
\r
4038 * The data structure does not support zero byte output as a fallback,
\r
4039 * and also does not allow output of leading zeros.
\r
4041 stage2Entry = MBCS_STAGE_2_FROM_U(table, x.c);
\r
4043 /* get the bytes and the length for the output */
\r
4044 switch (outputType) {
\r
4045 case MBCS_OUTPUT_2:
\r
4046 value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4047 if (value <= 0xff) {
\r
4053 case MBCS_OUTPUT_2_SISO:
\r
4054 /* 1/2-byte stateful with Shift-In/Shift-Out */
\r
4056 * Save the old state in the converter object
\r
4057 * right here, then change the local prevLength state variable if necessary.
\r
4058 * Then, if this character turns out to be unassigned or a fallback that
\r
4059 * is not taken, the callback code must not save the new state in the converter
\r
4060 * because the new state is for a character that is not output.
\r
4061 * However, the callback must still restore the state from the converter
\r
4062 * in case the callback function changed it for its output.
\r
4064 fromUnicodeStatus = x.prevLength; /* save the old state */
\r
4065 value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4066 if (value <= 0xff) {
\r
4067 if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, x.c)) {
\r
4068 /* no mapping, leave value == 0 */
\r
4070 } else if (x.prevLength <= 1) {
\r
4073 /* change from double-byte mode to single-byte */
\r
4074 value |= UConverterConstants.UNSIGNED_INT_MASK & (UConverterConstants.SI<<8);
\r
4079 if (x.prevLength == 2) {
\r
4082 /* change from single-byte mode to double-byte */
\r
4083 value |= UConverterConstants.UNSIGNED_INT_MASK & (UConverterConstants.SO<<16);
\r
4089 case MBCS_OUTPUT_DBCS_ONLY:
\r
4090 /* table with single-byte results, but only DBCS mappings used */
\r
4091 value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4092 if (value <= 0xff) {
\r
4093 /* no mapping or SBCS result, not taken for DBCS-only */
\r
4094 value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
\r
4100 case MBCS_OUTPUT_3:
\r
4101 p = MBCS_POINTER_3_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4102 value = UConverterConstants.UNSIGNED_INT_MASK&((int)bytes.get(p)<<16 | (int)bytes.get(p+1)<<8 | bytes.get(p+2));
\r
4103 if (value <= 0xff) {
\r
4105 } else if (value <= 0xffff) {
\r
4111 case MBCS_OUTPUT_4:
\r
4112 value = MBCS_VALUE_4_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4113 if (value <= 0xff) {
\r
4115 } else if (value <= 0xffff) {
\r
4117 } else if (value <= 0xffffff) {
\r
4123 case MBCS_OUTPUT_3_EUC:
\r
4124 value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4125 /* EUC 16-bit fixed-length representation */
\r
4126 if (value <= 0xff) {
\r
4128 } else if ((value&0x8000) == 0) {
\r
4129 value |= 0x8e8000;
\r
4131 } else if ((value&0x80) == 0) {
\r
4132 value |= 0x8f0080;
\r
4138 case MBCS_OUTPUT_4_EUC:
\r
4139 p = MBCS_POINTER_3_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
\r
4140 value = UConverterConstants.UNSIGNED_INT_MASK&((int)bytes.get(p)<<16 | (int)bytes.get(p+1)<<8 | bytes.get(p+2));
\r
4141 /* EUC 16-bit fixed-length representation applied to the first two bytes */
\r
4142 if (value <= 0xff) {
\r
4144 } else if (value <= 0xffff) {
\r
4146 } else if ((value&0x800000) == 0) {
\r
4147 value |= 0x08e800000;
\r
4149 } else if ((value&0x8000) == 0) {
\r
4150 value |= 0x08f008000;
\r
4157 /* must not occur */
\r
4158 value = stage2Entry = 0;
\r
4162 /* is this code point assigned, or do we use fallbacks? */
\r
4163 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, x.c)) ||
\r
4164 (CharsetEncoderICU.isFromUUseFallback(useFallback, x.c) && value != 0)) {
\r
4166 * We allow a 0 byte output if the "assigned" bit is set for this entry.
\r
4167 * There is no way with this data structure for fallback output
\r
4168 * to be a zero byte.
\r
4170 // unassigned label
\r
4171 int currentSourcePos = source.position();
\r
4172 doLoop = unassigned(source, target, offsets, x, flush, cr);
\r
4176 if (source.position() < currentSourcePos) {
\r
4177 source.position(currentSourcePos);
\r
4183 /* write the output character bytes from value and length */
\r
4184 /* from the first if in the loop we know that targetCapacity>0 */
\r
4185 if (length <= targetCapacity) {
\r
4187 /* each branch falls through to the next one */
\r
4189 target.put((byte)(value>>24));
\r
4190 if (offsets != null) {
\r
4191 offsets.put(x.sourceIndex);
\r
4194 target.put((byte)(value>>16));
\r
4195 if (offsets != null) {
\r
4196 offsets.put(x.sourceIndex);
\r
4199 target.put((byte)(value>>8));
\r
4200 if (offsets != null) {
\r
4201 offsets.put(x.sourceIndex);
\r
4204 target.put((byte)value);
\r
4205 if (offsets != null) {
\r
4206 offsets.put(x.sourceIndex);
\r
4209 /* will never occur */
\r
4213 targetCapacity -= length;
\r
4216 * We actually do this backwards here:
\r
4217 * In order to save an intermediate variable, we output
\r
4218 * first to the overflow buffer what does not fit into the
\r
4221 /* we know that 1<=targetCapacity<length<=4 */
\r
4222 length -= targetCapacity;
\r
4223 int i = 0; // index for errorBuffer
\r
4225 /* each branch falls through to the next one */
\r
4227 errorBuffer[i++] = (byte)(value>>16);
\r
4229 errorBuffer[i++] = (byte)(value>>8);
\r
4231 errorBuffer[i++] = (byte)value;
\r
4233 /* will never occur */
\r
4236 errorBufferLength = length;
\r
4238 /* now output what fits into the regular target */
\r
4239 value>>=8*length; /* length was reduced by targetCapacity */
\r
4240 switch (targetCapacity) {
\r
4241 /* each branch falls through to the next one */
\r
4243 target.put((byte)(value>>16));
\r
4244 if (offsets != null) {
\r
4245 offsets.put(x.sourceIndex);
\r
4248 target.put((byte)(value>>8));
\r
4249 if (offsets != null) {
\r
4250 offsets.put(x.sourceIndex);
\r
4253 target.put((byte)value);
\r
4254 if (offsets != null) {
\r
4255 offsets.put(x.sourceIndex);
\r
4258 /* will never occur */
\r
4262 /* target overflow */
\r
4263 targetCapacity = 0;
\r
4264 cr[0] = CoderResult.OVERFLOW;
\r
4269 /* normal end of conversion: prepare for a new character */
\r
4271 if (offsets != null) {
\r
4272 x.prevSourceIndex = x.sourceIndex;
\r
4273 x.sourceIndex = x.nextSourceIndex;
\r
4277 /* target is full */
\r
4278 cr[0] = CoderResult.OVERFLOW;
\r
4284 * the end of the input stream and detection of truncated input
\r
4285 * are handled by the framework, but for EBCDIC_STATEFUL conversion
\r
4286 * we need to emit an SI at the very end
\r
4290 * EBCDIC_STATEFUL in DBCS mode
\r
4291 * end of input and no truncated input
\r
4293 if (!cr[0].isError() && outputType == MBCS_OUTPUT_2_SISO && x.prevLength == 2 && flush && !source.hasRemaining() && x.c == 0) {
\r
4294 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
\r
4295 if (targetCapacity > 0) {
\r
4296 target.put((byte)UConverterConstants.SI);
\r
4297 if (offsets != null) {
\r
4298 /* set the last source character's index (sourceIndex points at sourceLimit now) */
\r
4299 offsets.put(x.prevSourceIndex);
\r
4302 /* target is full */
\r
4303 errorBuffer[0] = UConverterConstants.SI;
\r
4304 errorBufferLength = 1;
\r
4305 cr[0] = CoderResult.OVERFLOW;
\r
4307 x.prevLength = 1; /* we switched into SBCS */
\r
4309 /* set the converter state back into UConverter */
\r
4310 fromUChar32 = x.c;
\r
4311 fromUnicodeStatus = x.prevLength;
\r
4317 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages that map only to and from the
\r
4318 * BMP. In addition to single-byte/state optimizations, the offset calculations become much easier.
\r
4320 private CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets,
\r
4323 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
4325 int sourceArrayIndex, lastSource;
\r
4326 int targetCapacity, length;
\r
4330 int c, sourceIndex;
\r
4331 char value, minValue;
\r
4333 /* set up the local pointers */
\r
4334 sourceArrayIndex = source.position();
\r
4335 targetCapacity = target.remaining();
\r
4336 table = sharedData.mbcs.fromUnicodeTable;
\r
4338 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
4339 results = sharedData.mbcs.swapLFNLFromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes
\r
4340 // be a ByteBuffer so results can be a 16-bit view
\r
4343 results = sharedData.mbcs.fromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes be a
\r
4344 // ByteBuffer so results can be a 16-bit view of it?
\r
4347 if (useFallback) {
\r
4348 /* use all roundtrip and fallback results */
\r
4351 /* use only roundtrips and fallbacks from private-use characters */
\r
4355 /* get the converter state from UConverter */
\r
4358 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
4359 sourceIndex = c == 0 ? 0 : -1;
\r
4360 lastSource = sourceArrayIndex;
\r
4363 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
\r
4364 * sourceLength and targetCapacity
\r
4366 length = source.limit() - sourceArrayIndex;
\r
4367 if (length < targetCapacity) {
\r
4368 targetCapacity = length;
\r
4371 boolean doloop = true;
\r
4372 if (c != 0 && targetCapacity > 0) {
\r
4373 SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
\r
4374 doloop = getTrailSingleBMP(source, x, cr);
\r
4376 sourceArrayIndex = x.sourceArrayIndex;
\r
4380 while (targetCapacity > 0) {
\r
4382 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate pair
\r
4383 * for a "supplementary code point".
\r
4385 c = source.get(sourceArrayIndex++);
\r
4387 * Do not immediately check for single surrogates: Assume that they are unassigned and check for
\r
4388 * them in that case. This speeds up the conversion of assigned characters.
\r
4390 /* convert the Unicode code point in c into codepage bytes */
\r
4391 value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);
\r
4393 /* is this code point assigned, or do we use fallbacks? */
\r
4394 if (value >= minValue) {
\r
4395 /* assigned, write the output character bytes from value and length */
\r
4397 /* this is easy because we know that there is enough space */
\r
4398 target.put((byte) value);
\r
4401 /* normal end of conversion: prepare for a new character */
\r
4404 } else if (!UTF16.isSurrogate((char) c)) {
\r
4405 /* normal, unassigned BMP character */
\r
4406 } else if (UTF16.isLeadSurrogate((char) c)) {
\r
4408 SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
\r
4409 doloop = getTrailSingleBMP(source, x, cr);
\r
4411 sourceArrayIndex = x.sourceArrayIndex;
\r
4415 /* this is an unmatched trail code unit (2nd surrogate) */
\r
4416 /* callback(illegal) */
\r
4417 cr[0] = CoderResult.malformedForLength(1);
\r
4421 /* c does not have a mapping */
\r
4423 /* get the number of code units for c to correctly advance sourceIndex */
\r
4424 length = UTF16.getCharCount(c);
\r
4426 /* set offsets since the start or the last extension */
\r
4427 if (offsets != null) {
\r
4428 int count = sourceArrayIndex - lastSource;
\r
4430 /* do not set the offset for this character */
\r
4433 while (count > 0) {
\r
4434 offsets.put(sourceIndex++);
\r
4437 /* offsets and sourceIndex are now set for the current character */
\r
4440 /* try an extension mapping */
\r
4441 lastSource = sourceArrayIndex;
\r
4442 source.position(sourceArrayIndex);
\r
4443 c = fromU(c, source, target, offsets, sourceIndex, length, flush, cr);
\r
4444 sourceArrayIndex = source.position();
\r
4445 sourceIndex += length + (sourceArrayIndex - lastSource);
\r
4446 lastSource = sourceArrayIndex;
\r
4448 if (cr[0].isError()) {
\r
4449 /* not mappable or buffer overflow */
\r
4452 /* a mapping was written to the target, continue */
\r
4454 /* recalculate the targetCapacity after an extension mapping */
\r
4455 targetCapacity = target.remaining();
\r
4456 length = source.limit() - sourceArrayIndex;
\r
4457 if (length < targetCapacity) {
\r
4458 targetCapacity = length;
\r
4464 if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
\r
4465 /* target is full */
\r
4466 cr[0] = CoderResult.OVERFLOW;
\r
4469 /* set offsets since the start or the last callback */
\r
4470 if (offsets != null) {
\r
4471 int count = sourceArrayIndex - lastSource;
\r
4472 while (count > 0) {
\r
4473 offsets.put(sourceIndex++);
\r
4478 /* set the converter state back into UConverter */
\r
4481 /* write back the updated pointers */
\r
4482 source.position(sourceArrayIndex);
\r
4487 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */
\r
4488 private CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target,
\r
4489 IntBuffer offsets, boolean flush) {
\r
4491 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
4493 int sourceArrayIndex;
\r
4496 byte[] results; // agljport:comment results is used to to get 16-bit values out of byte[] array
\r
4499 int sourceIndex, nextSourceIndex;
\r
4501 char value, minValue;
\r
4503 /* set up the local pointers */
\r
4505 sourceArrayIndex = source.position();
\r
4507 table = sharedData.mbcs.fromUnicodeTable;
\r
4509 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
4510 results = sharedData.mbcs.swapLFNLFromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes
\r
4511 // be a ByteBuffer so results can be a 16-bit view
\r
4514 results = sharedData.mbcs.fromUnicodeBytes; // agljport:comment should swapLFNLFromUnicodeBytes be a
\r
4515 // ByteBuffer so results can be a 16-bit view of it?
\r
4518 if (useFallback) {
\r
4519 /* use all roundtrip and fallback results */
\r
4522 /* use only roundtrips and fallbacks from private-use characters */
\r
4525 // agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation
\r
4526 uniMask = sharedData.mbcs.unicodeMask;
\r
4528 /* get the converter state from UConverter */
\r
4531 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
4532 sourceIndex = c == 0 ? 0 : -1;
\r
4533 nextSourceIndex = 0;
\r
4535 boolean doloop = true;
\r
4536 boolean doread = true;
\r
4537 if (c != 0 && target.hasRemaining()) {
\r
4538 if (UTF16.isLeadSurrogate((char) c)) {
\r
4539 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
\r
4540 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
\r
4541 doread = x.doread;
\r
4543 sourceArrayIndex = x.sourceArrayIndex;
\r
4544 sourceIndex = x.sourceIndex;
\r
4545 nextSourceIndex = x.nextSourceIndex;
\r
4552 while (!doread || sourceArrayIndex < source.limit()) {
\r
4554 * The following test checks whether the available input would overflow the output. It does not catch
\r
4555 * output of more than one byte that overflows as a result of a multi-byte character or callback
\r
4556 * output from the last source character. Therefore, those situations also test for overflows and
\r
4557 * will then break the loop, too.
\r
4559 if (target.hasRemaining()) {
\r
4561 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate
\r
4562 * pair for a "supplementary code point".
\r
4566 c = source.get(sourceArrayIndex++);
\r
4567 ++nextSourceIndex;
\r
4568 if (UTF16.isSurrogate((char) c)) {
\r
4569 if (UTF16.isLeadSurrogate((char) c)) {
\r
4571 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
\r
4573 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
\r
4575 sourceArrayIndex = x.sourceArrayIndex;
\r
4576 sourceIndex = x.sourceIndex;
\r
4577 nextSourceIndex = x.nextSourceIndex;
\r
4585 /* this is an unmatched trail code unit (2nd surrogate) */
\r
4586 /* callback(illegal) */
\r
4587 cr[0] = CoderResult.malformedForLength(1);
\r
4595 /* convert the Unicode code point in c into codepage bytes */
\r
4596 value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);
\r
4598 /* is this code point assigned, or do we use fallbacks? */
\r
4599 if (value >= minValue) {
\r
4600 /* assigned, write the output character bytes from value and length */
\r
4602 /* this is easy because we know that there is enough space */
\r
4603 target.put((byte) value);
\r
4604 if (offsets != null) {
\r
4605 offsets.put(sourceIndex);
\r
4608 /* normal end of conversion: prepare for a new character */
\r
4610 sourceIndex = nextSourceIndex;
\r
4611 } else { /* unassigned */
\r
4612 /* try an extension mapping */
\r
4613 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
\r
4615 doloop = unassignedDouble(source, target, x, flush, cr);
\r
4617 sourceArrayIndex = x.sourceArrayIndex;
\r
4618 sourceIndex = x.sourceIndex;
\r
4619 nextSourceIndex = x.nextSourceIndex;
\r
4624 /* target is full */
\r
4625 cr[0] = CoderResult.OVERFLOW;
\r
4631 /* set the converter state back into UConverter */
\r
4634 /* write back the updated pointers */
\r
4635 source.position(sourceArrayIndex);
\r
4640 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
\r
4641 private CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target,
\r
4642 IntBuffer offsets, boolean flush) {
\r
4643 CoderResult[] cr = { CoderResult.UNDERFLOW };
\r
4645 int sourceArrayIndex;
\r
4650 int c, sourceIndex, nextSourceIndex;
\r
4657 /* use optimized function if possible */
\r
4658 uniMask = sharedData.mbcs.unicodeMask;
\r
4660 /* set up the local pointers */
\r
4661 sourceArrayIndex = source.position();
\r
4663 table = sharedData.mbcs.fromUnicodeTable;
\r
4665 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
\r
4666 bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes;
\r
4668 bytes = sharedData.mbcs.fromUnicodeBytes;
\r
4671 /* get the converter state from UConverter */
\r
4674 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
4675 sourceIndex = c == 0 ? 0 : -1;
\r
4676 nextSourceIndex = 0;
\r
4678 /* conversion loop */
\r
4679 boolean doloop = true;
\r
4680 boolean doread = true;
\r
4681 if (c != 0 && target.hasRemaining()) {
\r
4682 if (UTF16.isLeadSurrogate((char) c)) {
\r
4683 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
\r
4684 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
\r
4685 doread = x.doread;
\r
4687 sourceArrayIndex = x.sourceArrayIndex;
\r
4688 sourceIndex = x.sourceIndex;
\r
4689 nextSourceIndex = x.nextSourceIndex;
\r
4696 while (!doread || sourceArrayIndex < source.limit()) {
\r
4698 * The following test checks whether the available input would overflow the output. It does not catch
\r
4699 * output of more than one byte that overflows as a result of a multi-byte character or callback
\r
4700 * output from the last source character. Therefore, those situations also test for overflows and
\r
4701 * will then break the loop, too.
\r
4703 if (target.hasRemaining()) {
\r
4706 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched
\r
4707 * surrogate pair for a "supplementary code point".
\r
4709 c = source.get(sourceArrayIndex++);
\r
4710 ++nextSourceIndex;
\r
4712 * This also tests if the codepage maps single surrogates. If it does, then surrogates are
\r
4713 * not paired but mapped separately. Note that in this case unmatched surrogates are not
\r
4716 if (UTF16.isSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
\r
4717 if (UTF16.isLeadSurrogate((char) c)) {
\r
4719 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
\r
4721 doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
\r
4723 sourceArrayIndex = x.sourceArrayIndex;
\r
4724 sourceIndex = x.sourceIndex;
\r
4725 nextSourceIndex = x.nextSourceIndex;
\r
4734 /* this is an unmatched trail code unit (2nd surrogate) */
\r
4735 /* callback(illegal) */
\r
4736 cr[0] = CoderResult.malformedForLength(1);
\r
4744 /* convert the Unicode code point in c into codepage bytes */
\r
4745 stage2Entry = MBCS_STAGE_2_FROM_U(table, c);
\r
4747 /* get the bytes and the length for the output */
\r
4748 /* MBCS_OUTPUT_2 */
\r
4749 value = MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
\r
4750 if ((value & UConverterConstants.UNSIGNED_INT_MASK) <= 0xff) {
\r
4756 /* is this code point assigned, or do we use fallbacks? */
\r
4757 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0))) {
\r
4759 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way
\r
4760 * with this data structure for fallback output to be a zero byte.
\r
4764 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
\r
4767 doloop = unassignedDouble(source, target, x, flush, cr);
\r
4769 sourceArrayIndex = x.sourceArrayIndex;
\r
4770 sourceIndex = x.sourceIndex;
\r
4771 nextSourceIndex = x.nextSourceIndex;
\r
4778 /* write the output character bytes from value and length */
\r
4779 /* from the first if in the loop we know that targetCapacity>0 */
\r
4780 if (length == 1) {
\r
4781 /* this is easy because we know that there is enough space */
\r
4782 target.put((byte) value);
\r
4783 if (offsets != null) {
\r
4784 offsets.put(sourceIndex);
\r
4786 } else /* length==2 */{
\r
4787 target.put((byte) (value >>> 8));
\r
4788 if (2 <= target.remaining()) {
\r
4789 target.put((byte) value);
\r
4790 if (offsets != null) {
\r
4791 offsets.put(sourceIndex);
\r
4792 offsets.put(sourceIndex);
\r
4795 if (offsets != null) {
\r
4796 offsets.put(sourceIndex);
\r
4798 errorBuffer[0] = (byte) value;
\r
4799 errorBufferLength = 1;
\r
4801 /* target overflow */
\r
4802 cr[0] = CoderResult.OVERFLOW;
\r
4808 /* normal end of conversion: prepare for a new character */
\r
4810 sourceIndex = nextSourceIndex;
\r
4813 /* target is full */
\r
4814 cr[0] = CoderResult.OVERFLOW;
\r
4820 /* set the converter state back into UConverter */
\r
4823 /* write back the updated pointers */
\r
4824 source.position(sourceArrayIndex);
\r
4829 private final class SideEffectsSingleBMP {
\r
4830 int c, sourceArrayIndex;
\r
4832 SideEffectsSingleBMP(int c_, int sourceArrayIndex_) {
\r
4834 sourceArrayIndex = sourceArrayIndex_;
\r
4838 // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets
\r
4839 // assumes input c is lead surrogate
\r
4840 private final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr) {
\r
4841 if (x.sourceArrayIndex < source.limit()) {
\r
4842 /* test the following code unit */
\r
4843 char trail = source.get(x.sourceArrayIndex);
\r
4844 if (UTF16.isTrailSurrogate(trail)) {
\r
4845 ++x.sourceArrayIndex;
\r
4846 x.c = UCharacter.getCodePoint((char) x.c, trail);
\r
4847 /* this codepage does not map supplementary code points */
\r
4848 /* callback(unassigned) */
\r
4849 cr[0] = CoderResult.unmappableForLength(2);
\r
4852 /* this is an unmatched lead code unit (1st surrogate) */
\r
4853 /* callback(illegal) */
\r
4854 cr[0] = CoderResult.malformedForLength(1);
\r
4858 /* no more input */
\r
4864 private final class SideEffects {
\r
4865 int c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength;
\r
4866 boolean doread = true;
\r
4868 SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_,
\r
4869 int prevLength_) {
\r
4871 sourceArrayIndex = sourceArrayIndex_;
\r
4872 sourceIndex = sourceIndex_;
\r
4873 nextSourceIndex = nextSourceIndex_;
\r
4874 prevSourceIndex = prevSourceIndex_;
\r
4875 prevLength = prevLength_;
\r
4879 // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets
\r
4880 // assumes input c is lead surrogate
\r
4881 private final boolean getTrail(CharBuffer source, ByteBuffer target, int uniMask, SideEffects x,
\r
4882 boolean flush, CoderResult[] cr) {
\r
4883 if (x.sourceArrayIndex < source.limit()) {
\r
4884 /* test the following code unit */
\r
4885 char trail = source.get(x.sourceArrayIndex);
\r
4886 if (UTF16.isTrailSurrogate(trail)) {
\r
4887 ++x.sourceArrayIndex;
\r
4888 ++x.nextSourceIndex;
\r
4889 /* convert this supplementary code point */
\r
4890 x.c = UCharacter.getCodePoint((char) x.c, trail);
\r
4891 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
\r
4892 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
\r
4893 fromUnicodeStatus = x.prevLength; /* save the old state */
\r
4894 /* callback(unassigned) */
\r
4896 return unassigned(source, target, null, x, flush, cr);
\r
4902 /* this is an unmatched lead code unit (1st surrogate) */
\r
4903 /* callback(illegal) */
\r
4904 cr[0] = CoderResult.malformedForLength(1);
\r
4908 /* no more input */
\r
4913 // function made out of block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets
\r
4914 private final boolean unassigned(CharBuffer source, ByteBuffer target, IntBuffer offsets, SideEffects x,
\r
4915 boolean flush, CoderResult[] cr) {
\r
4916 /* try an extension mapping */
\r
4917 int sourceBegin = x.sourceArrayIndex;
\r
4918 source.position(x.sourceArrayIndex);
\r
4919 x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr);
\r
4920 x.sourceArrayIndex = source.position();
\r
4921 x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
\r
4922 x.prevLength = (int) fromUnicodeStatus;
\r
4924 if (cr[0].isError()) {
\r
4925 /* not mappable or buffer overflow */
\r
4928 /* a mapping was written to the target, continue */
\r
4930 /* recalculate the targetCapacity after an extension mapping */
\r
4931 // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex;
\r
4932 /* normal end of conversion: prepare for a new character */
\r
4933 if (offsets != null) {
\r
4934 x.prevSourceIndex = x.sourceIndex;
\r
4935 x.sourceIndex = x.nextSourceIndex;
\r
4941 private final class SideEffectsDouble {
\r
4942 int c, sourceArrayIndex, sourceIndex, nextSourceIndex;
\r
4943 boolean doread = true;
\r
4945 SideEffectsDouble(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_) {
\r
4947 sourceArrayIndex = sourceArrayIndex_;
\r
4948 sourceIndex = sourceIndex_;
\r
4949 nextSourceIndex = nextSourceIndex_;
\r
4953 // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets
\r
4954 // assumes input c is lead surrogate
\r
4955 private final boolean getTrailDouble(CharBuffer source, ByteBuffer target, int uniMask,
\r
4956 SideEffectsDouble x, boolean flush, CoderResult[] cr) {
\r
4957 if (x.sourceArrayIndex < source.limit()) {
\r
4958 /* test the following code unit */
\r
4959 char trail = source.get(x.sourceArrayIndex);
\r
4960 if (UTF16.isTrailSurrogate(trail)) {
\r
4961 ++x.sourceArrayIndex;
\r
4962 ++x.nextSourceIndex;
\r
4963 /* convert this supplementary code point */
\r
4964 x.c = UCharacter.getCodePoint((char) x.c, trail);
\r
4965 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
\r
4966 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
\r
4967 /* callback(unassigned) */
\r
4969 return unassignedDouble(source, target, x, flush, cr);
\r
4975 /* this is an unmatched lead code unit (1st surrogate) */
\r
4976 /* callback(illegal) */
\r
4977 cr[0] = CoderResult.malformedForLength(1);
\r
4981 /* no more input */
\r
4986 // function made out of block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets
\r
4987 private final boolean unassignedDouble(CharBuffer source, ByteBuffer target, SideEffectsDouble x,
\r
4988 boolean flush, CoderResult[] cr) {
\r
4989 /* try an extension mapping */
\r
4990 int sourceBegin = x.sourceArrayIndex;
\r
4991 source.position(x.sourceArrayIndex);
\r
4992 x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr);
\r
4993 x.sourceArrayIndex = source.position();
\r
4994 x.nextSourceIndex += x.sourceArrayIndex - sourceBegin;
\r
4996 if (cr[0].isError()) {
\r
4997 /* not mappable or buffer overflow */
\r
5000 /* a mapping was written to the target, continue */
\r
5002 /* recalculate the targetCapacity after an extension mapping */
\r
5003 // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex;
\r
5004 /* normal end of conversion: prepare for a new character */
\r
5005 x.sourceIndex = x.nextSourceIndex;
\r
5011 * Overrides super class method
\r
5019 protected CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target,
\r
5020 IntBuffer offsets) {
\r
5021 CharsetMBCS cs = (CharsetMBCS) encoder.charset();
\r
5025 if (cs.subChar1 != 0
\r
5026 && (cs.sharedData.mbcs.extIndexes != null ? encoder.useSubChar1
\r
5027 : (encoder.invalidUCharBuffer[0] <= 0xff))) {
\r
5029 * select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS
\r
5032 subchar = new byte[] { cs.subChar1 };
\r
5035 /* select subChar in all other cases */
\r
5036 subchar = cs.subChar;
\r
5037 length = cs.subCharLen;
\r
5040 /* reset the selector for the next code point */
\r
5041 encoder.useSubChar1 = false;
\r
5043 if (cs.sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) {
\r
5044 byte[] buffer = new byte[4];
\r
5047 /* fromUnicodeStatus contains prevLength */
\r
5050 if (encoder.fromUnicodeStatus == 2) {
\r
5051 /* DBCS mode and SBCS sub char: change to SBCS */
\r
5052 encoder.fromUnicodeStatus = 1;
\r
5053 buffer[i++] = UConverterConstants.SI;
\r
5055 buffer[i++] = subchar[0];
\r
5058 if (encoder.fromUnicodeStatus <= 1) {
\r
5059 /* SBCS mode and DBCS sub char: change to DBCS */
\r
5060 encoder.fromUnicodeStatus = 2;
\r
5061 buffer[i++] = UConverterConstants.SO;
\r
5063 buffer[i++] = subchar[0];
\r
5064 buffer[i++] = subchar[1];
\r
5067 throw new IllegalArgumentException();
\r
5073 return CharsetEncoderICU.fromUWriteBytes(encoder, subchar, 0, length, target, offsets, source.position());
\r
5077 * Gets called whenever CharsetEncoder.replaceWith gets called. allowReplacementChanges only allows subChar and
\r
5078 * subChar1 to be modified outside construction (since replaceWith is called once during construction).
\r
5080 * @param replacement
\r
5081 * The replacement for subchar.
\r
5083 protected void implReplaceWith(byte[] replacement) {
\r
5084 if (allowReplacementChanges) {
\r
5085 CharsetMBCS cs = (CharsetMBCS) this.charset();
\r
5087 System.arraycopy(replacement, 0, cs.subChar, 0, replacement.length);
\r
5088 cs.subCharLen = (byte) replacement.length;
\r
5094 public CharsetDecoder newDecoder() {
\r
5095 return new CharsetDecoderMBCS(this);
\r
5098 public CharsetEncoder newEncoder() {
\r
5099 return new CharsetEncoderMBCS(this);
\r
5102 void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){
\r
5103 UConverterMBCSTable mbcsTable;
\r
5105 char st1,maxStage1, st2;
\r
5109 mbcsTable = data.mbcs;
\r
5110 table = mbcsTable.fromUnicodeTable;
\r
5111 if((mbcsTable.unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY)!=0){
\r
5112 maxStage1 = 0x440;
\r
5117 c=0; /* keep track of current code point while enumerating */
\r
5119 if(mbcsTable.outputType==MBCS_OUTPUT_1){
\r
5120 char stage2, stage3;
\r
5122 CharBuffer results;
\r
5123 results = ByteBuffer.wrap(mbcsTable.fromUnicodeBytes).asCharBuffer();
\r
5125 if(which==ROUNDTRIP_SET) {
\r
5126 /* use only roundtrips */
\r
5129 /* use all roundtrip and fallback results */
\r
5132 for(st1=0;st1<maxStage1;++st1){
\r
5134 if(st2>maxStage1){
\r
5136 for(st2=0; st2<64; ++st2){
\r
5137 st3 = table[stage2 + st2];
\r
5139 /*read the stage 3 block */
\r
5140 stage3 = (char)st3;
\r
5142 if(results.get(stage3++)>=minValue){
\r
5146 }while((++c&0xf) !=0);
\r
5148 c+= 16; /* empty stage 3 block */
\r
5152 c+=1024; /* empty stage 2 block */
\r
5156 int stage2,stage3;
\r
5158 int st3Multiplier;
\r
5160 boolean useFallBack;
\r
5161 bytes = mbcsTable.fromUnicodeBytes;
\r
5162 useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET);
\r
5163 switch(mbcsTable.outputType) {
\r
5164 case MBCS_OUTPUT_3:
\r
5165 case MBCS_OUTPUT_4_EUC:
\r
5166 st3Multiplier = 3;
\r
5168 case MBCS_OUTPUT_4:
\r
5175 //ByteBuffer buffer = (ByteBuffer)charTobyte(table);
\r
5177 for(st1=0;st1<maxStage1;++st1){
\r
5178 st2 = table[st1];
\r
5179 if(st2>(maxStage1>>1)){
\r
5181 for(st2=0;st2<128;++st2){
\r
5182 /*read the stage 3 block */
\r
5183 st3 = table[stage2*2 + st2]<<16;
\r
5184 st3+=table[stage2*2 + ++st2];
\r
5186 //if((st3=table[stage2+st2])!=0){
\r
5187 stage3 = st3Multiplier*16*(int)(st3&UConverterConstants.UNSIGNED_SHORT_MASK);
\r
5189 /* get the roundtrip flags for the stage 3 block */
\r
5191 st3 &= UConverterConstants.UNSIGNED_SHORT_MASK;
\r
5193 case UCNV_SET_FILTER_NONE:
\r
5198 stage3+=st3Multiplier;
\r
5199 }else if (useFallBack) {
\r
5202 switch(st3Multiplier) {
\r
5205 b|= ByteBuffer.wrap(bytes).getChar(stage3++);
\r
5209 b|= ByteBuffer.wrap(bytes).getChar(stage3++);
\r
5213 b|= ByteBuffer.wrap(bytes).getChar(stage3) | ByteBuffer.wrap(bytes).getChar(stage3+1);
\r
5223 }while((++c&0xf)!=0);
\r
5225 case UCNV_SET_FILTER_DBCS_ONLY:
\r
5226 /* Ignore single bytes results (<0x100). */
\r
5228 if(((st3&1) != 0 || useFallBack) &&
\r
5229 (UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))) >= 0x100){
\r
5234 }while((++c&0xf) != 0);
\r
5236 case UCNV_SET_FILTER_2022_CN :
\r
5237 /* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2022-CN. */
\r
5239 if(((st3&1) != 0 || useFallBack) &&
\r
5240 ((value= (UConverterConstants.UNSIGNED_BYTE_MASK & (ByteBuffer.wrap(bytes).get(stage3))))==0x81 || value==0x82) ){
\r
5245 }while((++c&0xf)!=0);
\r
5247 case UCNV_SET_FILTER_SJIS:
\r
5248 /* only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
\r
5251 if(((st3&1) != 0 || useFallBack) && (value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))>=0x8140 && value<=0xeffc){
\r
5256 }while((++c&0xf)!=0);
\r
5258 case UCNV_SET_FILTER_GR94DBCS:
\r
5259 /* only add code points that map to ISO 2022 GR 94 DBCS codes */
\r
5261 if(((st3&1) != 0 || useFallBack) &&
\r
5262 (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))- 0xa1a1))<=(0xfefe - 0xa1a1) &&
\r
5263 (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
\r
5268 }while((++c&0xf)!=0);
\r
5270 case UCNV_SET_FILTER_HZ:
\r
5271 /*Only add code points that are suitable for HZ DBCS*/
\r
5273 if( ((st3&1) != 0 || useFallBack) &&
\r
5274 (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=(UConverterConstants.UNSIGNED_SHORT_MASK & (ByteBuffer.wrap(bytes).getChar(stage3))))-0xa1a1))<=(0xfdfe - 0xa1a1) &&
\r
5275 (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){
\r
5280 }while((++c&0xf) != 0);
\r
5286 c+=16; /* empty stage 3 block */
\r
5290 c+=1024; /*empty stage2 block */
\r
5294 extGetUnicodeSet(setFillIn, which, filter, data);
\r
5297 static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback,
\r
5298 int minLength, int c, char s[],int length,int sectionIndex){
\r
5299 CharBuffer fromUSectionUChar;
\r
5300 IntBuffer fromUSectionValues;
\r
5301 fromUSectionUChar = (CharBuffer)ARRAY(cx, EXT_FROM_U_UCHARS_INDEX,char.class );
\r
5302 fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class );
\r
5303 int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex;
\r
5304 int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex;
\r
5305 int value, i, count;
\r
5307 /* read first pair of the section */
\r
5308 count = fromUSectionUChar.get(fromUSectionUCharIndex++);
\r
5309 value = fromUSectionValues.get(fromUSectionValuesIndex++);
\r
5310 if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) {
\r
5314 String normalizedString=""; // String for composite characters
\r
5315 for(int j=0; j<length;j++){
\r
5316 normalizedString+=s[j];
\r
5318 for(int j=0;j<length;j++){
\r
5319 setFillIn.add(normalizedString);
\r
5325 for(i=0; i<count; ++i){
\r
5326 s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i);
\r
5327 value = fromUSectionValues.get(fromUSectionValuesIndex + i);
\r
5330 /* no mapping, do nothing */
\r
5331 } else if (FROM_U_IS_PARTIAL(value)) {
\r
5332 extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, UConverterConstants.U_SENTINEL, s, length+1,
\r
5333 FROM_U_GET_PARTIAL_INDEX(value));
\r
5334 } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG))
\r
5335 && FROM_U_GET_LENGTH(value)>=minLength) {
\r
5336 String normalizedString=""; // String for composite characters
\r
5337 for(int j=0; j<(length+1);j++){
\r
5338 normalizedString+=s[j];
\r
5340 setFillIn.add(normalizedString);
\r
5347 static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){
\r
5348 int st1, stage1Length, st2, st3, minLength;
\r
5351 CharBuffer stage12, stage3;
\r
5352 int value, length;
\r
5353 IntBuffer stage3b;
\r
5354 boolean useFallback;
\r
5355 char s[] = new char[MAX_UCHARS];
\r
5357 ByteBuffer cx = Data.mbcs.extIndexes;
\r
5361 stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class );
\r
5362 stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class );
\r
5363 stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class );
\r
5365 stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH);
\r
5366 useFallback =(boolean)(which==ROUNDTRIP_AND_FALLBACK_SET);
\r
5369 if(filter == UCNV_SET_FILTER_2022_CN) {
\r
5371 } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) {
\r
5372 /* DBCS-only, ignore single-byte results */
\r
5378 for(st1=0; st1< stage1Length; ++st1){
\r
5379 st2 = stage12.get(st1);
\r
5380 if(st2>stage1Length) {
\r
5382 for(st2=0;st2<64;++st2){
\r
5383 st3=((int) stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT;
\r
5387 value = stage3b.get((int)(UConverterConstants.UNSIGNED_SHORT_MASK&stage3.get(ps3++)));
\r
5389 /* no mapping do nothing */
\r
5390 }else if (FROM_U_IS_PARTIAL(value)){
\r
5392 length=UTF16.append(s, length, c);
\r
5393 extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,(int)FROM_U_GET_PARTIAL_INDEX(value));
\r
5394 } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0 :((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))== FROM_U_ROUNDTRIP_FLAG)) &&
\r
5395 FROM_U_GET_LENGTH(value)>=minLength){
\r
5398 case UCNV_SET_FILTER_2022_CN:
\r
5399 if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){
\r
5403 case UCNV_SET_FILTER_SJIS:
\r
5404 if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){
\r
5408 case UCNV_SET_FILTER_GR94DBCS:
\r
5409 if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfefe - 0xa1a1)
\r
5410 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
\r
5415 case UCNV_SET_FILTER_HZ:
\r
5416 if(!(FROM_U_GET_LENGTH(value)==2 && (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=FROM_U_GET_DATA(value)) - 0xa1a1))<=(0xfdfe - 0xa1a1)
\r
5417 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){
\r
5423 * UCNV_SET_FILTER_NONE,
\r
5424 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
\r
5431 }while((++c&0xf) != 0);
\r
5434 c+=16; /* empty stage 3 block */
\r
5438 c+=1024; /* empty stage 2 block*/
\r
5443 void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){
\r
5444 MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which,
\r
5445 this.sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? UCNV_SET_FILTER_DBCS_ONLY : UCNV_SET_FILTER_NONE );
\r
5448 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
\r
5449 if((options & MBCS_OPTION_GB18030)!=0){
\r
5450 setFillIn.add(0, 0xd7ff);
\r
5451 setFillIn.add(0xe000, 0x10ffff);
\r
5454 this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which);
\r