2 *******************************************************************************
3 * Copyright (C) 2008-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.charset;
9 import java.nio.ByteBuffer;
10 import java.nio.CharBuffer;
11 import java.nio.IntBuffer;
12 import java.nio.charset.CharsetDecoder;
13 import java.nio.charset.CharsetEncoder;
14 import java.nio.charset.CoderResult;
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.text.UTF16;
18 import com.ibm.icu.text.UnicodeSet;
24 class CharsetBOCU1 extends CharsetICU {
25 /* BOCU constants and macros */
27 /* initial value for "prev": middle of the ASCII range */
28 private static final byte BOCU1_ASCII_PREV = 0x40;
30 /* bounding byte values for differences */
31 private static final int BOCU1_MIN = 0x21;
32 private static final int BOCU1_MIDDLE = 0x90;
33 //private static final int BOCU1_MAX_LEAD = 0xfe;
34 private static final int BOCU1_MAX_TRAIL = 0xff;
35 private static final int BOCU1_RESET = 0xff;
37 /* number of lead bytes */
38 //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1);
40 /* adjust trail byte counts for the use of some C0 control byte values */
41 private static final int BOCU1_TRAIL_CONTROLS_COUNT = 20;
42 private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT);
44 /* number of trail bytes */
45 private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT);
48 * number of positive and negative single-byte codes
49 * (counting 0==BOCU1_MIDDLE among the positive ones)
51 private static final int BOCU1_SINGLE = 64;
53 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
54 private static final int BOCU1_LEAD_2 = 43;
55 private static final int BOCU1_LEAD_3 = 3;
56 //private static final int BOCU1_LEAD_4 = 1;
58 /* The difference value range for single-byters. */
59 private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1);
60 private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE);
62 /* The difference value range for double-byters. */
63 private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
64 private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
66 /* The difference value range for 3-byters. */
67 private static final int BOCU1_REACH_POS_3 =
68 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
70 private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
72 /* The lead byte start values. */
73 private static final int BOCU1_START_POS_2 = (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1);
74 private static final int BOCU1_START_POS_3 = (BOCU1_START_POS_2+BOCU1_LEAD_2);
75 private static final int BOCU1_START_POS_4 = (BOCU1_START_POS_3+BOCU1_LEAD_3);
76 /* ==BOCU1_MAX_LEAD */
78 private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1);
79 private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2);
80 //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3);
83 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
84 /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) {
85 return ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 :
86 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 :
87 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4);
90 /* The length of a byte sequence, according to its packed form. */
91 private static int BOCU1_LENGTH_FROM_PACKED(int packed) {
/*
 * For 1..3-byte sequences the top byte of the packed value holds the length
 * (0x01, 0x02 or 0x03), so the value is below 0x04000000 and the length is
 * read back with >>24. A value at or above 0x04000000 has a lead byte in the
 * top byte instead, which identifies a 4-byte sequence.
 * NOTE(review): UNSIGNED_INT_MASK is declared outside this listing; it is
 * presumably a long mask that makes the comparison unsigned, so packed values
 * with the sign bit set also land in the 4-byte branch - confirm.
 */
92 return (((packed)&UConverterConstants.UNSIGNED_INT_MASK)<0x04000000 ? (packed)>>24 : 4);
96 * Byte value map for control codes,
97 * from external byte values 0x00..0x20
98 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
99 * External byte values that are illegal as trail bytes are mapped to -1.
101 private static final int[]
103 /* 0 1 2 3 4 5 6 7 */
104 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
106 /* 8 9 a b c d e f */
107 -1, -1, -1, -1, -1, -1, -1, -1,
109 /* 10 11 12 13 14 15 16 17 */
110 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
112 /* 18 19 1a 1b 1c 1d 1e 1f */
113 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
120 * Byte value map for control codes,
121 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
122 * to external byte values 0x00..0x20.
124 private static final int[]
126 /* 0 1 2 3 4 5 6 7 */
127 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
129 /* 8 9 a b c d e f */
130 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
133 0x1c, 0x1d, 0x1e, 0x1f
138 * 12 commonly used C0 control codes (and space) are only used to encode
139 * themselves directly,
140 * which makes BOCU-1 MIME-usable and reasonably safe for
141 * ASCII-oriented software.
161 * The other 20 C0 controls are also encoded directly (to preserve order)
162 * but are also used as trail bytes in difference encoding
163 * (for better compression).
165 private static int BOCU1_TRAIL_TO_BYTE(int trail) {
/*
 * Trail values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in
 * bocu1TrailToByte; all other values are shifted by BOCU1_TRAIL_BYTE_OFFSET
 * (BOCU1_MIN minus the controls count).
 * No range check is done here: a negative trail value would index the
 * table out of bounds.
 */
166 return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]);
169 /* BOCU-1 implementation functions ------------------------------------------ */
/**
 * Computes the default "previous" state value for a code point.
 * The low seven bits of c are cleared and BOCU1_ASCII_PREV (0x40) is added,
 * which yields offset 0x40 inside the 128-code-point block that contains c.
 *
 * @param c current code point
 * @return state value to use as "prev" when encoding or decoding the next difference
 */
170 private static int BOCU1_SIMPLE_PREV(int c){
171 return (((c)&~0x7f)+BOCU1_ASCII_PREV);
175 * Compute the next "previous" value for differencing
176 * from the current code point.
178 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
179 * @return "previous code point" state value
181 private static int bocu1Prev(int c) {
182 /* compute new prev */
183 if(/* 0x3040<=c && */ c<=0x309f) {
184 /* Hiragana is not 128-aligned */
186 } else if(0x4e00<=c && c<=0x9fa5) {
188 return 0x4e00-BOCU1_REACH_NEG_2;
189 } else if(0xac00<=c /* && c<=0xd7a3 */) {
191 return (0xd7a3+0xac00)/2;
193 /* mostly small scripts */
194 return BOCU1_SIMPLE_PREV(c);
198 /** Fast version of bocu1Prev() for most scripts. */
199 private static int BOCU1_PREV(int c) {
/*
 * Code points below 0x3040 or above 0xd7a3 always use the simple
 * 128-aligned rule, so bocu1Prev() is only called for the range
 * 0x3040..0xd7a3 where script-specific values may apply.
 */
200 return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c));
203 protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};
205 /* Faster versions of packDiff() for single-byte-encoded diff values. */
207 /** Is a diff value encodable in a single byte? */
208 private static boolean DIFF_IS_SINGLE(int diff){
/*
 * Inclusive range check against BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1,
 * i.e. -64..63 given BOCU1_SINGLE == 64.
 */
209 return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1);
212 /** Encode a diff value in a single byte. */
213 private static int PACK_SINGLE_DIFF(int diff){
/*
 * The single byte is the difference offset from BOCU1_MIDDLE (0x90).
 * No range check is done here; for a diff accepted by DIFF_IS_SINGLE()
 * the result lies in 0x50..0xcf.
 */
214 return (BOCU1_MIDDLE+(diff));
217 /** Is a diff value encodable in two bytes? */
218 private static boolean DIFF_IS_DOUBLE(int diff){
/*
 * Inclusive range check against BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2.
 * This range contains the single-byte range as well, so the result is only
 * meaningful as "needs exactly two bytes" after DIFF_IS_SINGLE() has
 * already returned false for the same diff.
 */
219 return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2);
222 public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){
223 super(icuCanonicalName, javaCanonicalName, aliases);
229 class CharsetEncoderBOCU extends CharsetEncoderICU {
230 public CharsetEncoderBOCU(CharsetICU cs) {
231 super(cs,fromUSubstitution);
234 int sourceIndex, nextSourceIndex;
236 boolean checkNegative;
237 boolean LoopAfterTrail;
241 /* label values for supporting behavior similar to goto in C */
242 private static final int fastSingle=0;
243 private static final int getTrail=1;
244 private static final int regularLoop=2;
246 private boolean LabelLoop; //used to break the while loop
247 private int labelType = fastSingle; //labelType is set to fastSingle to start the code from fastSingle:
250 * Integer division and modulo with negative numerators
251 * yields negative modulo results and quotients that are one more than
253 * This macro adjusts the results so that the modulo-value m is always >=0.
255 * For positive n, the if() condition is always FALSE.
257 * @param n Number to be split into quotient and rest.
258 * Will be modified to contain the quotient.
260 * @param m Output variable for the rest (modulo result).
262 private int NEGDIVMOD(int n, int d, int m) {
274 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
275 * and return a packed integer with them.
277 * The encoding favors small absolute differences with short encodings
278 * to compress runs of same-script characters.
280 * Optimized version with unrolled loops and fewer floating-point operations
281 * than the standard packDiff().
283 * @param diff difference value -0x10ffff..0x10ffff
285 * 0x010000zz for 1-byte sequence zz
286 * 0x0200yyzz for 2-byte sequence yy zz
287 * 0x03xxyyzz for 3-byte sequence xx yy zz
288 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
290 private int packDiff(int n) {
294 if(diff>=BOCU1_REACH_NEG_1) {
295 /* mostly positive differences, and single-byte negative ones */
296 if(diff<=BOCU1_REACH_POS_2) {
298 diff-=BOCU1_REACH_POS_1+1;
301 m=diff%BOCU1_TRAIL_COUNT;
302 diff/=BOCU1_TRAIL_COUNT;
303 result|=BOCU1_TRAIL_TO_BYTE(m);
305 result|=(BOCU1_START_POS_2+diff)<<8;
306 } else if(diff<=BOCU1_REACH_POS_3) {
308 diff-=BOCU1_REACH_POS_2+1;
311 m=diff%BOCU1_TRAIL_COUNT;
312 diff/=BOCU1_TRAIL_COUNT;
313 result|=BOCU1_TRAIL_TO_BYTE(m);
315 m=diff%BOCU1_TRAIL_COUNT;
316 diff/=BOCU1_TRAIL_COUNT;
317 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
319 result|=(BOCU1_START_POS_3+diff)<<16;
322 diff-=BOCU1_REACH_POS_3+1;
324 m=diff%BOCU1_TRAIL_COUNT;
325 diff/=BOCU1_TRAIL_COUNT;
326 result=BOCU1_TRAIL_TO_BYTE(m);
328 m=diff%BOCU1_TRAIL_COUNT;
329 diff/=BOCU1_TRAIL_COUNT;
330 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
333 * We know that / and % would deliver quotient 0 and rest=diff.
334 * Avoid division and modulo for performance.
336 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
338 result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24;
341 /* two- to four-byte negative differences */
342 if(diff>=BOCU1_REACH_NEG_2) {
344 diff-=BOCU1_REACH_NEG_1;
347 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
348 result|=BOCU1_TRAIL_TO_BYTE(m);
350 result|=(BOCU1_START_NEG_2+diff)<<8;
351 } else if(diff>=BOCU1_REACH_NEG_3) {
353 diff-=BOCU1_REACH_NEG_2;
356 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
357 result|=BOCU1_TRAIL_TO_BYTE(m);
359 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
362 result|=(BOCU1_START_NEG_3+diff)<<16;
365 diff-=BOCU1_REACH_NEG_3;
367 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
368 result=BOCU1_TRAIL_TO_BYTE(m);
370 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
374 * We know that NEGDIVMOD would deliver
375 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
376 * Avoid division and modulo for performance.
378 m=diff+BOCU1_TRAIL_COUNT;
379 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
381 result|=BOCU1_MIN<<24;
387 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
388 cr = CoderResult.UNDERFLOW;
390 LabelLoop = true; //used to break the while loop
391 checkNegative = false; // its value is set to true to get out of while loop when c = -c
392 LoopAfterTrail = false; // its value is set to true to ignore code before getTrail:
394 /*set up the local pointers*/
395 targetCapacity = target.limit() - target.position();
397 prev = fromUnicodeStatus;
400 prev = BOCU1_ASCII_PREV;
403 /*sourceIndex ==-1 if the current character began in the previous buffer*/
404 sourceIndex = c == 0 ? 0: -1;
408 if(c!=0 && targetCapacity>0){
409 labelType = getTrail;
415 labelType = fastSingle(source, target, offsets);
418 labelType = getTrail(source, target, offsets);
421 labelType = regularLoop(source, target, offsets);
429 private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){
431 /*fast loop for single-byte differences*/
432 /*use only one loop counter variable , targetCapacity, not also source*/
433 diff = source.limit() - source.position();
434 if(targetCapacity>diff){
435 targetCapacity = diff;
437 while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){
440 prev = BOCU1_ASCII_PREV;
444 offsets.put(nextSourceIndex++);
446 source.position(source.position()+1);
450 if(DIFF_IS_SINGLE(diff)){
451 prev = BOCU1_SIMPLE_PREV(c);
452 target.put((byte)PACK_SINGLE_DIFF(diff));
454 offsets.put(nextSourceIndex++);
456 source.position(source.position()+1);
466 private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
467 if(source.hasRemaining()){
468 /*test the following code unit*/
469 char trail = source.get(source.position());
470 if(UTF16.isTrailSurrogate(trail)){
471 source.position(source.position()+1);
473 c=UCharacter.getCodePoint((char)c, trail);
477 c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/
478 checkNegative = true;
480 LoopAfterTrail = true;
484 @SuppressWarnings("fallthrough")
485 private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
487 /*restore real values*/
488 targetCapacity = target.limit()-target.position();
489 sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/
491 /*regular loop for all classes*/
492 while(LoopAfterTrail || source.hasRemaining()){
493 if(LoopAfterTrail || targetCapacity>0){
501 * ISO C0 control & space:
502 * Encode directly for MIME compatibility,
503 * and reset state except for space, to not disrupt compression.
506 prev=BOCU1_ASCII_PREV;
510 offsets.put(sourceIndex++);
514 sourceIndex=nextSourceIndex;
518 if(UTF16.isLeadSurrogate((char)c)){
519 getTrail(source, target, offsets);
527 LoopAfterTrail = false;
531 * all other Unicode code points c==U+0021..U+10ffff
532 * are encoded with the difference c-prev
534 * a new prev is computed from c,
535 * placed in the middle of a 0x80-block (for most small scripts) or
536 * in the middle of the Unihan and Hangul blocks
537 * to statistically minimize the following difference
540 prev = BOCU1_PREV(c);
541 if(DIFF_IS_SINGLE(diff)){
542 target.put((byte)PACK_SINGLE_DIFF(diff));
544 offsets.put(sourceIndex++);
547 sourceIndex=nextSourceIndex;
549 labelType = fastSingle;
552 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){
553 /*optimize 2 byte case*/
556 diff -= BOCU1_REACH_POS_1 +1;
557 m = diff%BOCU1_TRAIL_COUNT;
558 diff/=BOCU1_TRAIL_COUNT;
559 diff+=BOCU1_START_POS_2;
561 diff -= BOCU1_REACH_NEG_1;
562 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
563 diff+=BOCU1_START_NEG_2;
565 target.put((byte)diff);
566 target.put((byte)BOCU1_TRAIL_TO_BYTE(m));
568 offsets.put(sourceIndex);
569 offsets.put(sourceIndex);
572 sourceIndex = nextSourceIndex;
574 int length; /*will be 2..4*/
575 diff = packDiff(diff);
576 length = BOCU1_LENGTH_FROM_PACKED(diff);
578 /*write the output character bytes from diff and length*/
579 /*from the first if in the loop we know that targetCapacity>0*/
580 if(length<=targetCapacity){
582 /*each branch falls through the next one*/
584 target.put((byte)(diff>>24));
586 offsets.put(sourceIndex);
589 target.put((byte)(diff>>16));
591 offsets.put(sourceIndex);
594 target.put((byte)(diff>>8));
596 offsets.put(sourceIndex);
598 /*case 1 handled above*/
599 target.put((byte)diff);
601 offsets.put(sourceIndex);
607 targetCapacity -= length;
608 sourceIndex = nextSourceIndex;
610 ByteBuffer error = ByteBuffer.wrap(errorBuffer);
612 * We actually do this backwards here:
613 * In order to save an intermediate variable, we output
614 * first to the overflow buffer what does not fit into the
617 /* we know that 1<=targetCapacity<length<=4 */
618 length-=targetCapacity;
620 /* each branch falls through to the next one */
622 error.put((byte)(diff>>16));
624 error.put((byte)(diff>>8));
626 error.put((byte)diff);
628 /* will never occur */
631 errorBufferLength = length;
633 /* now output what fits into the regular target */
634 diff>>=8*length; /* length was reduced by targetCapacity */
635 switch(targetCapacity) {
636 /* each branch falls through to the next one */
638 target.put((byte)(diff>>16));
640 offsets.put(sourceIndex);
643 target.put((byte)(diff>>8));
645 offsets.put(sourceIndex);
648 target.put((byte)diff);
650 offsets.put(sourceIndex);
653 /* will never occur */
657 /* target overflow */
659 cr = CoderResult.OVERFLOW;
665 cr = CoderResult.OVERFLOW;
670 /*set the converter state back into UConverter*/
671 fromUChar32 = c<0 ? -c :0;
672 fromUnicodeStatus = prev;
674 labelType = fastSingle;
680 static class CharsetDecoderBOCU extends CharsetDecoderICU{
681 public CharsetDecoderBOCU(CharsetICU cs) {
686 int sourceIndex, nextSourceIndex;
687 int prev, c , diff, count;
692 /* label values for supporting behavior similar to goto in C */
693 private static final int fastSingle=0;
694 private static final int getTrail=1;
695 private static final int regularLoop=2;
696 private static final int endLoop=3;
698 private boolean LabelLoop;//used to break the while loop
699 private boolean afterTrail; // its value is set to true to ignore code after getTrail:
700 private int labelType;
702 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
703 * The UConverter fields are used as follows:
705 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
707 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
708 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
711 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
716 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
718 * @param b lead byte;
719 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
720 * @return (diff<<2)|count
722 private int decodeBocu1LeadByte(int b) {
723 int diffValue, countValue;
725 if(b >= BOCU1_START_NEG_2) {
726 /* positive difference */
727 if(b < BOCU1_START_POS_3) {
729 diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1;
731 } else if(b < BOCU1_START_POS_4) {
733 diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
737 diffValue = BOCU1_REACH_POS_3+1;
741 /* negative difference */
742 if(b >= BOCU1_START_NEG_3) {
744 diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
746 } else if(b>BOCU1_MIN) {
748 diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;
752 diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
757 /* return the state for decoding the trail byte(s) */
758 return (diffValue<<2)|countValue;
762 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
764 * @param count number of remaining trail bytes including this one
765 * @param b trail byte
766 * @return new delta for diff including b - <0 indicates an error
770 private int decodeBocu1TrailByte(int countValue, int b) {
771 b = b&UConverterConstants.UNSIGNED_BYTE_MASK;
773 /* skip some C0 controls and make the trail byte range contiguous */
774 b = bocu1ByteToTrail[b];
775 /* b<0 for an illegal trail byte value will result in return<0 below */
777 //b-= BOCU1_TRAIL_BYTE_OFFSET;
778 b = b - BOCU1_TRAIL_BYTE_OFFSET;
781 /* add trail byte into difference and decrement count */
784 } else if(countValue==2) {
785 return b*BOCU1_TRAIL_COUNT;
786 } else /* count==3 */ {
787 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
791 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
793 cr = CoderResult.UNDERFLOW;
797 labelType = fastSingle; // labelType is set to fastSingle so that decoding starts in the fastSingle state
799 /*get the converter state*/
800 prev = toUnicodeStatus;
803 prev = BOCU1_ASCII_PREV;
809 byteIndex = toULength;
810 bytes = toUBytesArray;
812 /* sourceIndex=-1 if the current character began in the previous buffer */
813 sourceIndex=byteIndex==0 ? 0 : -1;
816 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
817 if(count>0 && byteIndex>0 && target.position()<target.limit()) {
818 labelType = getTrail;
824 labelType = fastSingle(source, target, offsets);
827 labelType = getTrail(source, target, offsets);
830 labelType = afterGetTrail(source, target, offsets);
833 endLoop(source, target, offsets);
841 private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets){
842 labelType = regularLoop;
843 /* fast loop for single-byte differences */
844 /* use count as the only loop counter variable */
845 diff = source.limit() - source.position();
846 count = target.limit()-target.position();
851 if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) {
852 c = prev + (c-BOCU1_MIDDLE);
856 offsets.put(nextSourceIndex++);
858 prev = BOCU1_SIMPLE_PREV(c);
862 } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) {
863 if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) {
864 prev = BOCU1_ASCII_PREV;
868 offsets.put(nextSourceIndex++);
873 source.position(source.position()+1);
876 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
880 private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
881 labelType = regularLoop;
883 if(source.position() >= source.limit()) {
888 c = bytes[byteIndex++] = source.get();
890 /* trail byte in any position */
891 c = decodeBocu1TrailByte(count, c);
893 cr = CoderResult.malformedForLength(1);
900 /* final trail byte, deliver a code point */
904 cr = CoderResult.malformedForLength(1);
916 private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
917 /* decode a sequence of single and lead bytes */
918 while(afterTrail || source.hasRemaining()) {
920 if(target.position() >= target.limit()) {
922 cr = CoderResult.OVERFLOW;
927 c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK;
928 if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) {
929 /* Write a code point directly from a single-byte difference. */
930 c = prev + (c-BOCU1_MIDDLE);
934 offsets.put(sourceIndex);
936 prev = BOCU1_SIMPLE_PREV(c);
937 sourceIndex = nextSourceIndex;
938 labelType = fastSingle;
941 } else if(c <= 0x20) {
943 * Direct-encoded C0 control code or space.
944 * Reset prev for C0 control codes but not for space.
947 prev=BOCU1_ASCII_PREV;
951 offsets.put(sourceIndex);
953 sourceIndex=nextSourceIndex;
955 } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) {
956 /* Optimize two-byte case. */
957 if(c >= BOCU1_MIDDLE) {
958 diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
960 diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
965 c = decodeBocu1TrailByte(1, source.get());
966 if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) {
967 bytes[0]= source.get(source.position()-2);
968 bytes[1]= source.get(source.position()-1);
970 cr = CoderResult.malformedForLength(2);
973 } else if(c == BOCU1_RESET) {
974 /* only reset the state, no code point */
975 prev=BOCU1_ASCII_PREV;
976 sourceIndex=nextSourceIndex;
980 * For multi-byte difference lead bytes, set the decoder state
981 * with the partial difference value from the lead byte and
982 * with the number of trail bytes.
987 diff = decodeBocu1LeadByte(c);
990 getTrail(source, target, offsets);
991 if(labelType != regularLoop){
1001 /* calculate the next prev and output c */
1002 prev = BOCU1_PREV(c);
1004 target.put((char)c);
1006 offsets.put(sourceIndex);
1009 /* output surrogate pair */
1010 target.put(UTF16.getLeadSurrogate(c));
1011 if(target.hasRemaining()) {
1012 target.put(UTF16.getTrailSurrogate(c));
1014 offsets.put(sourceIndex);
1015 offsets.put(sourceIndex);
1018 /* target overflow */
1020 offsets.put(sourceIndex);
1022 charErrorBufferArray[0] = UTF16.getTrailSurrogate(c);
1023 charErrorBufferLength = 1;
1024 cr = CoderResult.OVERFLOW;
1028 sourceIndex=nextSourceIndex;
1030 labelType = endLoop;
1034 private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
1035 if(cr.isMalformed()) {
1036 /* set the converter state in UConverter to deal with the next character */
1037 toUnicodeStatus = BOCU1_ASCII_PREV;
1040 /* set the converter state back into UConverter */
1041 toUnicodeStatus=prev;
1042 mode=(diff<<2)|count;
1044 toULength=byteIndex;
/**
 * Creates a new BOCU-1 decoder for this charset.
 *
 * @return a new CharsetDecoderBOCU constructed with this charset
 */
1051 public CharsetDecoder newDecoder() {
1052 return new CharsetDecoderBOCU(this);
/**
 * Creates a new BOCU-1 encoder for this charset.
 *
 * @return a new CharsetEncoderBOCU constructed with this charset
 */
1055 public CharsetEncoder newEncoder() {
1056 return new CharsetEncoderBOCU(this);
/**
 * Fills the given set by delegating to CharsetICU.getCompleteUnicodeSet().
 * The {@code which} selector is not used by the visible statement.
 *
 * @param setFillIn set that receives the result
 * @param which selector passed in by the caller; not read in the visible code
 */
1059 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
1060 CharsetICU.getCompleteUnicodeSet(setFillIn);