2 *******************************************************************************
\r
3 * Copyright (C) 2008-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.charset;
\r
9 import java.nio.ByteBuffer;
\r
10 import java.nio.CharBuffer;
\r
11 import java.nio.IntBuffer;
\r
12 import java.nio.charset.CharsetDecoder;
\r
13 import java.nio.charset.CharsetEncoder;
\r
14 import java.nio.charset.CoderResult;
\r
16 import com.ibm.icu.lang.UCharacter;
\r
17 import com.ibm.icu.text.UTF16;
\r
18 import com.ibm.icu.text.UnicodeSet;
\r
24 class CharsetBOCU1 extends CharsetICU {
\r
25 /* BOCU constants and macros */
\r
27 /* initial value for "prev": middle of the ASCII range */
\r
28 private static final byte BOCU1_ASCII_PREV = 0x40;
\r
30 /* bounding byte values for differences */
\r
31 private static final int BOCU1_MIN = 0x21;
\r
32 private static final int BOCU1_MIDDLE = 0x90;
\r
33 //private static final int BOCU1_MAX_LEAD = 0xfe;
\r
34 private static final int BOCU1_MAX_TRAIL = 0xff;
\r
35 private static final int BOCU1_RESET = 0xff;
\r
37 /* number of lead bytes */
\r
38 //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1);
\r
40 /* adjust trail byte counts for the use of some C0 control byte values */
\r
41 private static final int BOCU1_TRAIL_CONTROLS_COUNT = 20;
\r
42 private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT);
\r
44 /* number of trail bytes */
\r
45 private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT);
\r
48 * number of positive and negative single-byte codes
\r
49 * (counting 0==BOCU1_MIDDLE among the positive ones)
\r
51 private static final int BOCU1_SINGLE = 64;
\r
53 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
\r
54 private static final int BOCU1_LEAD_2 = 43;
\r
55 private static final int BOCU1_LEAD_3 = 3;
\r
56 //private static final int BOCU1_LEAD_4 = 1;
\r
58 /* The difference value range for single-byters. */
\r
59 private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1);
\r
60 private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE);
\r
62 /* The difference value range for double-byters. */
\r
63 private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
\r
64 private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
\r
66 /* The difference value range for 3-byters. */
\r
67 private static final int BOCU1_REACH_POS_3 =
\r
68 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
\r
70 private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
\r
72 /* The lead byte start values. */
\r
73 private static final int BOCU1_START_POS_2 = (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1);
\r
74 private static final int BOCU1_START_POS_3 = (BOCU1_START_POS_2+BOCU1_LEAD_2);
\r
75 private static final int BOCU1_START_POS_4 = (BOCU1_START_POS_3+BOCU1_LEAD_3);
\r
76 /* ==BOCU1_MAX_LEAD */
\r
78 private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1);
\r
79 private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2);
\r
80 //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3);
\r
83 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
\r
84 /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) {
\r
85 return ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 :
\r
86 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 :
\r
87 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4);
\r
90 /* The length of a byte sequence, according to its packed form. */
\r
91 private static int BOCU1_LENGTH_FROM_PACKED(int packed) {
\r
92 return (((packed)&UConverterConstants.UNSIGNED_INT_MASK)<0x04000000 ? (packed)>>24 : 4);
\r
96 * Byte value map for control codes,
\r
97 * from external byte values 0x00..0x20
\r
98 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
\r
99 * External byte values that are illegal as trail bytes are mapped to -1.
\r
101 private static final int[]
\r
103 /* 0 1 2 3 4 5 6 7 */
\r
104 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
\r
106 /* 8 9 a b c d e f */
\r
107 -1, -1, -1, -1, -1, -1, -1, -1,
\r
109 /* 10 11 12 13 14 15 16 17 */
\r
110 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
\r
112 /* 18 19 1a 1b 1c 1d 1e 1f */
\r
113 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
\r
120 * Byte value map for control codes,
\r
121 * from trail byte values 0..19 (0..0x13) as used in the difference calculation
\r
122 * to external byte values 0x00..0x20.
\r
124 private static final int[]
\r
125 bocu1TrailToByte = {
\r
126 /* 0 1 2 3 4 5 6 7 */
\r
127 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
\r
129 /* 8 9 a b c d e f */
\r
130 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
\r
133 0x1c, 0x1d, 0x1e, 0x1f
\r
138 * 12 commonly used C0 control codes (and space) are only used to encode
\r
139 * themselves directly,
\r
140 * which makes BOCU-1 MIME-usable and reasonably safe for
\r
141 * ASCII-oriented software.
\r
143 * These controls are
\r
161 * The other 20 C0 controls are also encoded directly (to preserve order)
\r
162 * but are also used as trail bytes in difference encoding
\r
163 * (for better compression).
\r
165 private static int BOCU1_TRAIL_TO_BYTE(int trail) {
\r
166 return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]);
\r
169 /* BOCU-1 implementation functions ------------------------------------------ */
\r
170 private static int BOCU1_SIMPLE_PREV(int c){
\r
171 return (((c)&~0x7f)+BOCU1_ASCII_PREV);
\r
175 * Compute the next "previous" value for differencing
\r
176 * from the current code point.
\r
178 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
\r
179 * @return "previous code point" state value
\r
181 private static int bocu1Prev(int c) {
\r
182 /* compute new prev */
\r
183 if(/* 0x3040<=c && */ c<=0x309f) {
\r
184 /* Hiragana is not 128-aligned */
\r
186 } else if(0x4e00<=c && c<=0x9fa5) {
\r
188 return 0x4e00-BOCU1_REACH_NEG_2;
\r
189 } else if(0xac00<=c /* && c<=0xd7a3 */) {
\r
190 /* Korean Hangul */
\r
191 return (0xd7a3+0xac00)/2;
\r
193 /* mostly small scripts */
\r
194 return BOCU1_SIMPLE_PREV(c);
\r
198 /** Fast version of bocu1Prev() for most scripts. */
\r
199 private static int BOCU1_PREV(int c) {
\r
200 return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c));
\r
203 protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};
\r
205 /* Faster versions of packDiff() for single-byte-encoded diff values. */
\r
207 /** Is a diff value encodable in a single byte? */
\r
208 private static boolean DIFF_IS_SINGLE(int diff){
\r
209 return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1);
\r
212 /** Encode a diff value in a single byte. */
\r
213 private static int PACK_SINGLE_DIFF(int diff){
\r
214 return (BOCU1_MIDDLE+(diff));
\r
217 /** Is a diff value encodable in two bytes? */
\r
218 private static boolean DIFF_IS_DOUBLE(int diff){
\r
219 return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2);
\r
/**
 * Creates the BOCU-1 charset and records its size bounds:
 * 1 to 4 bytes per char, and at most 1 char per byte.
 *
 * @param icuCanonicalName  ICU canonical name, passed through to CharsetICU
 * @param javaCanonicalName Java canonical name, passed through to CharsetICU
 * @param aliases           alias names, passed through to CharsetICU
 */
public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);

    /* encoder bounds */
    minBytesPerChar = 1;
    maxBytesPerChar = 4;

    /* decoder bound */
    maxCharsPerByte = 1;
}
\r
229 class CharsetEncoderBOCU extends CharsetEncoderICU {
\r
230 public CharsetEncoderBOCU(CharsetICU cs) {
\r
231 super(cs,fromUSubstitution);
\r
234 int sourceIndex, nextSourceIndex;
\r
235 int prev, c , diff;
\r
236 boolean checkNegative;
\r
237 boolean LoopAfterTrail;
\r
238 int targetCapacity;
\r
241 /* label values for supporting behavior similar to goto in C */
\r
242 private static final int fastSingle=0;
\r
243 private static final int getTrail=1;
\r
244 private static final int regularLoop=2;
\r
246 private boolean LabelLoop; //used to break the while loop
\r
247 private int labelType = fastSingle; //labeType is set to fastSingle to start the code from fastSingle:
\r
250 * Integer division and modulo with negative numerators
\r
251 * yields negative modulo results and quotients that are one more than
\r
252 * what we need here.
\r
253 * This method adjusts the results so that the modulo-value m is always >=0.
\r
255 * For positive n, the if() condition is always FALSE.
\r
257 * @param n Number to be split into quotient and rest.
\r
258 * Will be modified to contain the quotient.
\r
259 * @param d Divisor.
\r
260 * @param m Output variable for the rest (modulo result).
\r
262 private int NEGDIVMOD(int n, int d, int m) {
\r
274 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
\r
275 * and return a packed integer with them.
\r
277 * The encoding favors small absolute differences with short encodings
\r
278 * to compress runs of same-script characters.
\r
280 * Optimized version with unrolled loops and fewer floating-point operations
\r
281 * than the standard packDiff().
\r
283 * @param diff difference value -0x10ffff..0x10ffff
\r
285 * 0x010000zz for 1-byte sequence zz
\r
286 * 0x0200yyzz for 2-byte sequence yy zz
\r
287 * 0x03xxyyzz for 3-byte sequence xx yy zz
\r
288 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
\r
290 private int packDiff(int n) {
\r
294 if(diff>=BOCU1_REACH_NEG_1) {
\r
295 /* mostly positive differences, and single-byte negative ones */
\r
296 if(diff<=BOCU1_REACH_POS_2) {
\r
298 diff-=BOCU1_REACH_POS_1+1;
\r
301 m=diff%BOCU1_TRAIL_COUNT;
\r
302 diff/=BOCU1_TRAIL_COUNT;
\r
303 result|=BOCU1_TRAIL_TO_BYTE(m);
\r
305 result|=(BOCU1_START_POS_2+diff)<<8;
\r
306 } else if(diff<=BOCU1_REACH_POS_3) {
\r
308 diff-=BOCU1_REACH_POS_2+1;
\r
311 m=diff%BOCU1_TRAIL_COUNT;
\r
312 diff/=BOCU1_TRAIL_COUNT;
\r
313 result|=BOCU1_TRAIL_TO_BYTE(m);
\r
315 m=diff%BOCU1_TRAIL_COUNT;
\r
316 diff/=BOCU1_TRAIL_COUNT;
\r
317 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
\r
319 result|=(BOCU1_START_POS_3+diff)<<16;
\r
322 diff-=BOCU1_REACH_POS_3+1;
\r
324 m=diff%BOCU1_TRAIL_COUNT;
\r
325 diff/=BOCU1_TRAIL_COUNT;
\r
326 result=BOCU1_TRAIL_TO_BYTE(m);
\r
328 m=diff%BOCU1_TRAIL_COUNT;
\r
329 diff/=BOCU1_TRAIL_COUNT;
\r
330 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
\r
333 * We know that / and % would deliver quotient 0 and rest=diff.
\r
334 * Avoid division and modulo for performance.
\r
336 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
\r
338 result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24;
\r
341 /* two- to four-byte negative differences */
\r
342 if(diff>=BOCU1_REACH_NEG_2) {
\r
344 diff-=BOCU1_REACH_NEG_1;
\r
347 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
\r
348 result|=BOCU1_TRAIL_TO_BYTE(m);
\r
350 result|=(BOCU1_START_NEG_2+diff)<<8;
\r
351 } else if(diff>=BOCU1_REACH_NEG_3) {
\r
353 diff-=BOCU1_REACH_NEG_2;
\r
356 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
\r
357 result|=BOCU1_TRAIL_TO_BYTE(m);
\r
359 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
\r
360 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
\r
362 result|=(BOCU1_START_NEG_3+diff)<<16;
\r
365 diff-=BOCU1_REACH_NEG_3;
\r
367 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
\r
368 result=BOCU1_TRAIL_TO_BYTE(m);
\r
370 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
\r
371 result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
\r
374 * We know that NEGDIVMOD would deliver
\r
375 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
\r
376 * Avoid division and modulo for performance.
\r
378 m=diff+BOCU1_TRAIL_COUNT;
\r
379 result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
\r
381 result|=BOCU1_MIN<<24;
\r
387 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
\r
388 cr = CoderResult.UNDERFLOW;
\r
390 LabelLoop = true; //used to break the while loop
\r
391 checkNegative = false; // its value is set to true to get out of while loop when c = -c
\r
392 LoopAfterTrail = false; // its value is set to true to ignore code before getTrail:
\r
394 /*set up the local pointers*/
\r
395 targetCapacity = target.limit() - target.position();
\r
397 prev = fromUnicodeStatus;
\r
400 prev = BOCU1_ASCII_PREV;
\r
403 /*sourceIndex ==-1 if the current character began in the previous buffer*/
\r
404 sourceIndex = c == 0 ? 0: -1;
\r
405 nextSourceIndex = 0;
\r
407 /*conversion loop*/
\r
408 if(c!=0 && targetCapacity>0){
\r
409 labelType = getTrail;
\r
415 labelType = fastSingle(source, target, offsets);
\r
418 labelType = getTrail(source, target, offsets);
\r
421 labelType = regularLoop(source, target, offsets);
\r
429 private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
431 /*fast loop for single-byte differences*/
\r
432 /*use only one loop counter variable , targetCapacity, not also source*/
\r
433 diff = source.limit() - source.position();
\r
434 if(targetCapacity>diff){
\r
435 targetCapacity = diff;
\r
437 while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){
\r
440 prev = BOCU1_ASCII_PREV;
\r
442 target.put((byte)c);
\r
444 offsets.put(nextSourceIndex++);
\r
446 source.position(source.position()+1);
\r
450 if(DIFF_IS_SINGLE(diff)){
\r
451 prev = BOCU1_SIMPLE_PREV(c);
\r
452 target.put((byte)PACK_SINGLE_DIFF(diff));
\r
454 offsets.put(nextSourceIndex++);
\r
456 source.position(source.position()+1);
\r
463 return regularLoop;
\r
466 private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
467 if(source.hasRemaining()){
\r
468 /*test the following code unit*/
\r
469 char trail = source.get(source.position());
\r
470 if(UTF16.isTrailSurrogate(trail)){
\r
471 source.position(source.position()+1);
\r
473 c=UCharacter.getCodePoint((char)c, trail);
\r
477 c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/
\r
478 checkNegative = true;
\r
480 LoopAfterTrail = true;
\r
481 return regularLoop;
\r
484 @SuppressWarnings("fallthrough")
\r
485 private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
486 if(!LoopAfterTrail){
\r
487 /*restore real values*/
\r
488 targetCapacity = target.limit()-target.position();
\r
489 sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/
\r
491 /*regular loop for all classes*/
\r
492 while(LoopAfterTrail || source.hasRemaining()){
\r
493 if(LoopAfterTrail || targetCapacity>0){
\r
495 if(!LoopAfterTrail){
\r
501 * ISO C0 control & space:
\r
502 * Encode directly for MIME compatibility,
\r
503 * and reset state except for space, to not disrupt compression.
\r
506 prev=BOCU1_ASCII_PREV;
\r
508 target.put((byte)c);
\r
509 if(offsets != null){
\r
510 offsets.put(sourceIndex++);
\r
514 sourceIndex=nextSourceIndex;
\r
518 if(UTF16.isLeadSurrogate((char)c)){
\r
519 getTrail(source, target, offsets);
\r
526 if(LoopAfterTrail){
\r
527 LoopAfterTrail = false;
\r
531 * all other Unicode code points c==U+0021..U+10ffff
\r
532 * are encoded with the difference c-prev
\r
534 * a new prev is computed from c,
\r
535 * placed in the middle of a 0x80-block (for most small scripts) or
\r
536 * in the middle of the Unihan and Hangul blocks
\r
537 * to statistically minimize the following difference
\r
540 prev = BOCU1_PREV(c);
\r
541 if(DIFF_IS_SINGLE(diff)){
\r
542 target.put((byte)PACK_SINGLE_DIFF(diff));
\r
544 offsets.put(sourceIndex++);
\r
547 sourceIndex=nextSourceIndex;
\r
549 labelType = fastSingle;
\r
552 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){
\r
553 /*optimize 2 byte case*/
\r
556 diff -= BOCU1_REACH_POS_1 +1;
\r
557 m = diff%BOCU1_TRAIL_COUNT;
\r
558 diff/=BOCU1_TRAIL_COUNT;
\r
559 diff+=BOCU1_START_POS_2;
\r
561 diff -= BOCU1_REACH_NEG_1;
\r
562 m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
\r
563 diff+=BOCU1_START_NEG_2;
\r
565 target.put((byte)diff);
\r
566 target.put((byte)BOCU1_TRAIL_TO_BYTE(m));
\r
568 offsets.put(sourceIndex);
\r
569 offsets.put(sourceIndex);
\r
571 targetCapacity -= 2;
\r
572 sourceIndex = nextSourceIndex;
\r
574 int length; /*will be 2..4*/
\r
575 diff = packDiff(diff);
\r
576 length = BOCU1_LENGTH_FROM_PACKED(diff);
\r
578 /*write the output character bytes from diff and length*/
\r
579 /*from the first if in the loop we know that targetCapacity>0*/
\r
580 if(length<=targetCapacity){
\r
582 /*each branch falls through the next one*/
\r
584 target.put((byte)(diff>>24));
\r
585 if(offsets!= null){
\r
586 offsets.put(sourceIndex);
\r
589 target.put((byte)(diff>>16));
\r
590 if(offsets!= null){
\r
591 offsets.put(sourceIndex);
\r
594 target.put((byte)(diff>>8));
\r
595 if(offsets!= null){
\r
596 offsets.put(sourceIndex);
\r
598 /*case 1 handled above*/
\r
599 target.put((byte)diff);
\r
600 if(offsets!= null){
\r
601 offsets.put(sourceIndex);
\r
604 /*will never occur*/
\r
607 targetCapacity -= length;
\r
608 sourceIndex = nextSourceIndex;
\r
610 ByteBuffer error = ByteBuffer.wrap(errorBuffer);
\r
612 * We actually do this backwards here:
\r
613 * In order to save an intermediate variable, we output
\r
614 * first to the overflow buffer what does not fit into the
\r
617 /* we know that 1<=targetCapacity<length<=4 */
\r
618 length-=targetCapacity;
\r
620 /* each branch falls through to the next one */
\r
622 error.put((byte)(diff>>16));
\r
624 error.put((byte)(diff>>8));
\r
626 error.put((byte)diff);
\r
628 /* will never occur */
\r
631 errorBufferLength = length;
\r
633 /* now output what fits into the regular target */
\r
634 diff>>=8*length; /* length was reduced by targetCapacity */
\r
635 switch(targetCapacity) {
\r
636 /* each branch falls through to the next one */
\r
638 target.put((byte)(diff>>16));
\r
639 if(offsets!= null){
\r
640 offsets.put(sourceIndex);
\r
643 target.put((byte)(diff>>8));
\r
644 if(offsets!= null){
\r
645 offsets.put(sourceIndex);
\r
648 target.put((byte)diff);
\r
649 if(offsets!= null){
\r
650 offsets.put(sourceIndex);
\r
653 /* will never occur */
\r
657 /* target overflow */
\r
659 cr = CoderResult.OVERFLOW;
\r
665 cr = CoderResult.OVERFLOW;
\r
670 /*set the converter state back into UConverter*/
\r
671 fromUChar32 = c<0 ? -c :0;
\r
672 fromUnicodeStatus = prev;
\r
674 labelType = fastSingle;
\r
680 class CharsetDecoderBOCU extends CharsetDecoderICU{
\r
681 public CharsetDecoderBOCU(CharsetICU cs) {
\r
686 int sourceIndex, nextSourceIndex;
\r
687 int prev, c , diff, count;
\r
689 int targetCapacity;
\r
692 /* label values for supporting behavior similar to goto in C */
\r
693 private static final int fastSingle=0;
\r
694 private static final int getTrail=1;
\r
695 private static final int regularLoop=2;
\r
696 private static final int endLoop=3;
\r
698 private boolean LabelLoop;//used to break the while loop
\r
699 private boolean afterTrail; // its value is set to true to ignore code after getTrail:
\r
700 private int labelType;
\r
702 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
\r
703 * The UConverter fields are used as follows:
\r
705 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
\r
707 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
\r
708 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
\r
711 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
\r
716 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
\r
718 * @param b lead byte;
\r
719 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
\r
720 * @return (diff<<2)|count
\r
722 private int decodeBocu1LeadByte(int b) {
\r
723 int diffValue, countValue;
\r
725 if(b >= BOCU1_START_NEG_2) {
\r
726 /* positive difference */
\r
727 if(b < BOCU1_START_POS_3) {
\r
729 diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1;
\r
731 } else if(b < BOCU1_START_POS_4) {
\r
733 diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
\r
737 diffValue = BOCU1_REACH_POS_3+1;
\r
741 /* negative difference */
\r
742 if(b >= BOCU1_START_NEG_3) {
\r
744 diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
\r
746 } else if(b>BOCU1_MIN) {
\r
748 diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;
\r
752 diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
\r
757 /* return the state for decoding the trail byte(s) */
\r
758 return (diffValue<<2)|countValue;
\r
762 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
\r
764 * @param count number of remaining trail bytes including this one
\r
765 * @param b trail byte
\r
766 * @return new delta for diff including b - <0 indicates an error
\r
770 private int decodeBocu1TrailByte(int countValue, int b) {
\r
771 b = b&UConverterConstants.UNSIGNED_BYTE_MASK;
\r
773 /* skip some C0 controls and make the trail byte range contiguous */
\r
774 b = bocu1ByteToTrail[b];
\r
775 /* b<0 for an illegal trail byte value will result in return<0 below */
\r
777 //b-= BOCU1_TRAIL_BYTE_OFFSET;
\r
778 b = b - BOCU1_TRAIL_BYTE_OFFSET;
\r
781 /* add trail byte into difference and decrement count */
\r
782 if(countValue==1) {
\r
784 } else if(countValue==2) {
\r
785 return b*BOCU1_TRAIL_COUNT;
\r
786 } else /* count==3 */ {
\r
787 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
\r
791 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
\r
793 cr = CoderResult.UNDERFLOW;
\r
796 afterTrail = false;
\r
797 labelType = fastSingle; // labelType is set to fastSingle so t
\r
799 /*get the converter state*/
\r
800 prev = toUnicodeStatus;
\r
803 prev = BOCU1_ASCII_PREV;
\r
809 byteIndex = toULength;
\r
810 bytes = toUBytesArray;
\r
812 /* sourceIndex=-1 if the current character began in the previous buffer */
\r
813 sourceIndex=byteIndex==0 ? 0 : -1;
\r
816 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
\r
817 if(count>0 && byteIndex>0 && target.position()<target.limit()) {
\r
818 labelType = getTrail;
\r
824 labelType = fastSingle(source, target, offsets);
\r
827 labelType = getTrail(source, target, offsets);
\r
830 labelType = afterGetTrail(source, target, offsets);
\r
833 endLoop(source, target, offsets);
\r
841 private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets){
\r
842 labelType = regularLoop;
\r
843 /* fast loop for single-byte differences */
\r
844 /* use count as the only loop counter variable */
\r
845 diff = source.limit() - source.position();
\r
846 count = target.limit()-target.position();
\r
851 if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) {
\r
852 c = prev + (c-BOCU1_MIDDLE);
\r
854 target.put((char)c);
\r
856 offsets.put(nextSourceIndex++);
\r
858 prev = BOCU1_SIMPLE_PREV(c);
\r
862 } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) {
\r
863 if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) {
\r
864 prev = BOCU1_ASCII_PREV;
\r
866 target.put((char)c);
\r
868 offsets.put(nextSourceIndex++);
\r
873 source.position(source.position()+1);
\r
876 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
\r
880 private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
\r
881 labelType = regularLoop;
\r
883 if(source.position() >= source.limit()) {
\r
884 labelType = endLoop;
\r
888 c = bytes[byteIndex++] = source.get();
\r
890 /* trail byte in any position */
\r
891 c = decodeBocu1TrailByte(count, c);
\r
893 cr = CoderResult.malformedForLength(1);
\r
894 labelType = endLoop;
\r
900 /* final trail byte, deliver a code point */
\r
904 cr = CoderResult.malformedForLength(1);
\r
905 labelType = endLoop;
\r
916 private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
\r
917 /* decode a sequence of single and lead bytes */
\r
918 while(afterTrail || source.hasRemaining()) {
\r
920 if(target.position() >= target.limit()) {
\r
921 /* target is full */
\r
922 cr = CoderResult.OVERFLOW;
\r
927 c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK;
\r
928 if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) {
\r
929 /* Write a code point directly from a single-byte difference. */
\r
930 c = prev + (c-BOCU1_MIDDLE);
\r
932 target.put((char)c);
\r
934 offsets.put(sourceIndex);
\r
936 prev = BOCU1_SIMPLE_PREV(c);
\r
937 sourceIndex = nextSourceIndex;
\r
938 labelType = fastSingle;
\r
941 } else if(c <= 0x20) {
\r
943 * Direct-encoded C0 control code or space.
\r
944 * Reset prev for C0 control codes but not for space.
\r
947 prev=BOCU1_ASCII_PREV;
\r
949 target.put((char)c);
\r
951 offsets.put(sourceIndex);
\r
953 sourceIndex=nextSourceIndex;
\r
955 } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) {
\r
956 /* Optimize two-byte case. */
\r
957 if(c >= BOCU1_MIDDLE) {
\r
958 diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
\r
960 diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
\r
965 c = decodeBocu1TrailByte(1, source.get());
\r
966 if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) {
\r
967 bytes[0]= source.get(source.position()-2);
\r
968 bytes[1]= source.get(source.position()-1);
\r
970 cr = CoderResult.malformedForLength(2);
\r
973 } else if(c == BOCU1_RESET) {
\r
974 /* only reset the state, no code point */
\r
975 prev=BOCU1_ASCII_PREV;
\r
976 sourceIndex=nextSourceIndex;
\r
980 * For multi-byte difference lead bytes, set the decoder state
\r
981 * with the partial difference value from the lead byte and
\r
982 * with the number of trail bytes.
\r
987 diff = decodeBocu1LeadByte(c);
\r
990 getTrail(source, target, offsets);
\r
991 if(labelType != regularLoop){
\r
998 afterTrail = false;
\r
1001 /* calculate the next prev and output c */
\r
1002 prev = BOCU1_PREV(c);
\r
1004 target.put((char)c);
\r
1005 if(offsets!=null){
\r
1006 offsets.put(sourceIndex);
\r
1009 /* output surrogate pair */
\r
1010 target.put(UTF16.getLeadSurrogate(c));
\r
1011 if(target.hasRemaining()) {
\r
1012 target.put(UTF16.getTrailSurrogate(c));
\r
1013 if(offsets!=null){
\r
1014 offsets.put(sourceIndex);
\r
1015 offsets.put(sourceIndex);
\r
1018 /* target overflow */
\r
1019 if(offsets!=null){
\r
1020 offsets.put(sourceIndex);
\r
1022 charErrorBufferArray[0] = UTF16.getTrailSurrogate(c);
\r
1023 charErrorBufferLength = 1;
\r
1024 cr = CoderResult.OVERFLOW;
\r
1028 sourceIndex=nextSourceIndex;
\r
1030 labelType = endLoop;
\r
1034 private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
\r
1035 if(cr.isMalformed()) {
\r
1036 /* set the converter state in UConverter to deal with the next character */
\r
1037 toUnicodeStatus = BOCU1_ASCII_PREV;
\r
1040 /* set the converter state back into UConverter */
\r
1041 toUnicodeStatus=prev;
\r
1042 mode=(diff<<2)|count;
\r
1044 toULength=byteIndex;
\r
1045 LabelLoop = false;
\r
1051 public CharsetDecoder newDecoder() {
\r
1052 return new CharsetDecoderBOCU(this);
\r
1055 public CharsetEncoder newEncoder() {
\r
1056 return new CharsetEncoderBOCU(this);
\r
1059 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
\r
1060 CharsetICU.getCompleteUnicodeSet(setFillIn);
\r