]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_8_1_1/main/classes/charset/src/com/ibm/icu/charset/CharsetBOCU1.java
Added flags.
[Dictionary.git] / jars / icu4j-4_8_1_1 / main / classes / charset / src / com / ibm / icu / charset / CharsetBOCU1.java
1 /*
2  *******************************************************************************
3  * Copyright (C) 2008-2010, International Business Machines Corporation and         *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.charset;
8
9 import java.nio.ByteBuffer;
10 import java.nio.CharBuffer;
11 import java.nio.IntBuffer;
12 import java.nio.charset.CharsetDecoder;
13 import java.nio.charset.CharsetEncoder;
14 import java.nio.charset.CoderResult;
15
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.text.UTF16;
18 import com.ibm.icu.text.UnicodeSet;
19
20 /**
21  * @author krajwade
22  *
23  */
24 class CharsetBOCU1 extends CharsetICU {   
25     /* BOCU constants and macros */
26     
27     /* initial value for "prev": middle of the ASCII range */
28     private static final byte BOCU1_ASCII_PREV = 0x40;
29     
30     /* bounding byte values for differences */
31     private static final int BOCU1_MIN = 0x21;
32     private static final int BOCU1_MIDDLE = 0x90;
33     //private static final int BOCU1_MAX_LEAD = 0xfe;
34     private static final int BOCU1_MAX_TRAIL = 0xff;
35     private static final int BOCU1_RESET = 0xff;
36
37     /* number of lead bytes */
38     //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1);
39
40     /* adjust trail byte counts for the use of some C0 control byte values */
41     private static final int BOCU1_TRAIL_CONTROLS_COUNT =  20;
42     private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT);
43
44     /* number of trail bytes */
45     private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT);
46     
47     /*
48      * number of positive and negative single-byte codes
49      * (counting 0==BOCU1_MIDDLE among the positive ones)
50      */
51     private static final int BOCU1_SINGLE = 64;
52
53     /* number of lead bytes for positive and negative 2/3/4-byte sequences */
54     private static final int BOCU1_LEAD_2 = 43;
55     private static final int BOCU1_LEAD_3 = 3;
56     //private static final int BOCU1_LEAD_4 = 1;
57
58     /* The difference value range for single-byters. */
59     private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1);
60     private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE);
61
62     /* The difference value range for double-byters. */
63     private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
64     private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);
65
66     /* The difference value range for 3-byters. */
67     private static final int BOCU1_REACH_POS_3  =
68         (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
69
70     private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
71
72     /* The lead byte start values. */
73     private static final int BOCU1_START_POS_2 =  (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1);
74     private static final int BOCU1_START_POS_3  = (BOCU1_START_POS_2+BOCU1_LEAD_2);
75     private static final int BOCU1_START_POS_4  = (BOCU1_START_POS_3+BOCU1_LEAD_3);
76          /* ==BOCU1_MAX_LEAD */
77
78     private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1);
79     private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2);
80     //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3);
81          /* ==BOCU1_MIN+1 */
82
83     /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
84    /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) {
85        return ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : 
86          (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : 
87          (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4);
88     }*/
89
90     /* The length of a byte sequence, according to its packed form. */
91     private static int BOCU1_LENGTH_FROM_PACKED(int packed) {
92         return (((packed)&UConverterConstants.UNSIGNED_INT_MASK)<0x04000000 ? (packed)>>24 : 4);
93     }
94     
95     /*
96      * Byte value map for control codes,
97      * from external byte values 0x00..0x20
98      * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
99      * External byte values that are illegal as trail bytes are mapped to -1.
100      */
101     private static final int[]
102     bocu1ByteToTrail={
103     /*  0     1     2     3     4     5     6     7    */
104         -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
105
106     /*  8     9     a     b     c     d     e     f    */
107         -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
108
109     /*  10    11    12    13    14    15    16    17   */
110         0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
111
112     /*  18    19    1a    1b    1c    1d    1e    1f   */
113         0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
114
115     /*  20   */
116         -1
117     };
118
119     /*
120      * Byte value map for control codes,
121      * from trail byte values 0..19 (0..0x13) as used in the difference calculation
122      * to external byte values 0x00..0x20.
123      */
124     private static final int[] 
125     bocu1TrailToByte = {
126     /*  0     1     2     3     4     5     6     7    */
127         0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
128
129     /*  8     9     a     b     c     d     e     f    */
130         0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
131
132     /*  10    11    12    13   */
133         0x1c, 0x1d, 0x1e, 0x1f
134     };
135     
136     
137     /*
138      * 12 commonly used C0 control codes (and space) are only used to encode
139      * themselves directly,
140      * which makes BOCU-1 MIME-usable and reasonably safe for
141      * ASCII-oriented software.
142      *
143      * These controls are
144      *  0   NUL
145      *
146      *  7   BEL
147      *  8   BS
148      *
149      *  9   TAB
150      *  a   LF
151      *  b   VT
152      *  c   FF
153      *  d   CR
154      *
155      *  e   SO
156      *  f   SI
157      *
158      * 1a   SUB
159      * 1b   ESC
160      *
161      * The other 20 C0 controls are also encoded directly (to preserve order)
162      * but are also used as trail bytes in difference encoding
163      * (for better compression).
164      */
165     private static int BOCU1_TRAIL_TO_BYTE(int trail) {
166         return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]);
167     }    
168     
169     /* BOCU-1 implementation functions ------------------------------------------ */
170     private static int BOCU1_SIMPLE_PREV(int c){
171         return (((c)&~0x7f)+BOCU1_ASCII_PREV);
172     }
173
174     /**
175      * Compute the next "previous" value for differencing
176      * from the current code point.
177      *
178      * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
179      * @return "previous code point" state value
180      */
181     private static  int bocu1Prev(int c) {
182         /* compute new prev */
183         if(/* 0x3040<=c && */ c<=0x309f) {
184             /* Hiragana is not 128-aligned */
185             return 0x3070;
186         } else if(0x4e00<=c && c<=0x9fa5) {
187             /* CJK Unihan */
188             return 0x4e00-BOCU1_REACH_NEG_2;
189         } else if(0xac00<=c /* && c<=0xd7a3 */) {
190             /* Korean Hangul */
191             return (0xd7a3+0xac00)/2;
192         } else {
193             /* mostly small scripts */
194             return BOCU1_SIMPLE_PREV(c);
195         }
196     }
197
198     /** Fast version of bocu1Prev() for most scripts. */
199     private static int BOCU1_PREV(int c) {
200         return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c));
201     }
202     
203     protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};
204
205     /* Faster versions of packDiff() for single-byte-encoded diff values. */
206
207     /** Is a diff value encodable in a single byte? */
208     private static boolean DIFF_IS_SINGLE(int diff){
209         return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1);
210     }
211
212     /** Encode a diff value in a single byte. */
213     private static int PACK_SINGLE_DIFF(int diff){
214         return (BOCU1_MIDDLE+(diff));
215     }
216
217     /** Is a diff value encodable in two bytes? */
218     private static boolean DIFF_IS_DOUBLE(int diff){
219         return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2);
220     }   
221       
222     public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){
223         super(icuCanonicalName, javaCanonicalName, aliases);
224         maxBytesPerChar = 4; 
225         minBytesPerChar = 1;
226         maxCharsPerByte = 1;
227      }
228     
229     class CharsetEncoderBOCU extends CharsetEncoderICU {
230         public CharsetEncoderBOCU(CharsetICU cs) {
231             super(cs,fromUSubstitution);
232         }
233         
234         int sourceIndex, nextSourceIndex;
235         int prev, c , diff;
236         boolean checkNegative;
237         boolean LoopAfterTrail;
238         int targetCapacity;
239         CoderResult cr;        
240         
241         /* label values for supporting behavior similar to goto in C */
242         private static final int fastSingle=0;
243         private static final int getTrail=1;
244         private static final int regularLoop=2;
245         
246         private boolean LabelLoop; //used to break the while loop
247         private int labelType = fastSingle; //labeType is set to fastSingle to start the code from fastSingle:
248         
249         /**
250          * Integer division and modulo with negative numerators
251          * yields negative modulo results and quotients that are one more than
252          * what we need here.
253          * This macro adjust the results so that the modulo-value m is always >=0.
254          *
255          * For positive n, the if() condition is always FALSE.
256          *
257          * @param n Number to be split into quotient and rest.
258          *          Will be modified to contain the quotient.
259          * @param d Divisor.
260          * @param m Output variable for the rest (modulo result).
261          */
262         private int NEGDIVMOD(int n, int d, int m) {
263             diff = n;
264             (m)=(diff)%(d); 
265             (diff)/=(d); 
266             if((m)<0) { 
267                 --(diff);
268                 (m)+=(d);
269             }
270             return m;
271         }
272         
273         /**
274          * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
275          * and return a packed integer with them.
276          *
277          * The encoding favors small absolute differences with short encodings
278          * to compress runs of same-script characters.
279          *
280          * Optimized version with unrolled loops and fewer floating-point operations
281          * than the standard packDiff().
282          *
283          * @param diff difference value -0x10ffff..0x10ffff
284          * @return
285          *      0x010000zz for 1-byte sequence zz
286          *      0x0200yyzz for 2-byte sequence yy zz
287          *      0x03xxyyzz for 3-byte sequence xx yy zz
288          *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
289          */
290         private int packDiff(int n) {
291             int result, m = 0;
292             diff = n;
293
294             if(diff>=BOCU1_REACH_NEG_1) {
295                 /* mostly positive differences, and single-byte negative ones */
296                 if(diff<=BOCU1_REACH_POS_2) {
297                     /* two bytes */
298                     diff-=BOCU1_REACH_POS_1+1;
299                     result=0x02000000;
300
301                     m=diff%BOCU1_TRAIL_COUNT;
302                     diff/=BOCU1_TRAIL_COUNT;
303                     result|=BOCU1_TRAIL_TO_BYTE(m);
304
305                     result|=(BOCU1_START_POS_2+diff)<<8;
306                 } else if(diff<=BOCU1_REACH_POS_3) {
307                     /* three bytes */
308                     diff-=BOCU1_REACH_POS_2+1;
309                     result=0x03000000;
310
311                     m=diff%BOCU1_TRAIL_COUNT;
312                     diff/=BOCU1_TRAIL_COUNT;
313                     result|=BOCU1_TRAIL_TO_BYTE(m);
314
315                     m=diff%BOCU1_TRAIL_COUNT;
316                     diff/=BOCU1_TRAIL_COUNT;
317                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
318
319                     result|=(BOCU1_START_POS_3+diff)<<16;
320                 } else {
321                     /* four bytes */
322                     diff-=BOCU1_REACH_POS_3+1;
323
324                     m=diff%BOCU1_TRAIL_COUNT;
325                     diff/=BOCU1_TRAIL_COUNT;
326                     result=BOCU1_TRAIL_TO_BYTE(m);
327
328                     m=diff%BOCU1_TRAIL_COUNT;
329                     diff/=BOCU1_TRAIL_COUNT;
330                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
331
332                     /*
333                      * We know that / and % would deliver quotient 0 and rest=diff.
334                      * Avoid division and modulo for performance.
335                      */
336                     result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
337
338                     result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24;
339                 }
340             } else {
341                 /* two- to four-byte negative differences */
342                 if(diff>=BOCU1_REACH_NEG_2) {
343                     /* two bytes */
344                     diff-=BOCU1_REACH_NEG_1;
345                     result=0x02000000;
346
347                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
348                     result|=BOCU1_TRAIL_TO_BYTE(m);
349
350                     result|=(BOCU1_START_NEG_2+diff)<<8;
351                 } else if(diff>=BOCU1_REACH_NEG_3) {
352                     /* three bytes */
353                     diff-=BOCU1_REACH_NEG_2;
354                     result=0x03000000;
355
356                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
357                     result|=BOCU1_TRAIL_TO_BYTE(m);
358
359                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
361
362                     result|=(BOCU1_START_NEG_3+diff)<<16;
363                 } else {
364                     /* four bytes */
365                     diff-=BOCU1_REACH_NEG_3;
366
367                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
368                     result=BOCU1_TRAIL_TO_BYTE(m);
369
370                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
372
373                     /*
374                      * We know that NEGDIVMOD would deliver
375                      * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
376                      * Avoid division and modulo for performance.
377                      */
378                     m=diff+BOCU1_TRAIL_COUNT;
379                     result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
380
381                     result|=BOCU1_MIN<<24;
382                 }
383             }
384             return result;
385         }
386            
387         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
388             cr = CoderResult.UNDERFLOW;
389             
390             LabelLoop = true; //used to break the while loop
391             checkNegative = false; // its value is set to true to get out of while loop when c = -c
392             LoopAfterTrail = false; // its value is set to true to ignore code before getTrail:
393             
394             /*set up the local pointers*/
395             targetCapacity = target.limit() - target.position();
396             c = fromUChar32;
397             prev = fromUnicodeStatus;
398             
399             if(prev==0){
400                 prev = BOCU1_ASCII_PREV;
401             }
402             
403             /*sourceIndex ==-1 if the current characte began in the previous buffer*/
404             sourceIndex = c == 0 ? 0: -1;
405             nextSourceIndex = 0;
406             
407             /*conversion loop*/
408             if(c!=0 && targetCapacity>0){
409                 labelType = getTrail;
410             }
411             
412             while(LabelLoop){
413                 switch(labelType){
414                     case fastSingle:
415                         labelType = fastSingle(source, target, offsets);
416                         break;
417                     case getTrail:
418                         labelType = getTrail(source, target, offsets);
419                         break;
420                     case regularLoop:
421                         labelType = regularLoop(source, target, offsets);
422                         break;
423                 }
424             }
425                     
426             return cr;
427         }
428         
429         private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){                     
430 //fastSingle:        
431             /*fast loop for single-byte differences*/
432             /*use only one loop counter variable , targetCapacity, not also source*/
433             diff = source.limit() - source.position();
434             if(targetCapacity>diff){
435                 targetCapacity = diff;
436             }
437             while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){
438                 if(c<=0x20){
439                     if(c!=0x20){
440                         prev = BOCU1_ASCII_PREV;
441                     }
442                     target.put((byte)c);
443                     if(offsets!=null){
444                         offsets.put(nextSourceIndex++);
445                     }
446                     source.position(source.position()+1);
447                     --targetCapacity;
448                 }else {
449                     diff = c-prev;
450                     if(DIFF_IS_SINGLE(diff)){
451                         prev = BOCU1_SIMPLE_PREV(c);
452                         target.put((byte)PACK_SINGLE_DIFF(diff));
453                         if(offsets!=null){
454                             offsets.put(nextSourceIndex++);
455                         }
456                         source.position(source.position()+1);
457                         --targetCapacity;
458                     }else {
459                         break;
460                     }
461                 }
462             }
463             return regularLoop;
464         }
465         
466         private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
467             if(source.hasRemaining()){
468                 /*test the following code unit*/
469                 char trail = source.get(source.position());
470                 if(UTF16.isTrailSurrogate(trail)){
471                     source.position(source.position()+1);
472                     ++nextSourceIndex;
473                     c=UCharacter.getCodePoint((char)c, trail);
474                 }
475             } else {
476                 /*no more input*/
477                 c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/
478                 checkNegative = true;
479             }
480             LoopAfterTrail = true;
481             return regularLoop;
482         }
483
484         @SuppressWarnings("fallthrough")
485         private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
486             if(!LoopAfterTrail){
487                 /*restore real values*/
488                 targetCapacity = target.limit()-target.position();
489                 sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/
490             }
491             /*regular loop for all classes*/
492             while(LoopAfterTrail || source.hasRemaining()){
493                 if(LoopAfterTrail || targetCapacity>0){
494                     
495                     if(!LoopAfterTrail){
496                         c = source.get();
497                         ++nextSourceIndex;
498                         
499                         if(c<=0x20){
500                             /*
501                              * ISO C0 control & space:
502                              * Encode directly for MIME compatibility,
503                              * and reset state except for space, to not disrupt compression.
504                              */
505                             if(c!=0x20) {
506                                 prev=BOCU1_ASCII_PREV;
507                             }
508                             target.put((byte)c);
509                             if(offsets != null){
510                                 offsets.put(sourceIndex++);
511                             }
512                             --targetCapacity;
513                          
514                             sourceIndex=nextSourceIndex;
515                             continue;
516                         }
517                         
518                         if(UTF16.isLeadSurrogate((char)c)){
519                             getTrail(source, target, offsets);
520                             if(checkNegative){
521                                 break;
522                             }
523                         }
524                     }
525                         
526                     if(LoopAfterTrail){
527                         LoopAfterTrail = false; 
528                     }
529                     
530                     /*
531                      * all other Unicode code points c==U+0021..U+10ffff
532                      * are encoded with the difference c-prev
533                      *
534                      * a new prev is computed from c,
535                      * placed in the middle of a 0x80-block (for most small scripts) or
536                      * in the middle of the Unihan and Hangul blocks
537                      * to statistically minimize the following difference
538                      */
539                     diff = c- prev;
540                     prev = BOCU1_PREV(c);
541                     if(DIFF_IS_SINGLE(diff)){
542                         target.put((byte)PACK_SINGLE_DIFF(diff));
543                         if(offsets!=null){
544                             offsets.put(sourceIndex++);
545                         }
546                         --targetCapacity;
547                         sourceIndex=nextSourceIndex;
548                         if(c<0x3000){
549                             labelType = fastSingle;
550                             return labelType;
551                         }
552                     } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){
553                         /*optimize 2 byte case*/
554                         int m = 0;
555                         if(diff>=0){
556                             diff -= BOCU1_REACH_POS_1 +1;
557                             m = diff%BOCU1_TRAIL_COUNT;
558                             diff/=BOCU1_TRAIL_COUNT;
559                             diff+=BOCU1_START_POS_2;
560                         } else {
561                             diff -= BOCU1_REACH_NEG_1;
562                             m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
563                             diff+=BOCU1_START_NEG_2;
564                         }
565                         target.put((byte)diff);
566                         target.put((byte)BOCU1_TRAIL_TO_BYTE(m));
567                         if(offsets!=null){
568                             offsets.put(sourceIndex);
569                             offsets.put(sourceIndex);
570                         }
571                         targetCapacity -= 2;
572                         sourceIndex = nextSourceIndex;
573                     } else {
574                         int length; /*will be 2..4*/
575                         diff = packDiff(diff);
576                         length = BOCU1_LENGTH_FROM_PACKED(diff);
577                         
578                         /*write the output character bytes from diff and length*/
579                         /*from the first if in the loop we know that targetCapacity>0*/
580                         if(length<=targetCapacity){
581                             switch(length){
582                                 /*each branch falls through the next one*/
583                                 case 4:
584                                     target.put((byte)(diff>>24));
585                                     if(offsets!= null){
586                                         offsets.put(sourceIndex);
587                                     }
588                                 case 3:
589                                     target.put((byte)(diff>>16));
590                                     if(offsets!= null){
591                                         offsets.put(sourceIndex);
592                                     }
593                                 case 2:
594                                     target.put((byte)(diff>>8));
595                                     if(offsets!= null){
596                                         offsets.put(sourceIndex);
597                                     }
598                                     /*case 1 handled above*/
599                                     target.put((byte)diff);
600                                     if(offsets!= null){
601                                         offsets.put(sourceIndex);
602                                     }
603                                 default:
604                                     /*will never occur*/
605                                     break;
606                             }
607                             targetCapacity -= length;
608                             sourceIndex = nextSourceIndex;
609                         } else {
610                             ByteBuffer error = ByteBuffer.wrap(errorBuffer);
611                             /*
612                              * We actually do this backwards here:
613                              * In order to save an intermediate variable, we output
614                              * first to the overflow buffer what does not fit into the
615                              * regular target.
616                              */
617                             /* we know that 1<=targetCapacity<length<=4 */
618                             length-=targetCapacity;
619                             switch(length) {
620                                 /* each branch falls through to the next one */
621                             case 3:
622                                 error.put((byte)(diff>>16));
623                             case 2:
624                                 error.put((byte)(diff>>8));
625                             case 1:
626                                 error.put((byte)diff);
627                             default:
628                                 /* will never occur */
629                                 break;
630                             }
631                             errorBufferLength = length;
632                             
633                             /* now output what fits into the regular target */
634                             diff>>=8*length; /* length was reduced by targetCapacity */
635                             switch(targetCapacity) {
636                                 /* each branch falls through to the next one */
637                             case 3:
638                                 target.put((byte)(diff>>16));
639                                 if(offsets!= null){
640                                     offsets.put(sourceIndex);
641                                 }
642                             case 2:
643                                 target.put((byte)(diff>>8));
644                                 if(offsets!= null){
645                                     offsets.put(sourceIndex);
646                                 }
647                             case 1:
648                                 target.put((byte)diff);
649                                 if(offsets!= null){
650                                     offsets.put(sourceIndex);
651                                 }
652                             default:
653                                 /* will never occur */
654                                 break;
655                             }
656
657                             /* target overflow */
658                             targetCapacity=0;
659                             cr = CoderResult.OVERFLOW;
660                             break;
661                         }
662                     }
663                 } else{
664                     /*target is full*/
665                     cr = CoderResult.OVERFLOW;
666                     break;
667                 }
668                    
669             }
670             /*set the converter state back into UConverter*/
671             fromUChar32 = c<0 ? -c :0;
672             fromUnicodeStatus = prev;
673             LabelLoop = false;
674             labelType = fastSingle;
675             return labelType;
676         }
677        
678     }
679     
680     class CharsetDecoderBOCU extends CharsetDecoderICU{
681         public CharsetDecoderBOCU(CharsetICU cs) {
682             super(cs);
683         }
684         
685         int byteIndex;
686         int sourceIndex, nextSourceIndex;
687         int prev, c , diff, count;
688         byte[] bytes;
689         int targetCapacity;
690         CoderResult cr;
691         
692         /* label values for supporting behavior similar to goto in C */
693         private static final int fastSingle=0;
694         private static final int getTrail=1;
695         private static final int regularLoop=2;
696         private static final int endLoop=3;
697         
698         private boolean LabelLoop;//used to break the while loop
699         private boolean afterTrail; // its value is set to true to ignore code after getTrail:
700         private int labelType;
701         /*
702          * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
703          * The UConverter fields are used as follows:
704          *
705          * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
706          *
707          * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
708          * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
709          */
710
711         /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
712
713         
714         
715         /**
716          * Function for BOCU-1 decoder; handles multi-byte lead bytes.
717          *
718          * @param b lead byte;
719          *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
720          * @return (diff<<2)|count
721          */
722         private int decodeBocu1LeadByte(int b) {
723             int diffValue, countValue;
724
725             if(b >= BOCU1_START_NEG_2) {
726                 /* positive difference */
727                 if(b < BOCU1_START_POS_3) {
728                     /* two bytes */
729                     diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1;
730                     countValue = 1;
731                 } else if(b < BOCU1_START_POS_4) {
732                     /* three bytes */
733                     diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
734                     countValue = 2;
735                 } else {
736                     /* four bytes */
737                     diffValue = BOCU1_REACH_POS_3+1;
738                     countValue = 3;
739                 }
740             } else {
741                 /* negative difference */
742                 if(b >= BOCU1_START_NEG_3) {
743                     /* two bytes */
744                     diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
745                     countValue=1;
746                 } else if(b>BOCU1_MIN) {
747                     /* three bytes */
748                     diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;
749                     countValue = 2;
750                 } else {
751                     /* four bytes */
752                     diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
753                     countValue=3;
754                 }
755             }
756
757             /* return the state for decoding the trail byte(s) */
758             return (diffValue<<2)|countValue;
759         }
760         
761         /**
762          * Function for BOCU-1 decoder; handles multi-byte trail bytes.
763          *
764          * @param count number of remaining trail bytes including this one
765          * @param b trail byte
766          * @return new delta for diff including b - <0 indicates an error
767          *
768          * @see decodeBocu1
769          */
770         private int decodeBocu1TrailByte(int countValue, int b) {
771             b = b&UConverterConstants.UNSIGNED_BYTE_MASK;
772             if((b)<=0x20) {
773                 /* skip some C0 controls and make the trail byte range contiguous */
774                 b = bocu1ByteToTrail[b];
775                 /* b<0 for an illegal trail byte value will result in return<0 below */
776             } else {
777                 //b-= BOCU1_TRAIL_BYTE_OFFSET;
778                 b = b - BOCU1_TRAIL_BYTE_OFFSET;
779             }
780
781             /* add trail byte into difference and decrement count */
782             if(countValue==1) {
783                 return b;
784             } else if(countValue==2) {
785                 return b*BOCU1_TRAIL_COUNT;
786             } else /* count==3 */ {
787                 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
788             }
789         }
790         
791         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
792                 boolean flush){
793             cr = CoderResult.UNDERFLOW;
794             
795             LabelLoop = true; 
796             afterTrail = false; 
797             labelType = fastSingle; // labelType is set to fastSingle so t
798             
799             /*get the converter state*/
800             prev = toUnicodeStatus;
801             
802             if(prev==0){
803                 prev = BOCU1_ASCII_PREV;
804             }
805             diff = mode;
806             count = diff&3;
807             diff>>=2;
808             
809             byteIndex = toULength;
810             bytes = toUBytesArray;
811             
812             /* sourceIndex=-1 if the current character began in the previous buffer */
813             sourceIndex=byteIndex==0 ? 0 : -1;
814             nextSourceIndex=0;
815             
816             /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
817             if(count>0 && byteIndex>0 && target.position()<target.limit()) {
818                 labelType = getTrail;
819             }
820             
821             while(LabelLoop){
822                 switch(labelType){
823                     case fastSingle:
824                         labelType = fastSingle(source, target, offsets);
825                         break;
826                     case getTrail:
827                         labelType = getTrail(source, target, offsets);
828                         break;
829                     case regularLoop:
830                         labelType = afterGetTrail(source, target, offsets);
831                         break;
832                     case endLoop:
833                         endLoop(source, target, offsets);
834                         break;
835                 }
836             }
837             
838             return cr;
839         }
840         
841         private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets){
842             labelType = regularLoop;
843             /* fast loop for single-byte differences */
844             /* use count as the only loop counter variable */
845             diff = source.limit() - source.position();
846             count = target.limit()-target.position();
847             if(count>diff) {
848                 count = diff;
849             }
850             while(count>0) {
851                 if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) {
852                     c = prev + (c-BOCU1_MIDDLE);
853                     if(c<0x3000) {
854                         target.put((char)c);
855                         if(offsets!=null){
856                             offsets.put(nextSourceIndex++);
857                         } 
858                         prev = BOCU1_SIMPLE_PREV(c);
859                     } else {
860                         break;
861                     }
862                 } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) {
863                     if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) {
864                         prev = BOCU1_ASCII_PREV;
865                     }
866                     target.put((char)c);
867                     if(offsets!=null){
868                         offsets.put(nextSourceIndex++);
869                     } 
870                 } else {
871                     break;
872                 }
873                 source.position(source.position()+1);
874                 --count;
875             }
876             sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
877             return labelType;
878         }
879         
880         private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
881             labelType = regularLoop;
882             for(;;) {
883                 if(source.position() >= source.limit()) {
884                     labelType = endLoop;
885                     return labelType;
886                 }
887                 ++nextSourceIndex;
888                 c = bytes[byteIndex++] = source.get();
889
890                 /* trail byte in any position */
891                 c = decodeBocu1TrailByte(count, c);
892                 if(c<0) {
893                     cr = CoderResult.malformedForLength(1);
894                     labelType = endLoop;
895                     return labelType;
896                 }
897
898                 diff+=c;
899                 if(--count==0) {
900                     /* final trail byte, deliver a code point */
901                     byteIndex=0;
902                     c = prev + diff;
903                     if(c > 0x10ffff) {
904                         cr = CoderResult.malformedForLength(1);
905                         labelType = endLoop;
906                         return labelType;
907                     }
908                     break;
909                 }
910             }
911             afterTrail = true;
912             return labelType;
913             
914         }
915         
916         private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
917             /* decode a sequence of single and lead bytes */
918             while(afterTrail || source.hasRemaining()) {
919                 if(!afterTrail){
920                     if(target.position() >= target.limit()) {
921                         /* target is full */
922                         cr = CoderResult.OVERFLOW;
923                         break;
924                     }
925
926                     ++nextSourceIndex;
927                     c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK;
928                     if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) {
929                         /* Write a code point directly from a single-byte difference. */
930                         c = prev + (c-BOCU1_MIDDLE);
931                         if(c<0x3000) {
932                             target.put((char)c);
933                             if(offsets!=null){
934                                 offsets.put(sourceIndex);
935                             }
936                             prev = BOCU1_SIMPLE_PREV(c);
937                             sourceIndex = nextSourceIndex;
938                             labelType = fastSingle;
939                             return labelType;
940                         }
941                     } else if(c <= 0x20) {
942                         /*
943                          * Direct-encoded C0 control code or space.
944                          * Reset prev for C0 control codes but not for space.
945                          */
946                         if(c != 0x20) {
947                             prev=BOCU1_ASCII_PREV;
948                         }
949                         target.put((char)c);
950                         if(offsets!=null){
951                             offsets.put(sourceIndex);
952                         }
953                         sourceIndex=nextSourceIndex;
954                         continue;
955                     } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) {
956                         /* Optimize two-byte case. */
957                         if(c >= BOCU1_MIDDLE) {
958                             diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
959                         } else {
960                             diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
961                         }
962
963                         /* trail byte */
964                         ++nextSourceIndex;
965                         c = decodeBocu1TrailByte(1, source.get());
966                         if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) {
967                             bytes[0]= source.get(source.position()-2);
968                             bytes[1]= source.get(source.position()-1);
969                             byteIndex = 2;
970                             cr = CoderResult.malformedForLength(2);
971                             break;
972                         }
973                     } else if(c == BOCU1_RESET) {
974                         /* only reset the state, no code point */
975                         prev=BOCU1_ASCII_PREV;
976                         sourceIndex=nextSourceIndex;
977                         continue;
978                     } else {
979                         /*
980                          * For multi-byte difference lead bytes, set the decoder state
981                          * with the partial difference value from the lead byte and
982                          * with the number of trail bytes.
983                          */
984                         bytes[0]= (byte)c;
985                         byteIndex = 1;
986
987                         diff = decodeBocu1LeadByte(c);
988                         count = diff&3;
989                         diff>>=2;
990                         getTrail(source, target, offsets);
991                         if(labelType != regularLoop){
992                             return labelType;
993                         }
994                     }
995                 }
996                 
997                 if(afterTrail){
998                     afterTrail = false;
999                 }
1000                 
1001                 /* calculate the next prev and output c */
1002                 prev = BOCU1_PREV(c);
1003                 if(c<=0xffff) {
1004                     target.put((char)c);
1005                     if(offsets!=null){
1006                         offsets.put(sourceIndex);
1007                     }
1008                 } else {
1009                     /* output surrogate pair */
1010                     target.put(UTF16.getLeadSurrogate(c));
1011                     if(target.hasRemaining()) {
1012                         target.put(UTF16.getTrailSurrogate(c));
1013                         if(offsets!=null){
1014                             offsets.put(sourceIndex);
1015                             offsets.put(sourceIndex);
1016                         }
1017                     } else {
1018                         /* target overflow */
1019                         if(offsets!=null){
1020                             offsets.put(sourceIndex);
1021                         }
1022                         charErrorBufferArray[0] = UTF16.getTrailSurrogate(c);
1023                         charErrorBufferLength = 1;
1024                         cr = CoderResult.OVERFLOW;
1025                         break;
1026                 }
1027             }
1028             sourceIndex=nextSourceIndex;
1029           }
1030           labelType = endLoop;
1031           return labelType;
1032         }
1033         
1034         private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
1035             if(cr.isMalformed()) {
1036                 /* set the converter state in UConverter to deal with the next character */
1037                 toUnicodeStatus = BOCU1_ASCII_PREV;
1038                 mode = 0;
1039             } else {
1040                 /* set the converter state back into UConverter */
1041                 toUnicodeStatus=prev;
1042                 mode=(diff<<2)|count;
1043             }
1044             toULength=byteIndex;
1045             LabelLoop = false;
1046         }
1047     
1048     }
1049     
1050     
1051     public CharsetDecoder newDecoder() {
1052         return new CharsetDecoderBOCU(this);
1053     }
1054
1055     public CharsetEncoder newEncoder() {
1056         return new CharsetEncoderBOCU(this);
1057     }
1058     
1059     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
1060         CharsetICU.getCompleteUnicodeSet(setFillIn);
1061     }
1062
1063 }