]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetBOCU1.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / charset / src / com / ibm / icu / charset / CharsetBOCU1.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 2008-2010, International Business Machines Corporation and         *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 package com.ibm.icu.charset;\r
8 \r
9 import java.nio.ByteBuffer;\r
10 import java.nio.CharBuffer;\r
11 import java.nio.IntBuffer;\r
12 import java.nio.charset.CharsetDecoder;\r
13 import java.nio.charset.CharsetEncoder;\r
14 import java.nio.charset.CoderResult;\r
15 \r
16 import com.ibm.icu.lang.UCharacter;\r
17 import com.ibm.icu.text.UTF16;\r
18 import com.ibm.icu.text.UnicodeSet;\r
19 \r
20 /**\r
21  * @author krajwade\r
22  *\r
23  */\r
24 class CharsetBOCU1 extends CharsetICU {   \r
25     /* BOCU constants and macros */\r
26     \r
27     /* initial value for "prev": middle of the ASCII range */\r
28     private static final byte BOCU1_ASCII_PREV = 0x40;\r
29     \r
30     /* bounding byte values for differences */\r
31     private static final int BOCU1_MIN = 0x21;\r
32     private static final int BOCU1_MIDDLE = 0x90;\r
33     //private static final int BOCU1_MAX_LEAD = 0xfe;\r
34     private static final int BOCU1_MAX_TRAIL = 0xff;\r
35     private static final int BOCU1_RESET = 0xff;\r
36 \r
37     /* number of lead bytes */\r
38     //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1);\r
39 \r
40     /* adjust trail byte counts for the use of some C0 control byte values */\r
41     private static final int BOCU1_TRAIL_CONTROLS_COUNT =  20;\r
42     private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT);\r
43 \r
44     /* number of trail bytes */\r
45     private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT);\r
46     \r
47     /*\r
48      * number of positive and negative single-byte codes\r
49      * (counting 0==BOCU1_MIDDLE among the positive ones)\r
50      */\r
51     private static final int BOCU1_SINGLE = 64;\r
52 \r
53     /* number of lead bytes for positive and negative 2/3/4-byte sequences */\r
54     private static final int BOCU1_LEAD_2 = 43;\r
55     private static final int BOCU1_LEAD_3 = 3;\r
56     //private static final int BOCU1_LEAD_4 = 1;\r
57 \r
58     /* The difference value range for single-byters. */\r
59     private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1);\r
60     private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE);\r
61 \r
62     /* The difference value range for double-byters. */\r
63     private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);\r
64     private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);\r
65 \r
66     /* The difference value range for 3-byters. */\r
67     private static final int BOCU1_REACH_POS_3  =\r
68         (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);\r
69 \r
70     private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);\r
71 \r
72     /* The lead byte start values. */\r
73     private static final int BOCU1_START_POS_2 =  (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1);\r
74     private static final int BOCU1_START_POS_3  = (BOCU1_START_POS_2+BOCU1_LEAD_2);\r
75     private static final int BOCU1_START_POS_4  = (BOCU1_START_POS_3+BOCU1_LEAD_3);\r
76          /* ==BOCU1_MAX_LEAD */\r
77 \r
78     private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1);\r
79     private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2);\r
80     //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3);\r
81          /* ==BOCU1_MIN+1 */\r
82 \r
83     /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */\r
84    /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) {\r
85        return ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \r
86          (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \r
87          (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4);\r
88     }*/\r
89 \r
90     /* The length of a byte sequence, according to its packed form. */\r
91     private static int BOCU1_LENGTH_FROM_PACKED(int packed) {\r
92         return (((packed)&UConverterConstants.UNSIGNED_INT_MASK)<0x04000000 ? (packed)>>24 : 4);\r
93     }\r
94     \r
95     /*\r
96      * Byte value map for control codes,\r
97      * from external byte values 0x00..0x20\r
98      * to trail byte values 0..19 (0..0x13) as used in the difference calculation.\r
99      * External byte values that are illegal as trail bytes are mapped to -1.\r
100      */\r
101     private static final int[]\r
102     bocu1ByteToTrail={\r
103     /*  0     1     2     3     4     5     6     7    */\r
104         -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,\r
105 \r
106     /*  8     9     a     b     c     d     e     f    */\r
107         -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,\r
108 \r
109     /*  10    11    12    13    14    15    16    17   */\r
110         0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,\r
111 \r
112     /*  18    19    1a    1b    1c    1d    1e    1f   */\r
113         0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,\r
114 \r
115     /*  20   */\r
116         -1\r
117     };\r
118 \r
119     /*\r
120      * Byte value map for control codes,\r
121      * from trail byte values 0..19 (0..0x13) as used in the difference calculation\r
122      * to external byte values 0x00..0x20.\r
123      */\r
124     private static final int[] \r
125     bocu1TrailToByte = {\r
126     /*  0     1     2     3     4     5     6     7    */\r
127         0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,\r
128 \r
129     /*  8     9     a     b     c     d     e     f    */\r
130         0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,\r
131 \r
132     /*  10    11    12    13   */\r
133         0x1c, 0x1d, 0x1e, 0x1f\r
134     };\r
135     \r
136     \r
137     /*\r
138      * 12 commonly used C0 control codes (and space) are only used to encode\r
139      * themselves directly,\r
140      * which makes BOCU-1 MIME-usable and reasonably safe for\r
141      * ASCII-oriented software.\r
142      *\r
143      * These controls are\r
144      *  0   NUL\r
145      *\r
146      *  7   BEL\r
147      *  8   BS\r
148      *\r
149      *  9   TAB\r
150      *  a   LF\r
151      *  b   VT\r
152      *  c   FF\r
153      *  d   CR\r
154      *\r
155      *  e   SO\r
156      *  f   SI\r
157      *\r
158      * 1a   SUB\r
159      * 1b   ESC\r
160      *\r
161      * The other 20 C0 controls are also encoded directly (to preserve order)\r
162      * but are also used as trail bytes in difference encoding\r
163      * (for better compression).\r
164      */\r
165     private static int BOCU1_TRAIL_TO_BYTE(int trail) {\r
166         return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]);\r
167     }    \r
168     \r
169     /* BOCU-1 implementation functions ------------------------------------------ */\r
170     private static int BOCU1_SIMPLE_PREV(int c){\r
171         return (((c)&~0x7f)+BOCU1_ASCII_PREV);\r
172     }\r
173 \r
174     /**\r
175      * Compute the next "previous" value for differencing\r
176      * from the current code point.\r
177      *\r
178      * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)\r
179      * @return "previous code point" state value\r
180      */\r
181     private static  int bocu1Prev(int c) {\r
182         /* compute new prev */\r
183         if(/* 0x3040<=c && */ c<=0x309f) {\r
184             /* Hiragana is not 128-aligned */\r
185             return 0x3070;\r
186         } else if(0x4e00<=c && c<=0x9fa5) {\r
187             /* CJK Unihan */\r
188             return 0x4e00-BOCU1_REACH_NEG_2;\r
189         } else if(0xac00<=c /* && c<=0xd7a3 */) {\r
190             /* Korean Hangul */\r
191             return (0xd7a3+0xac00)/2;\r
192         } else {\r
193             /* mostly small scripts */\r
194             return BOCU1_SIMPLE_PREV(c);\r
195         }\r
196     }\r
197 \r
198     /** Fast version of bocu1Prev() for most scripts. */\r
199     private static int BOCU1_PREV(int c) {\r
200         return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c));\r
201     }\r
202     \r
203     protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};\r
204 \r
205     /* Faster versions of packDiff() for single-byte-encoded diff values. */\r
206 \r
207     /** Is a diff value encodable in a single byte? */\r
208     private static boolean DIFF_IS_SINGLE(int diff){\r
209         return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1);\r
210     }\r
211 \r
212     /** Encode a diff value in a single byte. */\r
213     private static int PACK_SINGLE_DIFF(int diff){\r
214         return (BOCU1_MIDDLE+(diff));\r
215     }\r
216 \r
217     /** Is a diff value encodable in two bytes? */\r
218     private static boolean DIFF_IS_DOUBLE(int diff){\r
219         return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2);\r
220     }   \r
221       \r
222     public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){\r
223         super(icuCanonicalName, javaCanonicalName, aliases);\r
224         maxBytesPerChar = 4; \r
225         minBytesPerChar = 1;\r
226         maxCharsPerByte = 1;\r
227      }\r
228     \r
229     class CharsetEncoderBOCU extends CharsetEncoderICU {\r
230         public CharsetEncoderBOCU(CharsetICU cs) {\r
231             super(cs,fromUSubstitution);\r
232         }\r
233         \r
234         int sourceIndex, nextSourceIndex;\r
235         int prev, c , diff;\r
236         boolean checkNegative;\r
237         boolean LoopAfterTrail;\r
238         int targetCapacity;\r
239         CoderResult cr;        \r
240         \r
241         /* label values for supporting behavior similar to goto in C */\r
242         private static final int fastSingle=0;\r
243         private static final int getTrail=1;\r
244         private static final int regularLoop=2;\r
245         \r
246         private boolean LabelLoop; //used to break the while loop\r
247         private int labelType = fastSingle; //labeType is set to fastSingle to start the code from fastSingle:\r
248         \r
249         /**\r
250          * Integer division and modulo with negative numerators\r
251          * yields negative modulo results and quotients that are one more than\r
252          * what we need here.\r
253          * This macro adjust the results so that the modulo-value m is always >=0.\r
254          *\r
255          * For positive n, the if() condition is always FALSE.\r
256          *\r
257          * @param n Number to be split into quotient and rest.\r
258          *          Will be modified to contain the quotient.\r
259          * @param d Divisor.\r
260          * @param m Output variable for the rest (modulo result).\r
261          */\r
262         private int NEGDIVMOD(int n, int d, int m) {\r
263             diff = n;\r
264             (m)=(diff)%(d); \r
265             (diff)/=(d); \r
266             if((m)<0) { \r
267                 --(diff);\r
268                 (m)+=(d);\r
269             }\r
270             return m;\r
271         }\r
272         \r
273         /**\r
274          * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes\r
275          * and return a packed integer with them.\r
276          *\r
277          * The encoding favors small absolute differences with short encodings\r
278          * to compress runs of same-script characters.\r
279          *\r
280          * Optimized version with unrolled loops and fewer floating-point operations\r
281          * than the standard packDiff().\r
282          *\r
283          * @param diff difference value -0x10ffff..0x10ffff\r
284          * @return\r
285          *      0x010000zz for 1-byte sequence zz\r
286          *      0x0200yyzz for 2-byte sequence yy zz\r
287          *      0x03xxyyzz for 3-byte sequence xx yy zz\r
288          *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)\r
289          */\r
290         private int packDiff(int n) {\r
291             int result, m = 0;\r
292             diff = n;\r
293 \r
294             if(diff>=BOCU1_REACH_NEG_1) {\r
295                 /* mostly positive differences, and single-byte negative ones */\r
296                 if(diff<=BOCU1_REACH_POS_2) {\r
297                     /* two bytes */\r
298                     diff-=BOCU1_REACH_POS_1+1;\r
299                     result=0x02000000;\r
300 \r
301                     m=diff%BOCU1_TRAIL_COUNT;\r
302                     diff/=BOCU1_TRAIL_COUNT;\r
303                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
304 \r
305                     result|=(BOCU1_START_POS_2+diff)<<8;\r
306                 } else if(diff<=BOCU1_REACH_POS_3) {\r
307                     /* three bytes */\r
308                     diff-=BOCU1_REACH_POS_2+1;\r
309                     result=0x03000000;\r
310 \r
311                     m=diff%BOCU1_TRAIL_COUNT;\r
312                     diff/=BOCU1_TRAIL_COUNT;\r
313                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
314 \r
315                     m=diff%BOCU1_TRAIL_COUNT;\r
316                     diff/=BOCU1_TRAIL_COUNT;\r
317                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
318 \r
319                     result|=(BOCU1_START_POS_3+diff)<<16;\r
320                 } else {\r
321                     /* four bytes */\r
322                     diff-=BOCU1_REACH_POS_3+1;\r
323 \r
324                     m=diff%BOCU1_TRAIL_COUNT;\r
325                     diff/=BOCU1_TRAIL_COUNT;\r
326                     result=BOCU1_TRAIL_TO_BYTE(m);\r
327 \r
328                     m=diff%BOCU1_TRAIL_COUNT;\r
329                     diff/=BOCU1_TRAIL_COUNT;\r
330                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
331 \r
332                     /*\r
333                      * We know that / and % would deliver quotient 0 and rest=diff.\r
334                      * Avoid division and modulo for performance.\r
335                      */\r
336                     result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;\r
337 \r
338                     result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24;\r
339                 }\r
340             } else {\r
341                 /* two- to four-byte negative differences */\r
342                 if(diff>=BOCU1_REACH_NEG_2) {\r
343                     /* two bytes */\r
344                     diff-=BOCU1_REACH_NEG_1;\r
345                     result=0x02000000;\r
346 \r
347                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
348                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
349 \r
350                     result|=(BOCU1_START_NEG_2+diff)<<8;\r
351                 } else if(diff>=BOCU1_REACH_NEG_3) {\r
352                     /* three bytes */\r
353                     diff-=BOCU1_REACH_NEG_2;\r
354                     result=0x03000000;\r
355 \r
356                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
357                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
358 \r
359                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
360                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
361 \r
362                     result|=(BOCU1_START_NEG_3+diff)<<16;\r
363                 } else {\r
364                     /* four bytes */\r
365                     diff-=BOCU1_REACH_NEG_3;\r
366 \r
367                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
368                     result=BOCU1_TRAIL_TO_BYTE(m);\r
369 \r
370                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
371                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
372 \r
373                     /*\r
374                      * We know that NEGDIVMOD would deliver\r
375                      * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.\r
376                      * Avoid division and modulo for performance.\r
377                      */\r
378                     m=diff+BOCU1_TRAIL_COUNT;\r
379                     result|=BOCU1_TRAIL_TO_BYTE(m)<<16;\r
380 \r
381                     result|=BOCU1_MIN<<24;\r
382                 }\r
383             }\r
384             return result;\r
385         }\r
386            \r
387         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){\r
388             cr = CoderResult.UNDERFLOW;\r
389             \r
390             LabelLoop = true; //used to break the while loop\r
391             checkNegative = false; // its value is set to true to get out of while loop when c = -c\r
392             LoopAfterTrail = false; // its value is set to true to ignore code before getTrail:\r
393             \r
394             /*set up the local pointers*/\r
395             targetCapacity = target.limit() - target.position();\r
396             c = fromUChar32;\r
397             prev = fromUnicodeStatus;\r
398             \r
399             if(prev==0){\r
400                 prev = BOCU1_ASCII_PREV;\r
401             }\r
402             \r
403             /*sourceIndex ==-1 if the current characte began in the previous buffer*/\r
404             sourceIndex = c == 0 ? 0: -1;\r
405             nextSourceIndex = 0;\r
406             \r
407             /*conversion loop*/\r
408             if(c!=0 && targetCapacity>0){\r
409                 labelType = getTrail;\r
410             }\r
411             \r
412             while(LabelLoop){\r
413                 switch(labelType){\r
414                     case fastSingle:\r
415                         labelType = fastSingle(source, target, offsets);\r
416                         break;\r
417                     case getTrail:\r
418                         labelType = getTrail(source, target, offsets);\r
419                         break;\r
420                     case regularLoop:\r
421                         labelType = regularLoop(source, target, offsets);\r
422                         break;\r
423                 }\r
424             }\r
425                     \r
426             return cr;\r
427         }\r
428         \r
429         private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){                     \r
430 //fastSingle:        \r
431             /*fast loop for single-byte differences*/\r
432             /*use only one loop counter variable , targetCapacity, not also source*/\r
433             diff = source.limit() - source.position();\r
434             if(targetCapacity>diff){\r
435                 targetCapacity = diff;\r
436             }\r
437             while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){\r
438                 if(c<=0x20){\r
439                     if(c!=0x20){\r
440                         prev = BOCU1_ASCII_PREV;\r
441                     }\r
442                     target.put((byte)c);\r
443                     if(offsets!=null){\r
444                         offsets.put(nextSourceIndex++);\r
445                     }\r
446                     source.position(source.position()+1);\r
447                     --targetCapacity;\r
448                 }else {\r
449                     diff = c-prev;\r
450                     if(DIFF_IS_SINGLE(diff)){\r
451                         prev = BOCU1_SIMPLE_PREV(c);\r
452                         target.put((byte)PACK_SINGLE_DIFF(diff));\r
453                         if(offsets!=null){\r
454                             offsets.put(nextSourceIndex++);\r
455                         }\r
456                         source.position(source.position()+1);\r
457                         --targetCapacity;\r
458                     }else {\r
459                         break;\r
460                     }\r
461                 }\r
462             }\r
463             return regularLoop;\r
464         }\r
465         \r
466         private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){\r
467             if(source.hasRemaining()){\r
468                 /*test the following code unit*/\r
469                 char trail = source.get(source.position());\r
470                 if(UTF16.isTrailSurrogate(trail)){\r
471                     source.position(source.position()+1);\r
472                     ++nextSourceIndex;\r
473                     c=UCharacter.getCodePoint((char)c, trail);\r
474                 }\r
475             } else {\r
476                 /*no more input*/\r
477                 c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/\r
478                 checkNegative = true;\r
479             }\r
480             LoopAfterTrail = true;\r
481             return regularLoop;\r
482         }\r
483 \r
484         @SuppressWarnings("fallthrough")\r
485         private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){\r
486             if(!LoopAfterTrail){\r
487                 /*restore real values*/\r
488                 targetCapacity = target.limit()-target.position();\r
489                 sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/\r
490             }\r
491             /*regular loop for all classes*/\r
492             while(LoopAfterTrail || source.hasRemaining()){\r
493                 if(LoopAfterTrail || targetCapacity>0){\r
494                     \r
495                     if(!LoopAfterTrail){\r
496                         c = source.get();\r
497                         ++nextSourceIndex;\r
498                         \r
499                         if(c<=0x20){\r
500                             /*\r
501                              * ISO C0 control & space:\r
502                              * Encode directly for MIME compatibility,\r
503                              * and reset state except for space, to not disrupt compression.\r
504                              */\r
505                             if(c!=0x20) {\r
506                                 prev=BOCU1_ASCII_PREV;\r
507                             }\r
508                             target.put((byte)c);\r
509                             if(offsets != null){\r
510                                 offsets.put(sourceIndex++);\r
511                             }\r
512                             --targetCapacity;\r
513                          \r
514                             sourceIndex=nextSourceIndex;\r
515                             continue;\r
516                         }\r
517                         \r
518                         if(UTF16.isLeadSurrogate((char)c)){\r
519                             getTrail(source, target, offsets);\r
520                             if(checkNegative){\r
521                                 break;\r
522                             }\r
523                         }\r
524                     }\r
525                         \r
526                     if(LoopAfterTrail){\r
527                         LoopAfterTrail = false; \r
528                     }\r
529                     \r
530                     /*\r
531                      * all other Unicode code points c==U+0021..U+10ffff\r
532                      * are encoded with the difference c-prev\r
533                      *\r
534                      * a new prev is computed from c,\r
535                      * placed in the middle of a 0x80-block (for most small scripts) or\r
536                      * in the middle of the Unihan and Hangul blocks\r
537                      * to statistically minimize the following difference\r
538                      */\r
539                     diff = c- prev;\r
540                     prev = BOCU1_PREV(c);\r
541                     if(DIFF_IS_SINGLE(diff)){\r
542                         target.put((byte)PACK_SINGLE_DIFF(diff));\r
543                         if(offsets!=null){\r
544                             offsets.put(sourceIndex++);\r
545                         }\r
546                         --targetCapacity;\r
547                         sourceIndex=nextSourceIndex;\r
548                         if(c<0x3000){\r
549                             labelType = fastSingle;\r
550                             return labelType;\r
551                         }\r
552                     } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){\r
553                         /*optimize 2 byte case*/\r
554                         int m = 0;\r
555                         if(diff>=0){\r
556                             diff -= BOCU1_REACH_POS_1 +1;\r
557                             m = diff%BOCU1_TRAIL_COUNT;\r
558                             diff/=BOCU1_TRAIL_COUNT;\r
559                             diff+=BOCU1_START_POS_2;\r
560                         } else {\r
561                             diff -= BOCU1_REACH_NEG_1;\r
562                             m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
563                             diff+=BOCU1_START_NEG_2;\r
564                         }\r
565                         target.put((byte)diff);\r
566                         target.put((byte)BOCU1_TRAIL_TO_BYTE(m));\r
567                         if(offsets!=null){\r
568                             offsets.put(sourceIndex);\r
569                             offsets.put(sourceIndex);\r
570                         }\r
571                         targetCapacity -= 2;\r
572                         sourceIndex = nextSourceIndex;\r
573                     } else {\r
574                         int length; /*will be 2..4*/\r
575                         diff = packDiff(diff);\r
576                         length = BOCU1_LENGTH_FROM_PACKED(diff);\r
577                         \r
578                         /*write the output character bytes from diff and length*/\r
579                         /*from the first if in the loop we know that targetCapacity>0*/\r
580                         if(length<=targetCapacity){\r
581                             switch(length){\r
582                                 /*each branch falls through the next one*/\r
583                                 case 4:\r
584                                     target.put((byte)(diff>>24));\r
585                                     if(offsets!= null){\r
586                                         offsets.put(sourceIndex);\r
587                                     }\r
588                                 case 3:\r
589                                     target.put((byte)(diff>>16));\r
590                                     if(offsets!= null){\r
591                                         offsets.put(sourceIndex);\r
592                                     }\r
593                                 case 2:\r
594                                     target.put((byte)(diff>>8));\r
595                                     if(offsets!= null){\r
596                                         offsets.put(sourceIndex);\r
597                                     }\r
598                                     /*case 1 handled above*/\r
599                                     target.put((byte)diff);\r
600                                     if(offsets!= null){\r
601                                         offsets.put(sourceIndex);\r
602                                     }\r
603                                 default:\r
604                                     /*will never occur*/\r
605                                     break;\r
606                             }\r
607                             targetCapacity -= length;\r
608                             sourceIndex = nextSourceIndex;\r
609                         } else {\r
610                             ByteBuffer error = ByteBuffer.wrap(errorBuffer);\r
611                             /*\r
612                              * We actually do this backwards here:\r
613                              * In order to save an intermediate variable, we output\r
614                              * first to the overflow buffer what does not fit into the\r
615                              * regular target.\r
616                              */\r
617                             /* we know that 1<=targetCapacity<length<=4 */\r
618                             length-=targetCapacity;\r
619                             switch(length) {\r
620                                 /* each branch falls through to the next one */\r
621                             case 3:\r
622                                 error.put((byte)(diff>>16));\r
623                             case 2:\r
624                                 error.put((byte)(diff>>8));\r
625                             case 1:\r
626                                 error.put((byte)diff);\r
627                             default:\r
628                                 /* will never occur */\r
629                                 break;\r
630                             }\r
631                             errorBufferLength = length;\r
632                             \r
633                             /* now output what fits into the regular target */\r
634                             diff>>=8*length; /* length was reduced by targetCapacity */\r
635                             switch(targetCapacity) {\r
636                                 /* each branch falls through to the next one */\r
637                             case 3:\r
638                                 target.put((byte)(diff>>16));\r
639                                 if(offsets!= null){\r
640                                     offsets.put(sourceIndex);\r
641                                 }\r
642                             case 2:\r
643                                 target.put((byte)(diff>>8));\r
644                                 if(offsets!= null){\r
645                                     offsets.put(sourceIndex);\r
646                                 }\r
647                             case 1:\r
648                                 target.put((byte)diff);\r
649                                 if(offsets!= null){\r
650                                     offsets.put(sourceIndex);\r
651                                 }\r
652                             default:\r
653                                 /* will never occur */\r
654                                 break;\r
655                             }\r
656 \r
657                             /* target overflow */\r
658                             targetCapacity=0;\r
659                             cr = CoderResult.OVERFLOW;\r
660                             break;\r
661                         }\r
662                     }\r
663                 } else{\r
664                     /*target is full*/\r
665                     cr = CoderResult.OVERFLOW;\r
666                     break;\r
667                 }\r
668                    \r
669             }\r
670             /*set the converter state back into UConverter*/\r
671             fromUChar32 = c<0 ? -c :0;\r
672             fromUnicodeStatus = prev;\r
673             LabelLoop = false;\r
674             labelType = fastSingle;\r
675             return labelType;\r
676         }\r
677        \r
678     }\r
679     \r
680     class CharsetDecoderBOCU extends CharsetDecoderICU{\r
681         public CharsetDecoderBOCU(CharsetICU cs) {\r
682             super(cs);\r
683         }\r
684         \r
685         int byteIndex;\r
686         int sourceIndex, nextSourceIndex;\r
687         int prev, c , diff, count;\r
688         byte[] bytes;\r
689         int targetCapacity;\r
690         CoderResult cr;\r
691         \r
692         /* label values for supporting behavior similar to goto in C */\r
693         private static final int fastSingle=0;\r
694         private static final int getTrail=1;\r
695         private static final int regularLoop=2;\r
696         private static final int endLoop=3;\r
697         \r
698         private boolean LabelLoop;//used to break the while loop\r
699         private boolean afterTrail; // its value is set to true to ignore code after getTrail:\r
700         private int labelType;\r
701         /*\r
702          * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.\r
703          * The UConverter fields are used as follows:\r
704          *\r
705          * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)\r
706          *\r
707          * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)\r
708          * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)\r
709          */\r
710 \r
711         /* BOCU-1-from-Unicode conversion functions --------------------------------- */\r
712 \r
713         \r
714         \r
715         /**\r
716          * Function for BOCU-1 decoder; handles multi-byte lead bytes.\r
717          *\r
718          * @param b lead byte;\r
719          *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD\r
720          * @return (diff<<2)|count\r
721          */\r
722         private int decodeBocu1LeadByte(int b) {\r
723             int diffValue, countValue;\r
724 \r
725             if(b >= BOCU1_START_NEG_2) {\r
726                 /* positive difference */\r
727                 if(b < BOCU1_START_POS_3) {\r
728                     /* two bytes */\r
729                     diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1;\r
730                     countValue = 1;\r
731                 } else if(b < BOCU1_START_POS_4) {\r
732                     /* three bytes */\r
733                     diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;\r
734                     countValue = 2;\r
735                 } else {\r
736                     /* four bytes */\r
737                     diffValue = BOCU1_REACH_POS_3+1;\r
738                     countValue = 3;\r
739                 }\r
740             } else {\r
741                 /* negative difference */\r
742                 if(b >= BOCU1_START_NEG_3) {\r
743                     /* two bytes */\r
744                     diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;\r
745                     countValue=1;\r
746                 } else if(b>BOCU1_MIN) {\r
747                     /* three bytes */\r
748                     diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;\r
749                     countValue = 2;\r
750                 } else {\r
751                     /* four bytes */\r
752                     diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;\r
753                     countValue=3;\r
754                 }\r
755             }\r
756 \r
757             /* return the state for decoding the trail byte(s) */\r
758             return (diffValue<<2)|countValue;\r
759         }\r
760         \r
761         /**\r
762          * Function for BOCU-1 decoder; handles multi-byte trail bytes.\r
763          *\r
764          * @param count number of remaining trail bytes including this one\r
765          * @param b trail byte\r
766          * @return new delta for diff including b - <0 indicates an error\r
767          *\r
768          * @see decodeBocu1\r
769          */\r
770         private int decodeBocu1TrailByte(int countValue, int b) {\r
771             b = b&UConverterConstants.UNSIGNED_BYTE_MASK;\r
772             if((b)<=0x20) {\r
773                 /* skip some C0 controls and make the trail byte range contiguous */\r
774                 b = bocu1ByteToTrail[b];\r
775                 /* b<0 for an illegal trail byte value will result in return<0 below */\r
776             } else {\r
777                 //b-= BOCU1_TRAIL_BYTE_OFFSET;\r
778                 b = b - BOCU1_TRAIL_BYTE_OFFSET;\r
779             }\r
780 \r
781             /* add trail byte into difference and decrement count */\r
782             if(countValue==1) {\r
783                 return b;\r
784             } else if(countValue==2) {\r
785                 return b*BOCU1_TRAIL_COUNT;\r
786             } else /* count==3 */ {\r
787                 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);\r
788             }\r
789         }\r
790         \r
791         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,\r
792                 boolean flush){\r
793             cr = CoderResult.UNDERFLOW;\r
794             \r
795             LabelLoop = true; \r
796             afterTrail = false; \r
797             labelType = fastSingle; // labelType is set to fastSingle so t\r
798             \r
799             /*get the converter state*/\r
800             prev = toUnicodeStatus;\r
801             \r
802             if(prev==0){\r
803                 prev = BOCU1_ASCII_PREV;\r
804             }\r
805             diff = mode;\r
806             count = diff&3;\r
807             diff>>=2;\r
808             \r
809             byteIndex = toULength;\r
810             bytes = toUBytesArray;\r
811             \r
812             /* sourceIndex=-1 if the current character began in the previous buffer */\r
813             sourceIndex=byteIndex==0 ? 0 : -1;\r
814             nextSourceIndex=0;\r
815             \r
816             /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */\r
817             if(count>0 && byteIndex>0 && target.position()<target.limit()) {\r
818                 labelType = getTrail;\r
819             }\r
820             \r
821             while(LabelLoop){\r
822                 switch(labelType){\r
823                     case fastSingle:\r
824                         labelType = fastSingle(source, target, offsets);\r
825                         break;\r
826                     case getTrail:\r
827                         labelType = getTrail(source, target, offsets);\r
828                         break;\r
829                     case regularLoop:\r
830                         labelType = afterGetTrail(source, target, offsets);\r
831                         break;\r
832                     case endLoop:\r
833                         endLoop(source, target, offsets);\r
834                         break;\r
835                 }\r
836             }\r
837             \r
838             return cr;\r
839         }\r
840         \r
841         private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
842             labelType = regularLoop;\r
843             /* fast loop for single-byte differences */\r
844             /* use count as the only loop counter variable */\r
845             diff = source.limit() - source.position();\r
846             count = target.limit()-target.position();\r
847             if(count>diff) {\r
848                 count = diff;\r
849             }\r
850             while(count>0) {\r
851                 if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) {\r
852                     c = prev + (c-BOCU1_MIDDLE);\r
853                     if(c<0x3000) {\r
854                         target.put((char)c);\r
855                         if(offsets!=null){\r
856                             offsets.put(nextSourceIndex++);\r
857                         } \r
858                         prev = BOCU1_SIMPLE_PREV(c);\r
859                     } else {\r
860                         break;\r
861                     }\r
862                 } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) {\r
863                     if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) {\r
864                         prev = BOCU1_ASCII_PREV;\r
865                     }\r
866                     target.put((char)c);\r
867                     if(offsets!=null){\r
868                         offsets.put(nextSourceIndex++);\r
869                     } \r
870                 } else {\r
871                     break;\r
872                 }\r
873                 source.position(source.position()+1);\r
874                 --count;\r
875             }\r
876             sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */\r
877             return labelType;\r
878         }\r
879         \r
880         private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
881             labelType = regularLoop;\r
882             for(;;) {\r
883                 if(source.position() >= source.limit()) {\r
884                     labelType = endLoop;\r
885                     return labelType;\r
886                 }\r
887                 ++nextSourceIndex;\r
888                 c = bytes[byteIndex++] = source.get();\r
889 \r
890                 /* trail byte in any position */\r
891                 c = decodeBocu1TrailByte(count, c);\r
892                 if(c<0) {\r
893                     cr = CoderResult.malformedForLength(1);\r
894                     labelType = endLoop;\r
895                     return labelType;\r
896                 }\r
897 \r
898                 diff+=c;\r
899                 if(--count==0) {\r
900                     /* final trail byte, deliver a code point */\r
901                     byteIndex=0;\r
902                     c = prev + diff;\r
903                     if(c > 0x10ffff) {\r
904                         cr = CoderResult.malformedForLength(1);\r
905                         labelType = endLoop;\r
906                         return labelType;\r
907                     }\r
908                     break;\r
909                 }\r
910             }\r
911             afterTrail = true;\r
912             return labelType;\r
913             \r
914         }\r
915         \r
916         private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
917             /* decode a sequence of single and lead bytes */\r
918             while(afterTrail || source.hasRemaining()) {\r
919                 if(!afterTrail){\r
920                     if(target.position() >= target.limit()) {\r
921                         /* target is full */\r
922                         cr = CoderResult.OVERFLOW;\r
923                         break;\r
924                     }\r
925 \r
926                     ++nextSourceIndex;\r
927                     c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK;\r
928                     if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) {\r
929                         /* Write a code point directly from a single-byte difference. */\r
930                         c = prev + (c-BOCU1_MIDDLE);\r
931                         if(c<0x3000) {\r
932                             target.put((char)c);\r
933                             if(offsets!=null){\r
934                                 offsets.put(sourceIndex);\r
935                             }\r
936                             prev = BOCU1_SIMPLE_PREV(c);\r
937                             sourceIndex = nextSourceIndex;\r
938                             labelType = fastSingle;\r
939                             return labelType;\r
940                         }\r
941                     } else if(c <= 0x20) {\r
942                         /*\r
943                          * Direct-encoded C0 control code or space.\r
944                          * Reset prev for C0 control codes but not for space.\r
945                          */\r
946                         if(c != 0x20) {\r
947                             prev=BOCU1_ASCII_PREV;\r
948                         }\r
949                         target.put((char)c);\r
950                         if(offsets!=null){\r
951                             offsets.put(sourceIndex);\r
952                         }\r
953                         sourceIndex=nextSourceIndex;\r
954                         continue;\r
955                     } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) {\r
956                         /* Optimize two-byte case. */\r
957                         if(c >= BOCU1_MIDDLE) {\r
958                             diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;\r
959                         } else {\r
960                             diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;\r
961                         }\r
962 \r
963                         /* trail byte */\r
964                         ++nextSourceIndex;\r
965                         c = decodeBocu1TrailByte(1, source.get());\r
966                         if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) {\r
967                             bytes[0]= source.get(source.position()-2);\r
968                             bytes[1]= source.get(source.position()-1);\r
969                             byteIndex = 2;\r
970                             cr = CoderResult.malformedForLength(2);\r
971                             break;\r
972                         }\r
973                     } else if(c == BOCU1_RESET) {\r
974                         /* only reset the state, no code point */\r
975                         prev=BOCU1_ASCII_PREV;\r
976                         sourceIndex=nextSourceIndex;\r
977                         continue;\r
978                     } else {\r
979                         /*\r
980                          * For multi-byte difference lead bytes, set the decoder state\r
981                          * with the partial difference value from the lead byte and\r
982                          * with the number of trail bytes.\r
983                          */\r
984                         bytes[0]= (byte)c;\r
985                         byteIndex = 1;\r
986 \r
987                         diff = decodeBocu1LeadByte(c);\r
988                         count = diff&3;\r
989                         diff>>=2;\r
990                         getTrail(source, target, offsets);\r
991                         if(labelType != regularLoop){\r
992                             return labelType;\r
993                         }\r
994                     }\r
995                 }\r
996                 \r
997                 if(afterTrail){\r
998                     afterTrail = false;\r
999                 }\r
1000                 \r
1001                 /* calculate the next prev and output c */\r
1002                 prev = BOCU1_PREV(c);\r
1003                 if(c<=0xffff) {\r
1004                     target.put((char)c);\r
1005                     if(offsets!=null){\r
1006                         offsets.put(sourceIndex);\r
1007                     }\r
1008                 } else {\r
1009                     /* output surrogate pair */\r
1010                     target.put(UTF16.getLeadSurrogate(c));\r
1011                     if(target.hasRemaining()) {\r
1012                         target.put(UTF16.getTrailSurrogate(c));\r
1013                         if(offsets!=null){\r
1014                             offsets.put(sourceIndex);\r
1015                             offsets.put(sourceIndex);\r
1016                         }\r
1017                     } else {\r
1018                         /* target overflow */\r
1019                         if(offsets!=null){\r
1020                             offsets.put(sourceIndex);\r
1021                         }\r
1022                         charErrorBufferArray[0] = UTF16.getTrailSurrogate(c);\r
1023                         charErrorBufferLength = 1;\r
1024                         cr = CoderResult.OVERFLOW;\r
1025                         break;\r
1026                 }\r
1027             }\r
1028             sourceIndex=nextSourceIndex;\r
1029           }\r
1030           labelType = endLoop;\r
1031           return labelType;\r
1032         }\r
1033         \r
1034         private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
1035             if(cr.isMalformed()) {\r
1036                 /* set the converter state in UConverter to deal with the next character */\r
1037                 toUnicodeStatus = BOCU1_ASCII_PREV;\r
1038                 mode = 0;\r
1039             } else {\r
1040                 /* set the converter state back into UConverter */\r
1041                 toUnicodeStatus=prev;\r
1042                 mode=(diff<<2)|count;\r
1043             }\r
1044             toULength=byteIndex;\r
1045             LabelLoop = false;\r
1046         }\r
1047     \r
1048     }\r
1049     \r
1050     \r
1051     public CharsetDecoder newDecoder() {\r
1052         return new CharsetDecoderBOCU(this);\r
1053     }\r
1054 \r
1055     public CharsetEncoder newEncoder() {\r
1056         return new CharsetEncoderBOCU(this);\r
1057     }\r
1058     \r
1059     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){\r
1060         CharsetICU.getCompleteUnicodeSet(setFillIn);\r
1061     }\r
1062 \r
1063 }\r