jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetBOCU1.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2008-2010, International Business Machines Corporation and         *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.charset;\r
   8 \r
   9 import java.nio.ByteBuffer;\r
  10 import java.nio.CharBuffer;\r
  11 import java.nio.IntBuffer;\r
  12 import java.nio.charset.CharsetDecoder;\r
  13 import java.nio.charset.CharsetEncoder;\r
  14 import java.nio.charset.CoderResult;\r
  15 \r
  16 import com.ibm.icu.lang.UCharacter;\r
  17 import com.ibm.icu.text.UTF16;\r
  18 import com.ibm.icu.text.UnicodeSet;\r
  19 \r
  20 /**\r
  21  * @author krajwade\r
  22  *\r
  23  */\r
  24 class CharsetBOCU1 extends CharsetICU {   \r
  25     /* BOCU constants and macros */\r
  26     \r
  27     /* initial value for "prev": middle of the ASCII range */\r
  28     private static final byte BOCU1_ASCII_PREV = 0x40;\r
  29     \r
  30     /* bounding byte values for differences */\r
  31     private static final int BOCU1_MIN = 0x21;\r
  32     private static final int BOCU1_MIDDLE = 0x90;\r
  33     //private static final int BOCU1_MAX_LEAD = 0xfe;\r
  34     private static final int BOCU1_MAX_TRAIL = 0xff;\r
  35     private static final int BOCU1_RESET = 0xff;\r
  36 \r
  37     /* number of lead bytes */\r
  38     //private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1);\r
  39 \r
  40     /* adjust trail byte counts for the use of some C0 control byte values */\r
  41     private static final int BOCU1_TRAIL_CONTROLS_COUNT =  20;\r
  42     private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT);\r
  43 \r
  44     /* number of trail bytes */\r
  45     private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT);\r
  46     \r
  47     /*\r
  48      * number of positive and negative single-byte codes\r
  49      * (counting 0==BOCU1_MIDDLE among the positive ones)\r
  50      */\r
  51     private static final int BOCU1_SINGLE = 64;\r
  52 \r
  53     /* number of lead bytes for positive and negative 2/3/4-byte sequences */\r
  54     private static final int BOCU1_LEAD_2 = 43;\r
  55     private static final int BOCU1_LEAD_3 = 3;\r
  56     //private static final int BOCU1_LEAD_4 = 1;\r
  57 \r
  58     /* The difference value range for single-byters. */\r
  59     private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1);\r
  60     private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE);\r
  61 \r
  62     /* The difference value range for double-byters. */\r
  63     private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);\r
  64     private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT);\r
  65 \r
  66     /* The difference value range for 3-byters. */\r
  67     private static final int BOCU1_REACH_POS_3  =\r
  68         (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);\r
  69 \r
  70     private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);\r
  71 \r
  72     /* The lead byte start values. */\r
  73     private static final int BOCU1_START_POS_2 =  (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1);\r
  74     private static final int BOCU1_START_POS_3  = (BOCU1_START_POS_2+BOCU1_LEAD_2);\r
  75     private static final int BOCU1_START_POS_4  = (BOCU1_START_POS_3+BOCU1_LEAD_3);\r
  76          /* ==BOCU1_MAX_LEAD */\r
  77 \r
  78     private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1);\r
  79     private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2);\r
  80     //private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3);\r
  81          /* ==BOCU1_MIN+1 */\r
  82 \r
  83     /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */\r
  84    /* private static int BOCU1_LENGTH_FROM_LEAD(int lead) {\r
  85        return ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \r
  86          (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \r
  87          (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4);\r
  88     }*/\r
  89 \r
  90     /* The length of a byte sequence, according to its packed form. */\r
  91     private static int BOCU1_LENGTH_FROM_PACKED(int packed) {\r
  92         return (((packed)&UConverterConstants.UNSIGNED_INT_MASK)<0x04000000 ? (packed)>>24 : 4);\r
  93     }\r
  94     \r
  95     /*\r
  96      * Byte value map for control codes,\r
  97      * from external byte values 0x00..0x20\r
  98      * to trail byte values 0..19 (0..0x13) as used in the difference calculation.\r
  99      * External byte values that are illegal as trail bytes are mapped to -1.\r
 100      */\r
 101     private static final int[]\r
 102     bocu1ByteToTrail={\r
 103     /*  0     1     2     3     4     5     6     7    */\r
 104         -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,\r
 105 \r
 106     /*  8     9     a     b     c     d     e     f    */\r
 107         -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,\r
 108 \r
 109     /*  10    11    12    13    14    15    16    17   */\r
 110         0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,\r
 111 \r
 112     /*  18    19    1a    1b    1c    1d    1e    1f   */\r
 113         0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,\r
 114 \r
 115     /*  20   */\r
 116         -1\r
 117     };\r
 118 \r
 119     /*\r
 120      * Byte value map for control codes,\r
 121      * from trail byte values 0..19 (0..0x13) as used in the difference calculation\r
 122      * to external byte values 0x00..0x20.\r
 123      */\r
 124     private static final int[] \r
 125     bocu1TrailToByte = {\r
 126     /*  0     1     2     3     4     5     6     7    */\r
 127         0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,\r
 128 \r
 129     /*  8     9     a     b     c     d     e     f    */\r
 130         0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,\r
 131 \r
 132     /*  10    11    12    13   */\r
 133         0x1c, 0x1d, 0x1e, 0x1f\r
 134     };\r
 135     \r
 136     \r
 137     /*\r
 138      * 12 commonly used C0 control codes (and space) are only used to encode\r
 139      * themselves directly,\r
 140      * which makes BOCU-1 MIME-usable and reasonably safe for\r
 141      * ASCII-oriented software.\r
 142      *\r
 143      * These controls are\r
 144      *  0   NUL\r
 145      *\r
 146      *  7   BEL\r
 147      *  8   BS\r
 148      *\r
 149      *  9   TAB\r
 150      *  a   LF\r
 151      *  b   VT\r
 152      *  c   FF\r
 153      *  d   CR\r
 154      *\r
 155      *  e   SO\r
 156      *  f   SI\r
 157      *\r
 158      * 1a   SUB\r
 159      * 1b   ESC\r
 160      *\r
 161      * The other 20 C0 controls are also encoded directly (to preserve order)\r
 162      * but are also used as trail bytes in difference encoding\r
 163      * (for better compression).\r
 164      */\r
 165     private static int BOCU1_TRAIL_TO_BYTE(int trail) {\r
 166         return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]);\r
 167     }    \r
 168     \r
 169     /* BOCU-1 implementation functions ------------------------------------------ */\r
 170     private static int BOCU1_SIMPLE_PREV(int c){\r
 171         return (((c)&~0x7f)+BOCU1_ASCII_PREV);\r
 172     }\r
 173 \r
 174     /**\r
 175      * Compute the next "previous" value for differencing\r
 176      * from the current code point.\r
 177      *\r
 178      * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)\r
 179      * @return "previous code point" state value\r
 180      */\r
 181     private static  int bocu1Prev(int c) {\r
 182         /* compute new prev */\r
 183         if(/* 0x3040<=c && */ c<=0x309f) {\r
 184             /* Hiragana is not 128-aligned */\r
 185             return 0x3070;\r
 186         } else if(0x4e00<=c && c<=0x9fa5) {\r
 187             /* CJK Unihan */\r
 188             return 0x4e00-BOCU1_REACH_NEG_2;\r
 189         } else if(0xac00<=c /* && c<=0xd7a3 */) {\r
 190             /* Korean Hangul */\r
 191             return (0xd7a3+0xac00)/2;\r
 192         } else {\r
 193             /* mostly small scripts */\r
 194             return BOCU1_SIMPLE_PREV(c);\r
 195         }\r
 196     }\r
 197 \r
 198     /** Fast version of bocu1Prev() for most scripts. */\r
 199     private static int BOCU1_PREV(int c) {\r
 200         return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c));\r
 201     }\r
 202     \r
 203     protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};\r
 204 \r
 205     /* Faster versions of packDiff() for single-byte-encoded diff values. */\r
 206 \r
 207     /** Is a diff value encodable in a single byte? */\r
 208     private static boolean DIFF_IS_SINGLE(int diff){\r
 209         return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1);\r
 210     }\r
 211 \r
 212     /** Encode a diff value in a single byte. */\r
 213     private static int PACK_SINGLE_DIFF(int diff){\r
 214         return (BOCU1_MIDDLE+(diff));\r
 215     }\r
 216 \r
 217     /** Is a diff value encodable in two bytes? */\r
 218     private static boolean DIFF_IS_DOUBLE(int diff){\r
 219         return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2);\r
 220     }   \r
 221       \r
 222     public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){\r
 223         super(icuCanonicalName, javaCanonicalName, aliases);\r
 224         maxBytesPerChar = 4; \r
 225         minBytesPerChar = 1;\r
 226         maxCharsPerByte = 1;\r
 227      }\r
 228     \r
 229     class CharsetEncoderBOCU extends CharsetEncoderICU {\r
 230         public CharsetEncoderBOCU(CharsetICU cs) {\r
 231             super(cs,fromUSubstitution);\r
 232         }\r
 233         \r
 234         int sourceIndex, nextSourceIndex;\r
 235         int prev, c , diff;\r
 236         boolean checkNegative;\r
 237         boolean LoopAfterTrail;\r
 238         int targetCapacity;\r
 239         CoderResult cr;        \r
 240         \r
 241         /* label values for supporting behavior similar to goto in C */\r
 242         private static final int fastSingle=0;\r
 243         private static final int getTrail=1;\r
 244         private static final int regularLoop=2;\r
 245         \r
 246         private boolean LabelLoop; //used to break the while loop\r
 247         private int labelType = fastSingle; //labeType is set to fastSingle to start the code from fastSingle:\r
 248         \r
 249         /**\r
 250          * Integer division and modulo with negative numerators\r
 251          * yields negative modulo results and quotients that are one more than\r
 252          * what we need here.\r
 253          * This macro adjust the results so that the modulo-value m is always >=0.\r
 254          *\r
 255          * For positive n, the if() condition is always FALSE.\r
 256          *\r
 257          * @param n Number to be split into quotient and rest.\r
 258          *          Will be modified to contain the quotient.\r
 259          * @param d Divisor.\r
 260          * @param m Output variable for the rest (modulo result).\r
 261          */\r
 262         private int NEGDIVMOD(int n, int d, int m) {\r
 263             diff = n;\r
 264             (m)=(diff)%(d); \r
 265             (diff)/=(d); \r
 266             if((m)<0) { \r
 267                 --(diff);\r
 268                 (m)+=(d);\r
 269             }\r
 270             return m;\r
 271         }\r
 272         \r
 273         /**\r
 274          * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes\r
 275          * and return a packed integer with them.\r
 276          *\r
 277          * The encoding favors small absolute differences with short encodings\r
 278          * to compress runs of same-script characters.\r
 279          *\r
 280          * Optimized version with unrolled loops and fewer floating-point operations\r
 281          * than the standard packDiff().\r
 282          *\r
 283          * @param diff difference value -0x10ffff..0x10ffff\r
 284          * @return\r
 285          *      0x010000zz for 1-byte sequence zz\r
 286          *      0x0200yyzz for 2-byte sequence yy zz\r
 287          *      0x03xxyyzz for 3-byte sequence xx yy zz\r
 288          *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)\r
 289          */\r
 290         private int packDiff(int n) {\r
 291             int result, m = 0;\r
 292             diff = n;\r
 293 \r
 294             if(diff>=BOCU1_REACH_NEG_1) {\r
 295                 /* mostly positive differences, and single-byte negative ones */\r
 296                 if(diff<=BOCU1_REACH_POS_2) {\r
 297                     /* two bytes */\r
 298                     diff-=BOCU1_REACH_POS_1+1;\r
 299                     result=0x02000000;\r
 300 \r
 301                     m=diff%BOCU1_TRAIL_COUNT;\r
 302                     diff/=BOCU1_TRAIL_COUNT;\r
 303                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
 304 \r
 305                     result|=(BOCU1_START_POS_2+diff)<<8;\r
 306                 } else if(diff<=BOCU1_REACH_POS_3) {\r
 307                     /* three bytes */\r
 308                     diff-=BOCU1_REACH_POS_2+1;\r
 309                     result=0x03000000;\r
 310 \r
 311                     m=diff%BOCU1_TRAIL_COUNT;\r
 312                     diff/=BOCU1_TRAIL_COUNT;\r
 313                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
 314 \r
 315                     m=diff%BOCU1_TRAIL_COUNT;\r
 316                     diff/=BOCU1_TRAIL_COUNT;\r
 317                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
 318 \r
 319                     result|=(BOCU1_START_POS_3+diff)<<16;\r
 320                 } else {\r
 321                     /* four bytes */\r
 322                     diff-=BOCU1_REACH_POS_3+1;\r
 323 \r
 324                     m=diff%BOCU1_TRAIL_COUNT;\r
 325                     diff/=BOCU1_TRAIL_COUNT;\r
 326                     result=BOCU1_TRAIL_TO_BYTE(m);\r
 327 \r
 328                     m=diff%BOCU1_TRAIL_COUNT;\r
 329                     diff/=BOCU1_TRAIL_COUNT;\r
 330                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
 331 \r
 332                     /*\r
 333                      * We know that / and % would deliver quotient 0 and rest=diff.\r
 334                      * Avoid division and modulo for performance.\r
 335                      */\r
 336                     result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;\r
 337 \r
 338                     result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24;\r
 339                 }\r
 340             } else {\r
 341                 /* two- to four-byte negative differences */\r
 342                 if(diff>=BOCU1_REACH_NEG_2) {\r
 343                     /* two bytes */\r
 344                     diff-=BOCU1_REACH_NEG_1;\r
 345                     result=0x02000000;\r
 346 \r
 347                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
 348                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
 349 \r
 350                     result|=(BOCU1_START_NEG_2+diff)<<8;\r
 351                 } else if(diff>=BOCU1_REACH_NEG_3) {\r
 352                     /* three bytes */\r
 353                     diff-=BOCU1_REACH_NEG_2;\r
 354                     result=0x03000000;\r
 355 \r
 356                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
 357                     result|=BOCU1_TRAIL_TO_BYTE(m);\r
 358 \r
 359                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
 360                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
 361 \r
 362                     result|=(BOCU1_START_NEG_3+diff)<<16;\r
 363                 } else {\r
 364                     /* four bytes */\r
 365                     diff-=BOCU1_REACH_NEG_3;\r
 366 \r
 367                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
 368                     result=BOCU1_TRAIL_TO_BYTE(m);\r
 369 \r
 370                     m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
 371                     result|=BOCU1_TRAIL_TO_BYTE(m)<<8;\r
 372 \r
 373                     /*\r
 374                      * We know that NEGDIVMOD would deliver\r
 375                      * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.\r
 376                      * Avoid division and modulo for performance.\r
 377                      */\r
 378                     m=diff+BOCU1_TRAIL_COUNT;\r
 379                     result|=BOCU1_TRAIL_TO_BYTE(m)<<16;\r
 380 \r
 381                     result|=BOCU1_MIN<<24;\r
 382                 }\r
 383             }\r
 384             return result;\r
 385         }\r
 386            \r
 387         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){\r
 388             cr = CoderResult.UNDERFLOW;\r
 389             \r
 390             LabelLoop = true; //used to break the while loop\r
 391             checkNegative = false; // its value is set to true to get out of while loop when c = -c\r
 392             LoopAfterTrail = false; // its value is set to true to ignore code before getTrail:\r
 393             \r
 394             /*set up the local pointers*/\r
 395             targetCapacity = target.limit() - target.position();\r
 396             c = fromUChar32;\r
 397             prev = fromUnicodeStatus;\r
 398             \r
 399             if(prev==0){\r
 400                 prev = BOCU1_ASCII_PREV;\r
 401             }\r
 402             \r
 403             /*sourceIndex ==-1 if the current characte began in the previous buffer*/\r
 404             sourceIndex = c == 0 ? 0: -1;\r
 405             nextSourceIndex = 0;\r
 406             \r
 407             /*conversion loop*/\r
 408             if(c!=0 && targetCapacity>0){\r
 409                 labelType = getTrail;\r
 410             }\r
 411             \r
 412             while(LabelLoop){\r
 413                 switch(labelType){\r
 414                     case fastSingle:\r
 415                         labelType = fastSingle(source, target, offsets);\r
 416                         break;\r
 417                     case getTrail:\r
 418                         labelType = getTrail(source, target, offsets);\r
 419                         break;\r
 420                     case regularLoop:\r
 421                         labelType = regularLoop(source, target, offsets);\r
 422                         break;\r
 423                 }\r
 424             }\r
 425                     \r
 426             return cr;\r
 427         }\r
 428         \r
 429         private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){                     \r
 430 //fastSingle:        \r
 431             /*fast loop for single-byte differences*/\r
 432             /*use only one loop counter variable , targetCapacity, not also source*/\r
 433             diff = source.limit() - source.position();\r
 434             if(targetCapacity>diff){\r
 435                 targetCapacity = diff;\r
 436             }\r
 437             while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){\r
 438                 if(c<=0x20){\r
 439                     if(c!=0x20){\r
 440                         prev = BOCU1_ASCII_PREV;\r
 441                     }\r
 442                     target.put((byte)c);\r
 443                     if(offsets!=null){\r
 444                         offsets.put(nextSourceIndex++);\r
 445                     }\r
 446                     source.position(source.position()+1);\r
 447                     --targetCapacity;\r
 448                 }else {\r
 449                     diff = c-prev;\r
 450                     if(DIFF_IS_SINGLE(diff)){\r
 451                         prev = BOCU1_SIMPLE_PREV(c);\r
 452                         target.put((byte)PACK_SINGLE_DIFF(diff));\r
 453                         if(offsets!=null){\r
 454                             offsets.put(nextSourceIndex++);\r
 455                         }\r
 456                         source.position(source.position()+1);\r
 457                         --targetCapacity;\r
 458                     }else {\r
 459                         break;\r
 460                     }\r
 461                 }\r
 462             }\r
 463             return regularLoop;\r
 464         }\r
 465         \r
 466         private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){\r
 467             if(source.hasRemaining()){\r
 468                 /*test the following code unit*/\r
 469                 char trail = source.get(source.position());\r
 470                 if(UTF16.isTrailSurrogate(trail)){\r
 471                     source.position(source.position()+1);\r
 472                     ++nextSourceIndex;\r
 473                     c=UCharacter.getCodePoint((char)c, trail);\r
 474                 }\r
 475             } else {\r
 476                 /*no more input*/\r
 477                 c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/\r
 478                 checkNegative = true;\r
 479             }\r
 480             LoopAfterTrail = true;\r
 481             return regularLoop;\r
 482         }\r
 483 \r
 484         @SuppressWarnings("fallthrough")\r
 485         private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){\r
 486             if(!LoopAfterTrail){\r
 487                 /*restore real values*/\r
 488                 targetCapacity = target.limit()-target.position();\r
 489                 sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/\r
 490             }\r
 491             /*regular loop for all classes*/\r
 492             while(LoopAfterTrail || source.hasRemaining()){\r
 493                 if(LoopAfterTrail || targetCapacity>0){\r
 494                     \r
 495                     if(!LoopAfterTrail){\r
 496                         c = source.get();\r
 497                         ++nextSourceIndex;\r
 498                         \r
 499                         if(c<=0x20){\r
 500                             /*\r
 501                              * ISO C0 control & space:\r
 502                              * Encode directly for MIME compatibility,\r
 503                              * and reset state except for space, to not disrupt compression.\r
 504                              */\r
 505                             if(c!=0x20) {\r
 506                                 prev=BOCU1_ASCII_PREV;\r
 507                             }\r
 508                             target.put((byte)c);\r
 509                             if(offsets != null){\r
 510                                 offsets.put(sourceIndex++);\r
 511                             }\r
 512                             --targetCapacity;\r
 513                          \r
 514                             sourceIndex=nextSourceIndex;\r
 515                             continue;\r
 516                         }\r
 517                         \r
 518                         if(UTF16.isLeadSurrogate((char)c)){\r
 519                             getTrail(source, target, offsets);\r
 520                             if(checkNegative){\r
 521                                 break;\r
 522                             }\r
 523                         }\r
 524                     }\r
 525                         \r
 526                     if(LoopAfterTrail){\r
 527                         LoopAfterTrail = false; \r
 528                     }\r
 529                     \r
 530                     /*\r
 531                      * all other Unicode code points c==U+0021..U+10ffff\r
 532                      * are encoded with the difference c-prev\r
 533                      *\r
 534                      * a new prev is computed from c,\r
 535                      * placed in the middle of a 0x80-block (for most small scripts) or\r
 536                      * in the middle of the Unihan and Hangul blocks\r
 537                      * to statistically minimize the following difference\r
 538                      */\r
 539                     diff = c- prev;\r
 540                     prev = BOCU1_PREV(c);\r
 541                     if(DIFF_IS_SINGLE(diff)){\r
 542                         target.put((byte)PACK_SINGLE_DIFF(diff));\r
 543                         if(offsets!=null){\r
 544                             offsets.put(sourceIndex++);\r
 545                         }\r
 546                         --targetCapacity;\r
 547                         sourceIndex=nextSourceIndex;\r
 548                         if(c<0x3000){\r
 549                             labelType = fastSingle;\r
 550                             return labelType;\r
 551                         }\r
 552                     } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){\r
 553                         /*optimize 2 byte case*/\r
 554                         int m = 0;\r
 555                         if(diff>=0){\r
 556                             diff -= BOCU1_REACH_POS_1 +1;\r
 557                             m = diff%BOCU1_TRAIL_COUNT;\r
 558                             diff/=BOCU1_TRAIL_COUNT;\r
 559                             diff+=BOCU1_START_POS_2;\r
 560                         } else {\r
 561                             diff -= BOCU1_REACH_NEG_1;\r
 562                             m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);\r
 563                             diff+=BOCU1_START_NEG_2;\r
 564                         }\r
 565                         target.put((byte)diff);\r
 566                         target.put((byte)BOCU1_TRAIL_TO_BYTE(m));\r
 567                         if(offsets!=null){\r
 568                             offsets.put(sourceIndex);\r
 569                             offsets.put(sourceIndex);\r
 570                         }\r
 571                         targetCapacity -= 2;\r
 572                         sourceIndex = nextSourceIndex;\r
 573                     } else {\r
 574                         int length; /*will be 2..4*/\r
 575                         diff = packDiff(diff);\r
 576                         length = BOCU1_LENGTH_FROM_PACKED(diff);\r
 577                         \r
 578                         /*write the output character bytes from diff and length*/\r
 579                         /*from the first if in the loop we know that targetCapacity>0*/\r
 580                         if(length<=targetCapacity){\r
 581                             switch(length){\r
 582                                 /*each branch falls through the next one*/\r
 583                                 case 4:\r
 584                                     target.put((byte)(diff>>24));\r
 585                                     if(offsets!= null){\r
 586                                         offsets.put(sourceIndex);\r
 587                                     }\r
 588                                 case 3:\r
 589                                     target.put((byte)(diff>>16));\r
 590                                     if(offsets!= null){\r
 591                                         offsets.put(sourceIndex);\r
 592                                     }\r
 593                                 case 2:\r
 594                                     target.put((byte)(diff>>8));\r
 595                                     if(offsets!= null){\r
 596                                         offsets.put(sourceIndex);\r
 597                                     }\r
 598                                     /*case 1 handled above*/\r
 599                                     target.put((byte)diff);\r
 600                                     if(offsets!= null){\r
 601                                         offsets.put(sourceIndex);\r
 602                                     }\r
 603                                 default:\r
 604                                     /*will never occur*/\r
 605                                     break;\r
 606                             }\r
 607                             targetCapacity -= length;\r
 608                             sourceIndex = nextSourceIndex;\r
 609                         } else {\r
 610                             ByteBuffer error = ByteBuffer.wrap(errorBuffer);\r
 611                             /*\r
 612                              * We actually do this backwards here:\r
 613                              * In order to save an intermediate variable, we output\r
 614                              * first to the overflow buffer what does not fit into the\r
 615                              * regular target.\r
 616                              */\r
 617                             /* we know that 1<=targetCapacity<length<=4 */\r
 618                             length-=targetCapacity;\r
 619                             switch(length) {\r
 620                                 /* each branch falls through to the next one */\r
 621                             case 3:\r
 622                                 error.put((byte)(diff>>16));\r
 623                             case 2:\r
 624                                 error.put((byte)(diff>>8));\r
 625                             case 1:\r
 626                                 error.put((byte)diff);\r
 627                             default:\r
 628                                 /* will never occur */\r
 629                                 break;\r
 630                             }\r
 631                             errorBufferLength = length;\r
 632                             \r
 633                             /* now output what fits into the regular target */\r
 634                             diff>>=8*length; /* length was reduced by targetCapacity */\r
 635                             switch(targetCapacity) {\r
 636                                 /* each branch falls through to the next one */\r
 637                             case 3:\r
 638                                 target.put((byte)(diff>>16));\r
 639                                 if(offsets!= null){\r
 640                                     offsets.put(sourceIndex);\r
 641                                 }\r
 642                             case 2:\r
 643                                 target.put((byte)(diff>>8));\r
 644                                 if(offsets!= null){\r
 645                                     offsets.put(sourceIndex);\r
 646                                 }\r
 647                             case 1:\r
 648                                 target.put((byte)diff);\r
 649                                 if(offsets!= null){\r
 650                                     offsets.put(sourceIndex);\r
 651                                 }\r
 652                             default:\r
 653                                 /* will never occur */\r
 654                                 break;\r
 655                             }\r
 656 \r
 657                             /* target overflow */\r
 658                             targetCapacity=0;\r
 659                             cr = CoderResult.OVERFLOW;\r
 660                             break;\r
 661                         }\r
 662                     }\r
 663                 } else{\r
 664                     /*target is full*/\r
 665                     cr = CoderResult.OVERFLOW;\r
 666                     break;\r
 667                 }\r
 668                    \r
 669             }\r
 670             /*set the converter state back into UConverter*/\r
 671             fromUChar32 = c<0 ? -c :0;\r
 672             fromUnicodeStatus = prev;\r
 673             LabelLoop = false;\r
 674             labelType = fastSingle;\r
 675             return labelType;\r
 676         }\r
 677        \r
 678     }\r
 679     \r
 680     class CharsetDecoderBOCU extends CharsetDecoderICU{\r
        /**
         * Creates a BOCU-1 decoder for the given charset.
         * All setup is delegated to the CharsetDecoderICU constructor.
         *
         * @param cs the charset this decoder belongs to
         */
        public CharsetDecoderBOCU(CharsetICU cs) {
            super(cs);
        }
 684         \r
        /*
         * Working state of one decodeLoop() call. It is kept in fields (rather than locals)
         * because the loop body is split over several methods that emulate C goto labels.
         */

        /* index into bytes[] for the next byte of the current sequence; loaded from/stored to toULength */
        int byteIndex;
        /* values written to the offsets buffer; sourceIndex is -1 if the character began in a previous buffer */
        int sourceIndex, nextSourceIndex;
        /*
         * prev:  BOCU-1 "previous" state value (loaded from toUnicodeStatus)
         * c:     current byte, trail-byte delta, or decoded code point, depending on the stage
         * diff:  difference accumulated so far for a multi-byte sequence
         * count: number of trail bytes still expected
         * (fastSingle() reuses diff and count as plain loop bookkeeping)
         */
        int prev, c , diff, count;
        /* alias of toUBytesArray: the bytes of the sequence currently being decoded */
        byte[] bytes;
        /* NOTE(review): not read or written by any method of this decoder class */
        int targetCapacity;
        /* result that decodeLoop() returns; the helper methods set it on overflow/malformed input */
        CoderResult cr;
        
        /* label values for supporting behavior similar to goto in C */
        private static final int fastSingle=0;
        private static final int getTrail=1;
        /* regularLoop is dispatched to afterGetTrail() */
        private static final int regularLoop=2;
        private static final int endLoop=3;
        
        /* true while decodeLoop() keeps dispatching; endLoop() sets it to false */
        private boolean LabelLoop;//used to break the while loop
        /* set by getTrail() once a code point is complete so that afterGetTrail() skips reading a new byte */
        private boolean afterTrail; // its value is set to true to ignore code after getTrail:
        /* the label (one of the constants above) that decodeLoop() dispatches to next */
        private int labelType;
 701         /*\r
 702          * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.\r
 703          * The UConverter fields are used as follows:\r
 704          *\r
 705          * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)\r
 706          *\r
 707          * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)\r
 708          * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)\r
 709          */\r
 710 \r
        /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
 712 \r
 713         \r
 714         \r
 715         /**\r
 716          * Function for BOCU-1 decoder; handles multi-byte lead bytes.\r
 717          *\r
 718          * @param b lead byte;\r
 719          *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD\r
 720          * @return (diff<<2)|count\r
 721          */\r
 722         private int decodeBocu1LeadByte(int b) {\r
 723             int diffValue, countValue;\r
 724 \r
 725             if(b >= BOCU1_START_NEG_2) {\r
 726                 /* positive difference */\r
 727                 if(b < BOCU1_START_POS_3) {\r
 728                     /* two bytes */\r
 729                     diffValue = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1;\r
 730                     countValue = 1;\r
 731                 } else if(b < BOCU1_START_POS_4) {\r
 732                     /* three bytes */\r
 733                     diffValue = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;\r
 734                     countValue = 2;\r
 735                 } else {\r
 736                     /* four bytes */\r
 737                     diffValue = BOCU1_REACH_POS_3+1;\r
 738                     countValue = 3;\r
 739                 }\r
 740             } else {\r
 741                 /* negative difference */\r
 742                 if(b >= BOCU1_START_NEG_3) {\r
 743                     /* two bytes */\r
 744                     diffValue=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;\r
 745                     countValue=1;\r
 746                 } else if(b>BOCU1_MIN) {\r
 747                     /* three bytes */\r
 748                     diffValue=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2;\r
 749                     countValue = 2;\r
 750                 } else {\r
 751                     /* four bytes */\r
 752                     diffValue=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;\r
 753                     countValue=3;\r
 754                 }\r
 755             }\r
 756 \r
 757             /* return the state for decoding the trail byte(s) */\r
 758             return (diffValue<<2)|countValue;\r
 759         }\r
 760         \r
 761         /**\r
 762          * Function for BOCU-1 decoder; handles multi-byte trail bytes.\r
 763          *\r
 764          * @param count number of remaining trail bytes including this one\r
 765          * @param b trail byte\r
 766          * @return new delta for diff including b - <0 indicates an error\r
 767          *\r
 768          * @see decodeBocu1\r
 769          */\r
 770         private int decodeBocu1TrailByte(int countValue, int b) {\r
 771             b = b&UConverterConstants.UNSIGNED_BYTE_MASK;\r
 772             if((b)<=0x20) {\r
 773                 /* skip some C0 controls and make the trail byte range contiguous */\r
 774                 b = bocu1ByteToTrail[b];\r
 775                 /* b<0 for an illegal trail byte value will result in return<0 below */\r
 776             } else {\r
 777                 //b-= BOCU1_TRAIL_BYTE_OFFSET;\r
 778                 b = b - BOCU1_TRAIL_BYTE_OFFSET;\r
 779             }\r
 780 \r
 781             /* add trail byte into difference and decrement count */\r
 782             if(countValue==1) {\r
 783                 return b;\r
 784             } else if(countValue==2) {\r
 785                 return b*BOCU1_TRAIL_COUNT;\r
 786             } else /* count==3 */ {\r
 787                 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);\r
 788             }\r
 789         }\r
 790         \r
 791         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,\r
 792                 boolean flush){\r
 793             cr = CoderResult.UNDERFLOW;\r
 794             \r
 795             LabelLoop = true; \r
 796             afterTrail = false; \r
 797             labelType = fastSingle; // labelType is set to fastSingle so t\r
 798             \r
 799             /*get the converter state*/\r
 800             prev = toUnicodeStatus;\r
 801             \r
 802             if(prev==0){\r
 803                 prev = BOCU1_ASCII_PREV;\r
 804             }\r
 805             diff = mode;\r
 806             count = diff&3;\r
 807             diff>>=2;\r
 808             \r
 809             byteIndex = toULength;\r
 810             bytes = toUBytesArray;\r
 811             \r
 812             /* sourceIndex=-1 if the current character began in the previous buffer */\r
 813             sourceIndex=byteIndex==0 ? 0 : -1;\r
 814             nextSourceIndex=0;\r
 815             \r
 816             /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */\r
 817             if(count>0 && byteIndex>0 && target.position()<target.limit()) {\r
 818                 labelType = getTrail;\r
 819             }\r
 820             \r
 821             while(LabelLoop){\r
 822                 switch(labelType){\r
 823                     case fastSingle:\r
 824                         labelType = fastSingle(source, target, offsets);\r
 825                         break;\r
 826                     case getTrail:\r
 827                         labelType = getTrail(source, target, offsets);\r
 828                         break;\r
 829                     case regularLoop:\r
 830                         labelType = afterGetTrail(source, target, offsets);\r
 831                         break;\r
 832                     case endLoop:\r
 833                         endLoop(source, target, offsets);\r
 834                         break;\r
 835                 }\r
 836             }\r
 837             \r
 838             return cr;\r
 839         }\r
 840         \r
 841         private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
 842             labelType = regularLoop;\r
 843             /* fast loop for single-byte differences */\r
 844             /* use count as the only loop counter variable */\r
 845             diff = source.limit() - source.position();\r
 846             count = target.limit()-target.position();\r
 847             if(count>diff) {\r
 848                 count = diff;\r
 849             }\r
 850             while(count>0) {\r
 851                 if(BOCU1_START_NEG_2 <=(c=source.get(source.position())&UConverterConstants.UNSIGNED_BYTE_MASK) && c< BOCU1_START_POS_2) {\r
 852                     c = prev + (c-BOCU1_MIDDLE);\r
 853                     if(c<0x3000) {\r
 854                         target.put((char)c);\r
 855                         if(offsets!=null){\r
 856                             offsets.put(nextSourceIndex++);\r
 857                         } \r
 858                         prev = BOCU1_SIMPLE_PREV(c);\r
 859                     } else {\r
 860                         break;\r
 861                     }\r
 862                 } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) {\r
 863                     if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) {\r
 864                         prev = BOCU1_ASCII_PREV;\r
 865                     }\r
 866                     target.put((char)c);\r
 867                     if(offsets!=null){\r
 868                         offsets.put(nextSourceIndex++);\r
 869                     } \r
 870                 } else {\r
 871                     break;\r
 872                 }\r
 873                 source.position(source.position()+1);\r
 874                 --count;\r
 875             }\r
 876             sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */\r
 877             return labelType;\r
 878         }\r
 879         \r
 880         private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
 881             labelType = regularLoop;\r
 882             for(;;) {\r
 883                 if(source.position() >= source.limit()) {\r
 884                     labelType = endLoop;\r
 885                     return labelType;\r
 886                 }\r
 887                 ++nextSourceIndex;\r
 888                 c = bytes[byteIndex++] = source.get();\r
 889 \r
 890                 /* trail byte in any position */\r
 891                 c = decodeBocu1TrailByte(count, c);\r
 892                 if(c<0) {\r
 893                     cr = CoderResult.malformedForLength(1);\r
 894                     labelType = endLoop;\r
 895                     return labelType;\r
 896                 }\r
 897 \r
 898                 diff+=c;\r
 899                 if(--count==0) {\r
 900                     /* final trail byte, deliver a code point */\r
 901                     byteIndex=0;\r
 902                     c = prev + diff;\r
 903                     if(c > 0x10ffff) {\r
 904                         cr = CoderResult.malformedForLength(1);\r
 905                         labelType = endLoop;\r
 906                         return labelType;\r
 907                     }\r
 908                     break;\r
 909                 }\r
 910             }\r
 911             afterTrail = true;\r
 912             return labelType;\r
 913             \r
 914         }\r
 915         \r
        /**
         * Regular decoding loop (label regularLoop): decodes single bytes and lead
         * bytes one at a time and writes the resulting code points to target.
         *
         * If afterTrail is true on entry, getTrail() has already left a complete
         * code point in c; the first iteration then skips reading input and goes
         * directly to the output code at the bottom of the loop.
         *
         * @param source  input bytes
         * @param target  receives the decoded UTF-16 code units
         * @param offsets optional source-index output; may be null
         * @return the next label: fastSingle after a single-byte difference below
         *         0x3000, the label set by getTrail() if that did not complete a
         *         code point, otherwise endLoop
         */
        private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            /* decode a sequence of single and lead bytes */
            while(afterTrail || source.hasRemaining()) {
                if(!afterTrail){
                    if(target.position() >= target.limit()) {
                        /* target is full */
                        cr = CoderResult.OVERFLOW;
                        break;
                    }

                    ++nextSourceIndex;
                    c = source.get()&UConverterConstants.UNSIGNED_BYTE_MASK;
                    if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) {
                        /* Write a code point directly from a single-byte difference. */
                        c = prev + (c-BOCU1_MIDDLE);
                        if(c<0x3000) {
                            target.put((char)c);
                            if(offsets!=null){
                                offsets.put(sourceIndex);
                            }
                            prev = BOCU1_SIMPLE_PREV(c);
                            sourceIndex = nextSourceIndex;
                            /* go back to the fast loop */
                            labelType = fastSingle;
                            return labelType;
                        }
                        /* c>=0x3000: fall out of the if-chain to the general output code below */
                    } else if(c <= 0x20) {
                        /*
                         * Direct-encoded C0 control code or space.
                         * Reset prev for C0 control codes but not for space.
                         */
                        if(c != 0x20) {
                            prev=BOCU1_ASCII_PREV;
                        }
                        target.put((char)c);
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                        sourceIndex=nextSourceIndex;
                        continue;
                    } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) {
                        /* Optimize two-byte case. */
                        /* (taken only if the trail byte is available; otherwise the general lead-byte branch below runs) */
                        if(c >= BOCU1_MIDDLE) {
                            diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1;
                        } else {
                            diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1;
                        }

                        /* trail byte */
                        ++nextSourceIndex;
                        c = decodeBocu1TrailByte(1, source.get());
                        /* illegal trail byte, or the code point (compared as unsigned) is above 0x10ffff */
                        if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) {
                            /* record both bytes of the offending sequence */
                            bytes[0]= source.get(source.position()-2);
                            bytes[1]= source.get(source.position()-1);
                            byteIndex = 2;
                            cr = CoderResult.malformedForLength(2);
                            break;
                        }
                    } else if(c == BOCU1_RESET) {
                        /* only reset the state, no code point */
                        prev=BOCU1_ASCII_PREV;
                        sourceIndex=nextSourceIndex;
                        continue;
                    } else {
                        /*
                         * For multi-byte difference lead bytes, set the decoder state
                         * with the partial difference value from the lead byte and
                         * with the number of trail bytes.
                         */
                        bytes[0]= (byte)c;
                        byteIndex = 1;

                        diff = decodeBocu1LeadByte(c);
                        count = diff&3;
                        diff>>=2;
                        /* getTrail() stores its result label in labelType; the return value is not needed */
                        getTrail(source, target, offsets);
                        if(labelType != regularLoop){
                            /* input ran out or the sequence is malformed */
                            return labelType;
                        }
                    }
                }
                
                /* the code point delivered by getTrail() is consumed by the output code below */
                if(afterTrail){
                    afterTrail = false;
                }
                
                /* calculate the next prev and output c */
                prev = BOCU1_PREV(c);
                if(c<=0xffff) {
                    target.put((char)c);
                    if(offsets!=null){
                        offsets.put(sourceIndex);
                    }
                } else {
                    /* output surrogate pair */
                    target.put(UTF16.getLeadSurrogate(c));
                    if(target.hasRemaining()) {
                        target.put(UTF16.getTrailSurrogate(c));
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                            offsets.put(sourceIndex);
                        }
                    } else {
                        /* target overflow */
                        if(offsets!=null){
                            offsets.put(sourceIndex);
                        }
                        /* keep the trail surrogate in the overflow buffer until there is room for it */
                        charErrorBufferArray[0] = UTF16.getTrailSurrogate(c);
                        charErrorBufferLength = 1;
                        cr = CoderResult.OVERFLOW;
                        break;
                } /* end of the target-overflow else (indentation is misleading) */
            } /* end of the surrogate-pair else */
            sourceIndex=nextSourceIndex;
          } /* end of the while loop */
          labelType = endLoop;
          return labelType;
        }
1033         \r
1034         private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){\r
1035             if(cr.isMalformed()) {\r
1036                 /* set the converter state in UConverter to deal with the next character */\r
1037                 toUnicodeStatus = BOCU1_ASCII_PREV;\r
1038                 mode = 0;\r
1039             } else {\r
1040                 /* set the converter state back into UConverter */\r
1041                 toUnicodeStatus=prev;\r
1042                 mode=(diff<<2)|count;\r
1043             }\r
1044             toULength=byteIndex;\r
1045             LabelLoop = false;\r
1046         }\r
1047     \r
1048     }\r
1049     \r
1050     \r
    /**
     * Creates a new BOCU-1 decoder for this charset.
     *
     * @return a new CharsetDecoderBOCU bound to this charset
     */
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderBOCU(this);
    }
1054 \r
    /**
     * Creates a new BOCU-1 encoder for this charset.
     *
     * @return a new CharsetEncoderBOCU bound to this charset
     */
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderBOCU(this);
    }
1058     \r
    /**
     * Fills in the set of code points this charset can convert by delegating to
     * CharsetICU.getCompleteUnicodeSet (presumably the complete Unicode range,
     * going by that helper's name - confirm against CharsetICU).
     *
     * @param setFillIn the set that is passed on to be filled in
     * @param which     selector; ignored by this implementation
     */
    void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
        CharsetICU.getCompleteUnicodeSet(setFillIn);
    }
1062 \r
1063 }\r