jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/UnicodeDecompressor.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2009, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7
   8 package com.ibm.icu.text;
   9
  10 /**
  11 * A decompression engine implementing the Standard Compression Scheme
  12 * for Unicode (SCSU) as outlined in <A
  13 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
  14 * Report #6</A>.
  15 *
  16 * <P><STRONG>USAGE</STRONG></P>
  17 *
  18 * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
  19 * straightforward manner to decompress simple strings:</P>
  20 *
  21 * <PRE>
  22 *  byte [] compressed = ... ; // get compressed bytes from somewhere
  23 *  String result = UnicodeDecompressor.decompress(compressed);
  24 * </PRE>
  25 *
  26 * <P>The static methods have a fairly large memory footprint.
  27 * For finer-grained control over memory usage,
  28 * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
  29 * iterative decompression:</P>
  30 *
  31 * <PRE>
  32 *  // Decompress an array "bytes" of length "len" using a buffer of 512 chars
  33 *  // to the Writer "out"
  34 *
  35 *  UnicodeDecompressor myDecompressor         = new UnicodeDecompressor();
  36 *  final static int    BUFSIZE                = 512;
  37 *  char []             charBuffer             = new char [ BUFSIZE ];
  38 *  int                 charsWritten           = 0;
  39 *  int []              bytesRead              = new int [1];
  40 *  int                 totalBytesDecompressed = 0;
  41 *  int                 totalCharsWritten      = 0;
  42 *
  43 *  do {
  44 *    // do the decompression
  45 *    charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed,
  46 *                                             len, bytesRead,
  47 *                                             charBuffer, 0, BUFSIZE);
  48 *
  49 *    // do something with the current set of chars
  50 *    out.write(charBuffer, 0, charsWritten);
  51 *
  52 *    // update the no. of bytes decompressed
  53 *    totalBytesDecompressed += bytesRead[0];
  54 *
  55 *    // update the no. of chars written
  56 *    totalCharsWritten += charsWritten;
  57 *
  58 *  } while(totalBytesDecompressed < len);
  59 *
  60 *  myDecompressor.reset(); // reuse decompressor
  61 * </PRE>
  62 *
  63 * <P>Decompression is performed according to the standard set forth in
  64 * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
  65 * Report #6</A></P>
  66 *
  67 * @see UnicodeCompressor
  68 *
  69 * @author Stephen F. Booth
  70 * @stable ICU 2.4
  71 */
  72 public final class UnicodeDecompressor implements SCSU
  73 {
  74     //==========================
  75     // Instance variables
  76     //==========================
  77
  78     /** Alias to current dynamic window */
  79     private int       fCurrentWindow   = 0;
  80
  81     /** Dynamic compression window offsets */
  82     private int []    fOffsets         = new int [ NUMWINDOWS ];
  83
  84     /** Current compression mode */
  85     private int       fMode            = SINGLEBYTEMODE;
  86
  87     /** Size of our internal buffer */
  88     private final static int BUFSIZE   = 3;
  89
  90     /** Internal buffer for saving state */
  91     private byte []   fBuffer          = new byte [BUFSIZE];
  92
  93     /** Number of characters in our internal buffer */
  94     private int       fBufferLength    = 0;
  95
  96
  97     /**
  98      * Create a UnicodeDecompressor.
  99      * Sets all windows to their default values.
 100      * @see #reset
 101      * @stable ICU 2.4
 102      */
 103     public UnicodeDecompressor(){
 104         reset();              // initialize to defaults
 105     }
 106
 107     /**
 108      * Decompress a byte array into a String.
 109      * @param buffer The byte array to decompress.
 110      * @return A String containing the decompressed characters.
 111      * @see #decompress(byte [], int, int)
 112      * @stable ICU 2.4
 113      */
 114     public static String decompress(byte [] buffer){
 115         char [] buf = decompress(buffer, 0, buffer.length);
 116         return new String(buf);
 117     }
 118
 119     /**
 120      * Decompress a byte array into a Unicode character array.
 121      * @param buffer The byte array to decompress.
 122      * @param start The start of the byte run to decompress.
 123      * @param limit The limit of the byte run to decompress.
 124      * @return A character array containing the decompressed bytes.
 125      * @see #decompress(byte [])
 126      * @stable ICU 2.4
 127      */
 128     public static char [] decompress(byte [] buffer, int start, int limit) {
 129         UnicodeDecompressor comp = new UnicodeDecompressor();
 130
 131         // use a buffer we know will never overflow
 132         // in the worst case, each byte will decompress
 133         // to a surrogate pair (buffer must be at least 2 chars)
 134         int len = Math.max(2, 2 * (limit - start));
 135         char [] temp = new char [len];
 136
 137         int charCount = comp.decompress(buffer, start, limit, null,
 138                         temp, 0, len);
 139
 140         char [] result = new char [charCount];
 141         System.arraycopy(temp, 0, result, 0, charCount);
 142         return result;
 143     }
 144
 145     /**
 146      * Decompress a byte array into a Unicode character array.
 147      *
 148      * This function will either completely fill the output buffer,
 149      * or consume the entire input.
 150      *
 151      * @param byteBuffer The byte buffer to decompress.
 152      * @param byteBufferStart The start of the byte run to decompress.
 153      * @param byteBufferLimit The limit of the byte run to decompress.
 154      * @param bytesRead A one-element array.  If not null, on return
 155      * the number of bytes read from byteBuffer.
 156      * @param charBuffer A buffer to receive the decompressed data.
 157      * This buffer must be at minimum two characters in size.
 158      * @param charBufferStart The starting offset to which to write
 159      * decompressed data.
 160      * @param charBufferLimit The limiting offset for writing
 161      * decompressed data.
 162      * @return The number of Unicode characters written to charBuffer.
 163      * @stable ICU 2.4
 164      */
 165     public int decompress(byte []    byteBuffer,
 166               int        byteBufferStart,
 167               int        byteBufferLimit,
 168               int []     bytesRead,
 169               char []    charBuffer,
 170               int        charBufferStart,
 171               int        charBufferLimit)
 172     {
 173     // the current position in the source byte buffer
 174     int bytePos      = byteBufferStart;
 175
 176     // the current position in the target char buffer
 177     int ucPos        = charBufferStart;
 178
 179         // the current byte from the source buffer
 180     int aByte        = 0x00;
 181
 182
 183     // charBuffer must be at least 2 chars in size
 184     if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2)
 185         throw new IllegalArgumentException("charBuffer.length < 2");
 186
 187     // if our internal buffer isn't empty, flush its contents
 188     // to the output buffer before doing any more decompression
 189     if(fBufferLength > 0) {
 190
 191         int newBytes = 0;
 192
 193         // fill the buffer completely, to guarantee one full character
 194         if(fBufferLength != BUFSIZE) {
 195         newBytes = fBuffer.length - fBufferLength;
 196
 197         // verify there are newBytes bytes in byteBuffer
 198         if(byteBufferLimit - byteBufferStart < newBytes)
 199             newBytes = byteBufferLimit - byteBufferStart;
 200
 201         System.arraycopy(byteBuffer, byteBufferStart,
 202                  fBuffer, fBufferLength, newBytes);
 203         }
 204
 205         // reset buffer length to 0 before recursive call
 206         fBufferLength = 0;
 207
 208         // call self recursively to decompress the buffer
 209         int count = decompress(fBuffer, 0, fBuffer.length, null,
 210                    charBuffer, charBufferStart,
 211                    charBufferLimit);
 212
 213         // update the positions into the arrays
 214         ucPos += count;
 215         bytePos += newBytes;
 216     }
 217
 218         // the main decompression loop
 219     mainLoop:
 220     while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
 221         switch(fMode) {
 222         case SINGLEBYTEMODE:
 223         // single-byte mode decompression loop
 224         singleByteModeLoop:
 225         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
 226         aByte = byteBuffer[bytePos++] & 0xFF;
 227         switch(aByte) {
 228             // All bytes from 0x80 through 0xFF are remapped
 229             // to chars or surrogate pairs according to the
 230             // currently active window
 231         case 0x80: case 0x81: case 0x82: case 0x83: case 0x84:
 232         case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
 233         case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
 234         case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
 235         case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
 236         case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
 237         case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
 238         case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
 239         case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
 240         case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
 241         case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
 242         case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
 243         case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
 244         case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
 245         case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
 246         case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
 247         case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
 248         case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
 249         case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
 250         case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
 251         case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
 252         case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
 253         case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
 254         case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
 255         case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
 256         case 0xFD: case 0xFE: case 0xFF:
 257             // For offsets <= 0xFFFF, convert to a single char
 258             // by adding the window's offset and subtracting
 259             // the generic compression offset
 260             if(fOffsets[ fCurrentWindow ] <= 0xFFFF) {
 261             charBuffer[ucPos++] = (char)
 262                 (aByte + fOffsets[ fCurrentWindow ]
 263                  - COMPRESSIONOFFSET);
 264             }
 265             // For offsets > 0x10000, convert to a surrogate pair by
 266             // normBase = window's offset - 0x10000
 267             // high surr. = 0xD800 + (normBase >> 10)
 268             // low  surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
 269             else {
 270             // make sure there is enough room to write
 271             // both characters
 272             // if not, save state and break out
 273             if((ucPos + 1) >= charBufferLimit) {
 274                 --bytePos;
 275                 System.arraycopy(byteBuffer, bytePos,
 276                          fBuffer, 0,
 277                          byteBufferLimit - bytePos);
 278                 fBufferLength = byteBufferLimit - bytePos;
 279                 bytePos += fBufferLength;
 280                 break mainLoop;
 281             }
 282
 283             int normalizedBase = fOffsets[ fCurrentWindow ]
 284                 - 0x10000;
 285             charBuffer[ucPos++] = (char)
 286                 (0xD800 + (normalizedBase >> 10));
 287             charBuffer[ucPos++] = (char)
 288                 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F));
 289             }
 290             break;
 291
 292             // bytes from 0x20 through 0x7F are treated as ASCII and
 293             // are remapped to chars by padding the high byte
 294             // (this is the same as quoting from static window 0)
 295             // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D)
 296             // are treated as ASCII as well
 297         case 0x00: case 0x09: case 0x0A: case 0x0D:
 298         case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
 299         case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
 300         case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
 301         case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
 302         case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
 303         case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
 304         case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
 305         case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
 306         case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
 307         case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
 308         case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
 309         case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
 310         case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
 311         case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
 312         case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
 313         case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
 314         case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
 315         case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
 316         case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
 317         case 0x7F:
 318             charBuffer[ucPos++] = (char) aByte;
 319             break;
 320
 321             // quote unicode
 322         case SQUOTEU:
 323             // verify we have two bytes following tag
 324             // if not, save state and break out
 325             if( (bytePos + 1) >= byteBufferLimit ) {
 326             --bytePos;
 327             System.arraycopy(byteBuffer, bytePos,
 328                      fBuffer, 0,
 329                      byteBufferLimit - bytePos);
 330             fBufferLength = byteBufferLimit - bytePos;
 331             bytePos += fBufferLength;
 332             break mainLoop;
 333             }
 334
 335             aByte = byteBuffer[bytePos++];
 336             charBuffer[ucPos++] = (char)
 337             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
 338             break;
 339
 340             // switch to Unicode mode
 341         case SCHANGEU:
 342             fMode = UNICODEMODE;
 343             break singleByteModeLoop;
 344             //break;
 345
 346             // handle all quote tags
 347         case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
 348         case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
 349             // verify there is a byte following the tag
 350             // if not, save state and break out
 351             if(bytePos >= byteBufferLimit) {
 352             --bytePos;
 353             System.arraycopy(byteBuffer, bytePos,
 354                      fBuffer, 0,
 355                      byteBufferLimit - bytePos);
 356             fBufferLength = byteBufferLimit - bytePos;
 357             bytePos += fBufferLength;
 358             break mainLoop;
 359             }
 360
 361             // if the byte is in the range 0x00 - 0x7F, use
 362             // static window n otherwise, use dynamic window n
 363             int dByte = byteBuffer[bytePos++] & 0xFF;
 364             charBuffer[ucPos++] = (char)
 365             (dByte+ (dByte >= 0x00 && dByte < 0x80
 366                  ? sOffsets[aByte - SQUOTE0]
 367                  : (fOffsets[aByte - SQUOTE0]
 368                     - COMPRESSIONOFFSET)));
 369             break;
 370
 371             // handle all change tags
 372         case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
 373         case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
 374             fCurrentWindow = aByte - SCHANGE0;
 375             break;
 376
 377             // handle all define tags
 378         case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
 379         case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
 380             // verify there is a byte following the tag
 381             // if not, save state and break out
 382             if(bytePos >= byteBufferLimit) {
 383             --bytePos;
 384             System.arraycopy(byteBuffer, bytePos,
 385                      fBuffer, 0,
 386                      byteBufferLimit - bytePos);
 387             fBufferLength = byteBufferLimit - bytePos;
 388             bytePos += fBufferLength;
 389             break mainLoop;
 390             }
 391
 392             fCurrentWindow = aByte - SDEFINE0;
 393             fOffsets[fCurrentWindow] =
 394             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
 395             break;
 396
 397             // handle define extended tag
 398         case SDEFINEX:
 399             // verify we have two bytes following tag
 400             // if not, save state and break out
 401             if((bytePos + 1) >= byteBufferLimit ) {
 402             --bytePos;
 403             System.arraycopy(byteBuffer, bytePos,
 404                      fBuffer, 0,
 405                      byteBufferLimit - bytePos);
 406             fBufferLength = byteBufferLimit - bytePos;
 407             bytePos += fBufferLength;
 408             break mainLoop;
 409             }
 410
 411             aByte = byteBuffer[bytePos++] & 0xFF;
 412             fCurrentWindow = (aByte & 0xE0) >> 5;
 413             fOffsets[fCurrentWindow] = 0x10000 +
 414             (0x80 * (((aByte & 0x1F) << 8)
 415                  | (byteBuffer[bytePos++] & 0xFF)));
 416             break;
 417
 418             // reserved, shouldn't happen
 419         case SRESERVED:
 420             break;
 421
 422         } // end switch
 423         } // end while
 424         break;
 425
 426         case UNICODEMODE:
 427         // unicode mode decompression loop
 428         unicodeModeLoop:
 429         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
 430         aByte = byteBuffer[bytePos++] & 0xFF;
 431         switch(aByte) {
 432             // handle all define tags
 433         case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
 434         case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
 435             // verify there is a byte following tag
 436             // if not, save state and break out
 437             if(bytePos >= byteBufferLimit ) {
 438             --bytePos;
 439             System.arraycopy(byteBuffer, bytePos,
 440                      fBuffer, 0,
 441                      byteBufferLimit - bytePos);
 442             fBufferLength = byteBufferLimit - bytePos;
 443             bytePos += fBufferLength;
 444             break mainLoop;
 445             }
 446
 447             fCurrentWindow = aByte - UDEFINE0;
 448             fOffsets[fCurrentWindow] =
 449             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
 450             fMode = SINGLEBYTEMODE;
 451             break unicodeModeLoop;
 452             //break;
 453
 454             // handle define extended tag
 455         case UDEFINEX:
 456             // verify we have two bytes following tag
 457             // if not, save state and break out
 458             if((bytePos + 1) >= byteBufferLimit ) {
 459             --bytePos;
 460             System.arraycopy(byteBuffer, bytePos,
 461                      fBuffer, 0,
 462                      byteBufferLimit - bytePos);
 463             fBufferLength = byteBufferLimit - bytePos;
 464             bytePos += fBufferLength;
 465             break mainLoop;
 466             }
 467
 468             aByte = byteBuffer[bytePos++] & 0xFF;
 469             fCurrentWindow = (aByte & 0xE0) >> 5;
 470             fOffsets[fCurrentWindow] = 0x10000 +
 471             (0x80 * (((aByte & 0x1F) << 8)
 472                  | (byteBuffer[bytePos++] & 0xFF)));
 473             fMode = SINGLEBYTEMODE;
 474             break unicodeModeLoop;
 475             //break;
 476
 477             // handle all change tags
 478         case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
 479         case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
 480             fCurrentWindow = aByte - UCHANGE0;
 481             fMode = SINGLEBYTEMODE;
 482             break unicodeModeLoop;
 483             //break;
 484
 485             // quote unicode
 486         case UQUOTEU:
 487             // verify we have two bytes following tag
 488             // if not, save state and break out
 489             if(bytePos >= byteBufferLimit  - 1) {
 490             --bytePos;
 491             System.arraycopy(byteBuffer, bytePos,
 492                      fBuffer, 0,
 493                      byteBufferLimit - bytePos);
 494             fBufferLength = byteBufferLimit - bytePos;
 495             bytePos += fBufferLength;
 496             break mainLoop;
 497             }
 498
 499             aByte = byteBuffer[bytePos++];
 500             charBuffer[ucPos++] = (char)
 501             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
 502             break;
 503
 504         default:
 505             // verify there is a byte following tag
 506             // if not, save state and break out
 507             if(bytePos >= byteBufferLimit ) {
 508             --bytePos;
 509             System.arraycopy(byteBuffer, bytePos,
 510                      fBuffer, 0,
 511                      byteBufferLimit - bytePos);
 512             fBufferLength = byteBufferLimit - bytePos;
 513             bytePos += fBufferLength;
 514             break mainLoop;
 515             }
 516
 517             charBuffer[ucPos++] = (char)
 518             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
 519             break;
 520
 521         } // end switch
 522         } // end while
 523         break;
 524
 525         } // end switch( fMode )
 526     } // end while
 527
 528         // fill in output parameter
 529     if(bytesRead != null)
 530         bytesRead [0] = (bytePos - byteBufferStart);
 531
 532         // return # of chars written
 533     return (ucPos - charBufferStart);
 534     }
 535
 536     /**
 537      * Reset the decompressor to its initial state.
 538      * @stable ICU 2.4
 539      */
 540     public void reset()
 541     {
 542         // reset dynamic windows
 543         fOffsets[0] = 0x0080;    // Latin-1
 544         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
 545         fOffsets[2] = 0x0400;    // Cyrillic
 546         fOffsets[3] = 0x0600;    // Arabic
 547         fOffsets[4] = 0x0900;    // Devanagari
 548         fOffsets[5] = 0x3040;    // Hiragana
 549         fOffsets[6] = 0x30A0;    // Katakana
 550         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
 551
 552
 553         fCurrentWindow  = 0;                // Make current window Latin-1
 554         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
 555         fBufferLength   = 0;                // Empty buffer
 556     }
 557 }