jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/UnicodeCompressor.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2009, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8
   9 /**
  10 * A compression engine implementing the Standard Compression Scheme
  11 * for Unicode (SCSU) as outlined in <A
  12 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
  13 * Report #6</A>.
  14 *
  15 * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
  16 * consisting of 128 consecutive characters in Unicode.  During compression,
  17 * characters within a window are encoded in the compressed stream as the bytes
  18 * <TT>0x7F - 0xFF</TT>. The SCSU provides transparency for the characters
  19 * (bytes) between <TT>U+0000 - U+00FF</TT>.  The SCSU approximates the
  20 * storage size of traditional character sets, for example 1 byte per
  21 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
  22 * ideographs.</P>
  23 *
  24 * <P><STRONG>USAGE</STRONG></P>
  25 *
  26 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
  27 * straightforward manner to compress simple strings:</P>
  28 *
  29 * <PRE>
  30 *  String s = ... ; // get string from somewhere
  31 *  byte [] compressed = UnicodeCompressor.compress(s);
  32 * </PRE>
  33 *
  34 * <P>The static methods have a fairly large memory footprint.
  35 * For finer-grained control over memory usage,
  36 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
  37 * iterative compression:</P>
  38 *
  39 * <PRE>
  40 *  // Compress an array "chars" of length "len" using a buffer of 512 bytes
  41 *  // to the OutputStream "out"
  42 *
  43 *  UnicodeCompressor myCompressor         = new UnicodeCompressor();
  44 *  final static int  BUFSIZE              = 512;
  45 *  byte []           byteBuffer           = new byte [ BUFSIZE ];
  46 *  int               bytesWritten         = 0;
  47 *  int []            unicharsRead         = new int [1];
  48 *  int               totalCharsCompressed = 0;
  49 *  int               totalBytesWritten    = 0;
  50 *
  51 *  do {
  52 *    // do the compression
  53 *    bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
  54 *                                         len, unicharsRead,
  55 *                                         byteBuffer, 0, BUFSIZE);
  56 *
  57 *    // do something with the current set of bytes
  58 *    out.write(byteBuffer, 0, bytesWritten);
  59 *
  60 *    // update the no. of characters compressed
  61 *    totalCharsCompressed += unicharsRead[0];
  62 *
  63 *    // update the no. of bytes written
  64 *    totalBytesWritten += bytesWritten;
  65 *
  66 *  } while(totalCharsCompressed < len);
  67 *
  68 *  myCompressor.reset(); // reuse compressor
  69 * </PRE>
  70 *
  71 * @see UnicodeDecompressor
  72 *
  73 * @author Stephen F. Booth
  74 * @stable ICU 2.4
  75 */
  76
  77 /*
  78 *
  79 * COMPRESSION STRATEGY
  80 *
  81 * Single Byte Mode
  82 *
  83 * There are three relevant cases.
  84 * If the character is in the current window or is Latin-1 (U+0000,
  85 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
  86 * directly in the stream as a single byte.
  87 *
  88 *  1. Current character is in defined, inactive window.
  89 *  2. Current character is in undefined window.
  90 *  3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
  91 *
  92 *  1. Current character is in defined, inactive window
  93 *    A. Look ahead two characters
  94 *    B. If both following characters in same window as current character,
  95 *       switch to defined window
  96 *    C. If only next character is in same window as current character,
  97 *       quote defined window
  98 *    D. If neither of following characters is in same window as current,
  99 *       quote defined window
 100 *
 101 *  2. Current character is in undefined window
 102 *    A. Look ahead two characters
 103 *    B. If both following characters in same window as current character,
 104 *       define new window
 105 *    C. If only next character in same window as current character,
 106 *       switch to Unicode mode
 107 *       NOTE: This costs us one extra byte.  However,
 108 *        since we have a limited number of windows to work with, it is
 109 *        assumed the cost will pay off later in savings from a window with
 110 *        more characters in it.
 111 *    D. If neither of following characters in same window as current,
 112 *       switch to Unicode mode.  Alternative to above: just quote
 113 *       Unicode (same byte cost)
 114 *
 115 *  3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
 116 *    A. Look ahead one character
 117 *    B. If next character in non-compressible region, switch to
 118 *       Unicode mode
 119 *    C. If next character not in non-compressible region, quote Unicode
 120 *
 121 *
 122 * The following chart illustrates the bytes required for encoding characters
 123 * in each possible way
 124 *
 125 *
 126 *                                   SINGLE BYTE MODE
 127 *                                       Characters in a row with same index
 128 *               tag encountered             1       2       3       4
 129 *               ---------------------------------------------------------------
 130 *               none (in current window)    1       2       3       4
 131 *
 132 *               quote Unicode               3       6       9       12
 133 *
 134 *   window not  switch to Unicode           3       5       7       9     byte
 135 *   defined     define window               3       4       5       6     cost
 136 *
 137 *   window      switch to window            2       3       4       5
 138 *   defined     quote window                2       4       6       8
 139 *
 140 *  Unicode Mode
 141 *
 142 * There are two relevant cases.
 143 * If the character is in the non-compressible region
 144 * (U+3400 - U+DFFF), the character is simply written to the
 145 * stream as a pair of bytes.
 146 *
 147 * 1. Current character is in defined, inactive window.
 148 * 2. Current character is in undefined window.
 149 *
 150 *  1.Current character is in defined, inactive window
 151 *    A. Look ahead one character
 152 *    B. If next character has same index as current character,
 153 *       switch to defined window (and switch to single-byte mode)
 154 *    C. If not, just put bytes in stream
 155 *
 156 *
 157 *  2. Current character is in undefined window
 158 *    A. Look ahead two characters
 159 *    B. If both in same window as current character, define window
 160 *       (and switch to single-byte mode)
 161 *    C. If only next character in same window, just put bytes in stream
 162 *        NOTE: This costs us one extra byte.  However,
 163 *        since we have a limited number of windows to work with, it is
 164 *        assumed the cost will pay off later in savings from a window with
 165 *        more characters in it.
 166 *    D. If neither in same window, put bytes in stream
 167 *
 168 *
 169 * The following chart illustrates the bytes required for encoding characters
 170 * in each possible way
 171 *
 172 *
 173 *                                   UNICODE MODE
 174 *                                       Characters in a row with same index
 175 *               tag encountered             1       2       3       4
 176 *               ---------------------------------------------------------------
 177 *               none                        2       4       6       8
 178 *
 179 *               quote Unicode               3       6       9       12
 180 *
 181 *   window not  define window               3       4       5       6     byte
 182 *   defined                                                               cost
 183 *   window      switch to window            2       3       4       5
 184 *   defined
 185 */
 186 public final class UnicodeCompressor implements SCSU
 187 {
 188     //==========================
 189     // Class variables
 190     //==========================
 191
 192     /** For quick identification of a byte as a single-byte mode tag */
 193     private static boolean [] sSingleTagTable = {
 194         // table generated by CompressionTableGenerator
 195         false, true, true, true, true, true, true, true, true, false,
 196     false, true, true, false, true, true, true, true, true, true,
 197     true, true, true, true, true, true, true, true, true, true,
 198     true, true, false, false, false, false, false, false,false,
 199     false, false, false, false, false, false, false, false, false,
 200     false, false, false, false, false, false, false, false, false,
 201     false, false, false, false, false, false, false, false, false,
 202     false, false, false, false, false, false, false, false, false,
 203     false, false, false, false, false, false, false, false, false,
 204     false, false, false, false, false, false, false, false, false,
 205     false, false, false, false, false, false, false, false, false,
 206     false, false, false, false, false, false, false, false, false,
 207     false, false, false, false, false, false, false, false, false,
 208     false, false, false, false, false, false, false, false, false,
 209     false, false, false, false, false, false, false, false, false,
 210     false, false, false, false, false, false, false, false, false,
 211     false, false, false, false, false, false, false, false, false,
 212     false, false, false, false, false, false, false, false, false,
 213     false, false, false, false, false, false, false, false, false,
 214     false, false, false, false, false, false, false, false, false,
 215     false, false, false, false, false, false, false, false, false,
 216     false, false, false, false, false, false, false, false, false,
 217     false, false, false, false, false, false, false, false, false,
 218     false, false, false, false, false, false, false, false, false,
 219     false, false, false, false, false, false, false, false, false,
 220     false, false, false, false, false, false, false, false, false,
 221     false, false, false, false, false, false, false, false, false,
 222     false, false, false, false, false, false, false, false, false,
 223     false
 224     };
 225
 226     /** For quick identification of a byte as a unicode mode tag */
 227     private static boolean [] sUnicodeTagTable = {
 228         // table generated by CompressionTableGenerator
 229         false, false, false, false, false, false, false, false, false,
 230     false, false, false, false, false, false, false, false, false,
 231     false, false, false, false, false, false, false, false, false,
 232     false, false, false, false, false, false, false, false, false,
 233     false, false, false, false, false, false, false, false, false,
 234     false, false, false, false, false, false, false, false, false,
 235     false, false, false, false, false, false, false, false, false,
 236     false, false, false, false, false, false, false, false, false,
 237     false, false, false, false, false, false, false, false, false,
 238     false, false, false, false, false, false, false, false, false,
 239     false, false, false, false, false, false, false, false, false,
 240     false, false, false, false, false, false, false, false, false,
 241     false, false, false, false, false, false, false, false, false,
 242     false, false, false, false, false, false, false, false, false,
 243     false, false, false, false, false, false, false, false, false,
 244     false, false, false, false, false, false, false, false, false,
 245     false, false, false, false, false, false, false, false, false,
 246     false, false, false, false, false, false, false, false, false,
 247     false, false, false, false, false, false, false, false, false,
 248     false, false, false, false, false, false, false, false, false,
 249     false, false, false, false, false, false, false, false, false,
 250     false, false, false, false, false, false, false, false, false,
 251     false, false, false, false, false, false, false, false, false,
 252     false, false, false, false, false, false, false, false, false,
 253     false, false, false, false, false, false, false, false, true,
 254     true, true, true, true, true, true, true, true, true, true,
 255     true, true, true, true, true, true, true, true, false, false,
 256     false, false, false, false, false, false, false, false, false,
 257     false, false
 258     };
 259
 260     //==========================
 261     // Instance variables
 262     //==========================
 263
 264     /** Alias to current dynamic window */
 265     private int       fCurrentWindow   = 0;
 266
 267     /** Dynamic compression window offsets */
 268     private int []    fOffsets         = new int [ NUMWINDOWS ];
 269
 270     /** Current compression mode */
 271     private int       fMode            = SINGLEBYTEMODE;
 272
 273     /** Keeps count of times character indices are encountered */
 274     private int []    fIndexCount      = new int [ MAXINDEX + 1 ];
 275
 276     /** The time stamps indicate when a window was last defined */
 277     private int []    fTimeStamps      = new int [ NUMWINDOWS ];
 278
 279     /** The current time stamp */
 280     private int       fTimeStamp       = 0;
 281
 282
 283     /**
 284      * Create a UnicodeCompressor.
 285      * Sets all windows to their default values.
 286      * @see #reset
 287      * @stable ICU 2.4
 288      */
 289     public UnicodeCompressor()
 290     {
 291     reset();              // initialize to defaults
 292     }
 293
 294     /**
 295      * Compress a string into a byte array.
 296      * @param buffer The string to compress.
 297      * @return A byte array containing the compressed characters.
 298      * @see #compress(char [], int, int)
 299      * @stable ICU 2.4
 300      */
 301     public static byte [] compress(String buffer)
 302     {
 303     return compress(buffer.toCharArray(), 0, buffer.length());
 304     }
 305
 306     /**
 307      * Compress a Unicode character array into a byte array.
 308      * @param buffer The character buffer to compress.
 309      * @param start The start of the character run to compress.
 310      * @param limit The limit of the character run to compress.
 311      * @return A byte array containing the compressed characters.
 312      * @see #compress(String)
 313      * @stable ICU 2.4
 314      */
 315     public static byte [] compress(char [] buffer,
 316                    int start,
 317                    int limit)
 318     {
 319     UnicodeCompressor comp = new UnicodeCompressor();
 320
 321     // use a buffer that we know will never overflow
 322     // in the worst case, each character will take 3 bytes
 323     // to encode: UQU, hibyte, lobyte.  In this case, the
 324     // compressed data will look like: SCU, UQU, hibyte, lobyte, ...
 325     // buffer must be at least 4 bytes in size
 326     int len = Math.max(4, 3 * (limit - start) + 1);
 327     byte [] temp = new byte [len];
 328
 329     int byteCount = comp.compress(buffer, start, limit, null,
 330                       temp, 0, len);
 331
 332     byte [] result = new byte [byteCount];
 333     System.arraycopy(temp, 0, result, 0, byteCount);
 334     return result;
 335     }
 336
 337     /**
 338      * Compress a Unicode character array into a byte array.
 339      *
 340      * This function will only consume input that can be completely
 341      * output.
 342      *
 343      * @param charBuffer The character buffer to compress.
 344      * @param charBufferStart The start of the character run to compress.
 345      * @param charBufferLimit The limit of the character run to compress.
 346      * @param charsRead A one-element array.  If not null, on return
 347      * the number of characters read from charBuffer.
 348      * @param byteBuffer A buffer to receive the compressed data.  This
 349      * buffer must be at minimum four bytes in size.
 350      * @param byteBufferStart The starting offset to which to write
 351      * compressed data.
 352      * @param byteBufferLimit The limiting offset for writing compressed data.
 353      * @return The number of bytes written to byteBuffer.
 354      * @stable ICU 2.4
 355      */
 356     public int compress(char []     charBuffer,
 357             int         charBufferStart,
 358             int         charBufferLimit,
 359             int []      charsRead,
 360             byte []     byteBuffer,
 361             int         byteBufferStart,
 362             int         byteBufferLimit)
 363     {
 364         // the current position in the target byte buffer
 365     int     bytePos       = byteBufferStart;
 366
 367     // the current position in the source unicode character buffer
 368     int     ucPos         = charBufferStart;
 369
 370     // the current unicode character from the source buffer
 371     int     curUC         = INVALIDCHAR;
 372
 373     // the index for the current character
 374         int     curIndex      = -1;
 375
 376     // look ahead
 377     int     nextUC        = INVALIDCHAR;
 378     int     forwardUC     = INVALIDCHAR;
 379
 380         // temporary for window searching
 381     int     whichWindow   = 0;
 382
 383     // high and low bytes of the current unicode character
 384     int     hiByte        = 0;
 385     int     loByte        = 0;
 386
 387
 388     // byteBuffer must be at least 4 bytes in size
 389     if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
 390         throw new IllegalArgumentException("byteBuffer.length < 4");
 391
 392     mainLoop:
 393     while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
 394         switch(fMode) {
 395         // main single byte mode compression loop
 396         case SINGLEBYTEMODE:
 397         singleByteModeLoop:
 398         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
 399         // get current char
 400         curUC = charBuffer[ucPos++];
 401
 402         // get next char
 403         if(ucPos < charBufferLimit)
 404             nextUC = charBuffer[ucPos];
 405         else
 406             nextUC = INVALIDCHAR;
 407
 408         // chars less than 0x0080 (excluding tags) go straight
 409         // in stream
 410         if(curUC < 0x0080) {
 411             loByte = curUC & 0xFF;
 412
 413             // we need to check and make sure we don't
 414             // accidentally write a single byte mode tag to
 415             // the stream unless it's quoted
 416             if(sSingleTagTable[loByte]) {
 417                                 // make sure there is enough room to
 418                                 // write both bytes if not, rewind the
 419                                 // source stream and break out
 420             if( (bytePos + 1) >= byteBufferLimit)
 421                 { --ucPos; break mainLoop; }
 422
 423             // since we know the byte is less than 0x80, SQUOTE0
 424             // will use static window 0, or ASCII
 425             byteBuffer[bytePos++] = (byte) SQUOTE0;
 426             }
 427
 428             byteBuffer[bytePos++] = (byte) loByte;
 429         }
 430
 431         // if the char belongs to current window, convert it
 432         // to a byte by adding the generic compression offset
 433         // and subtracting the window's offset
 434         else if(inDynamicWindow(curUC, fCurrentWindow) ) {
 435             byteBuffer[bytePos++] = (byte)
 436             (curUC - fOffsets[ fCurrentWindow ]
 437              + COMPRESSIONOFFSET);
 438         }
 439
 440         // if char is not in compressible range, either switch to or
 441         // quote from unicode
 442         else if( ! isCompressible(curUC) ) {
 443             // only check next character if it is valid
 444             if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
 445                                 // make sure there is enough room to
 446                                 // write all three bytes if not,
 447                                 // rewind the source stream and break
 448                                 // out
 449             if( (bytePos + 2) >= byteBufferLimit)
 450                 { --ucPos; break mainLoop; }
 451
 452             byteBuffer[bytePos++] = (byte) SQUOTEU;
 453             byteBuffer[bytePos++] = (byte) (curUC >>> 8);
 454             byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
 455             }
 456             else {
 457                                 // make sure there is enough room to
 458                                 // write all four bytes if not, rewind
 459                                 // the source stream and break out
 460             if((bytePos + 3) >= byteBufferLimit)
 461                 { --ucPos; break mainLoop; }
 462
 463             byteBuffer[bytePos++] = (byte) SCHANGEU;
 464
 465             hiByte = curUC >>> 8;
 466             loByte = curUC & 0xFF;
 467
 468             if(sUnicodeTagTable[hiByte])
 469                 // add quote Unicode tag
 470                 byteBuffer[bytePos++]   = (byte) UQUOTEU;
 471
 472             byteBuffer[bytePos++] = (byte) hiByte;
 473             byteBuffer[bytePos++] = (byte) loByte;
 474
 475             fMode = UNICODEMODE;
 476             break singleByteModeLoop;
 477             }
 478         }
 479
 480         // if the char is in a currently defined dynamic
 481         // window, figure out which one, and either switch to
 482         // it or quote from it
 483         else if((whichWindow = findDynamicWindow(curUC))
 484             != INVALIDWINDOW ) {
 485             // look ahead
 486             if( (ucPos + 1) < charBufferLimit )
 487             forwardUC = charBuffer[ucPos + 1];
 488             else
 489             forwardUC = INVALIDCHAR;
 490
 491             // all three chars in same window, switch to that
 492             // window inDynamicWindow will return false for
 493             // INVALIDCHAR
 494             if(inDynamicWindow(nextUC, whichWindow)
 495                && inDynamicWindow(forwardUC, whichWindow)) {
 496                                 // make sure there is enough room to
 497                                 // write both bytes if not, rewind the
 498                                 // source stream and break out
 499             if( (bytePos + 1) >= byteBufferLimit)
 500                 { --ucPos; break mainLoop; }
 501
 502             byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
 503             byteBuffer[bytePos++] = (byte)
 504                 (curUC - fOffsets[whichWindow]
 505                  + COMPRESSIONOFFSET);
 506             fTimeStamps [ whichWindow ] = ++fTimeStamp;
 507             fCurrentWindow = whichWindow;
 508             }
 509
 510             // either only next char or neither in same
 511             // window, so quote
 512             else {
 513                                 // make sure there is enough room to
 514                                 // write both bytes if not, rewind the
 515                                 // source stream and break out
 516             if((bytePos + 1) >= byteBufferLimit)
 517                 { --ucPos; break mainLoop; }
 518
 519             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
 520             byteBuffer[bytePos++] = (byte)
 521                 (curUC - fOffsets[whichWindow]
 522                  + COMPRESSIONOFFSET);
 523             }
 524         }
 525
 526         // if a static window is defined, and the following
 527         // character is not in that static window, quote from
 528         // the static window Note: to quote from a static
 529         // window, don't add 0x80
 530         else if((whichWindow = findStaticWindow(curUC))
 531             != INVALIDWINDOW
 532             && ! inStaticWindow(nextUC, whichWindow) ) {
 533             // make sure there is enough room to write both
 534             // bytes if not, rewind the source stream and
 535             // break out
 536             if((bytePos + 1) >= byteBufferLimit)
 537             { --ucPos; break mainLoop; }
 538
 539             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
 540             byteBuffer[bytePos++] = (byte)
 541             (curUC - sOffsets[whichWindow]);
 542         }
 543
 544         // if a window is not defined, decide if we want to
 545         // define a new one or switch to unicode mode
 546         else {
 547             // determine index for current char (char is compressible)
 548             curIndex = makeIndex(curUC);
 549             fIndexCount[curIndex]++;
 550
 551             // look ahead
 552             if((ucPos + 1) < charBufferLimit)
 553             forwardUC = charBuffer[ucPos + 1];
 554             else
 555             forwardUC = INVALIDCHAR;
 556
 557             // if we have encountered this index at least once
 558             // before, define a new window
 559             // OR
 560             // three chars in a row with same index, define a
 561             // new window (makeIndex will return RESERVEDINDEX
 562             // for INVALIDCHAR)
 563             if((fIndexCount[curIndex] > 1) ||
 564                (curIndex == makeIndex(nextUC)
 565             && curIndex == makeIndex(forwardUC))) {
 566             // make sure there is enough room to write all
 567             // three bytes if not, rewind the source
 568             // stream and break out
 569             if( (bytePos + 2) >= byteBufferLimit)
 570                 { --ucPos; break mainLoop; }
 571
 572             // get least recently defined window
 573             whichWindow = getLRDefinedWindow();
 574
 575             byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
 576             byteBuffer[bytePos++] = (byte) curIndex;
 577             byteBuffer[bytePos++] = (byte)
 578                 (curUC - sOffsetTable[curIndex]
 579                  + COMPRESSIONOFFSET);
 580
 581             fOffsets[whichWindow] = sOffsetTable[curIndex];
 582             fCurrentWindow = whichWindow;
 583             fTimeStamps [whichWindow] = ++fTimeStamp;
 584             }
 585
 586             // only two chars in a row with same index, so
 587             // switch to unicode mode (makeIndex will return
 588             // RESERVEDINDEX for INVALIDCHAR)
 589             // OR
 590             // three chars have different indices, so switch
 591             // to unicode mode
 592             else {
 593             // make sure there is enough room to write all
 594             // four bytes if not, rewind the source stream
 595             // and break out
 596             if((bytePos + 3) >= byteBufferLimit)
 597                 { --ucPos; break mainLoop; }
 598
 599             byteBuffer[bytePos++] = (byte) SCHANGEU;
 600
 601             hiByte = curUC >>> 8;
 602             loByte = curUC & 0xFF;
 603
 604             if(sUnicodeTagTable[hiByte])
 605                 // add quote Unicode tag
 606                 byteBuffer[bytePos++] = (byte) UQUOTEU;
 607
 608             byteBuffer[bytePos++] = (byte) hiByte;
 609             byteBuffer[bytePos++] = (byte) loByte;
 610
 611             fMode = UNICODEMODE;
 612             break singleByteModeLoop;
 613             }
 614         }
 615         }
 616         break;
 617
 618         case UNICODEMODE:
 619         // main unicode mode compression loop
 620         unicodeModeLoop:
 621         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
 622         // get current char
 623         curUC = charBuffer[ucPos++];
 624
 625         // get next char
 626         if( ucPos < charBufferLimit )
 627             nextUC = charBuffer[ucPos];
 628         else
 629             nextUC = INVALIDCHAR;
 630
 631         // if we have two uncompressible chars in a row,
 632         // put the current char's bytes in the stream
 633         if( ! isCompressible(curUC)
 634             || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
 635             // make sure there is enough room to write all three bytes
 636             // if not, rewind the source stream and break out
 637             if( (bytePos + 2) >= byteBufferLimit)
 638             { --ucPos; break mainLoop; }
 639
 640             hiByte = curUC >>> 8;
 641             loByte = curUC & 0xFF;
 642
 643             if(sUnicodeTagTable[ hiByte ])
 644             // add quote Unicode tag
 645             byteBuffer[bytePos++] = (byte) UQUOTEU;
 646
 647             byteBuffer[bytePos++] = (byte) hiByte;
 648             byteBuffer[bytePos++] = (byte) loByte;
 649         }
 650
 651         // bytes less than 0x80 can go straight in the stream,
 652         // but in single-byte mode
 653         else if(curUC < 0x0080) {
 654             loByte = curUC & 0xFF;
 655
 656             // if two chars in a row below 0x80 and the
 657             // current char is not a single-byte mode tag,
 658             // switch to single-byte mode
 659             if(nextUC != INVALIDCHAR
 660                && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
 661                                 // make sure there is enough room to
 662                                 // write both bytes if not, rewind the
 663                                 // source stream and break out
 664             if( (bytePos + 1) >= byteBufferLimit)
 665                 { --ucPos; break mainLoop; }
 666
 667             // use the last-active window
 668             whichWindow = fCurrentWindow;
 669             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
 670             byteBuffer[bytePos++] = (byte) loByte;
 671
 672             //fCurrentWindow = 0;
 673             fTimeStamps [whichWindow] = ++fTimeStamp;
 674             fMode = SINGLEBYTEMODE;
 675             break unicodeModeLoop;
 676             }
 677
 678             // otherwise, just write the bytes to the stream
 679             // (this will cover the case of only 1 char less than 0x80
 680             // and single-byte mode tags)
 681             else {
 682                                 // make sure there is enough room to
 683                                 // write both bytes if not, rewind the
 684                                 // source stream and break out
 685             if((bytePos + 1) >= byteBufferLimit)
 686                 { --ucPos; break mainLoop; }
 687
 688             // since the character is less than 0x80, the
 689             // high byte is always 0x00 - no need for
 690             // (curUC >>> 8)
 691             byteBuffer[bytePos++] = (byte) 0x00;
 692             byteBuffer[bytePos++] = (byte) loByte;
 693             }
 694         }
 695
 696         // figure out if the current char is in a defined window
 697         else if((whichWindow = findDynamicWindow(curUC))
 698             != INVALIDWINDOW ) {
 699             // if two chars in a row in the same window,
 700             // switch to that window and go to single-byte mode
 701             // inDynamicWindow will return false for INVALIDCHAR
 702             if(inDynamicWindow(nextUC, whichWindow)) {
 703                                 // make sure there is enough room to
 704                                 // write both bytes if not, rewind the
 705                                 // source stream and break out
 706             if((bytePos + 1) >= byteBufferLimit)
 707                 { --ucPos; break mainLoop; }
 708
 709             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
 710             byteBuffer[bytePos++] = (byte)
 711                 (curUC - fOffsets[whichWindow]
 712                  + COMPRESSIONOFFSET);
 713
 714             fTimeStamps [ whichWindow ] = ++fTimeStamp;
 715             fCurrentWindow = whichWindow;
 716             fMode = SINGLEBYTEMODE;
 717             break unicodeModeLoop;
 718             }
 719
 720             // otherwise, just quote the unicode for the char
 721             else {
 722                                 // make sure there is enough room to
 723                                 // write all three bytes if not,
 724                                 // rewind the source stream and break
 725                                 // out
 726             if((bytePos + 2) >= byteBufferLimit)
 727                 { --ucPos; break mainLoop; }
 728
 729             hiByte = curUC >>> 8;
 730             loByte = curUC & 0xFF;
 731
 732             if(sUnicodeTagTable[ hiByte ])
 733                 // add quote Unicode tag
 734                 byteBuffer[bytePos++] = (byte) UQUOTEU;
 735
 736             byteBuffer[bytePos++] = (byte) hiByte;
 737             byteBuffer[bytePos++] = (byte) loByte;
 738             }
 739         }
 740
 741         // char is not in a defined window
 742         else {
 743             // determine index for current char (char is compressible)
 744             curIndex = makeIndex(curUC);
 745             fIndexCount[curIndex]++;
 746
 747             // look ahead
 748             if( (ucPos + 1) < charBufferLimit )
 749             forwardUC = charBuffer[ucPos + 1];
 750             else
 751             forwardUC = INVALIDCHAR;
 752
 753             // if we have encountered this index at least once
 754             // before, define a new window for it that hasn't
 755             // previously been redefined
 756             // OR
 757             // if three chars in a row with the same index,
 758             // define a new window (makeIndex will return
 759             // RESERVEDINDEX for INVALIDCHAR)
 760             if((fIndexCount[curIndex] > 1) ||
 761                (curIndex == makeIndex(nextUC)
 762             && curIndex == makeIndex(forwardUC))) {
 763                                 // make sure there is enough room to
 764                                 // write all three bytes if not,
 765                                 // rewind the source stream and break
 766                                 // out
 767             if((bytePos + 2) >= byteBufferLimit)
 768                 { --ucPos; break mainLoop; }
 769
 770             // get least recently defined window
 771             whichWindow = getLRDefinedWindow();
 772
 773             byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
 774             byteBuffer[bytePos++] = (byte) curIndex;
 775             byteBuffer[bytePos++] = (byte)
 776                 (curUC - sOffsetTable[curIndex]
 777                  + COMPRESSIONOFFSET);
 778
 779             fOffsets[whichWindow] = sOffsetTable[curIndex];
 780             fCurrentWindow = whichWindow;
 781             fTimeStamps [whichWindow] = ++fTimeStamp;
 782             fMode = SINGLEBYTEMODE;
 783             break unicodeModeLoop;
 784             }
 785
 786             // otherwise just quote the unicode, and save our
 787             // windows for longer runs
 788             else {
 789                                 // make sure there is enough room to
 790                                 // write all three bytes if not,
 791                                 // rewind the source stream and break
 792                                 // out
 793             if((bytePos + 2) >= byteBufferLimit)
 794                 { --ucPos; break mainLoop; }
 795
 796             hiByte = curUC >>> 8;
 797             loByte = curUC & 0xFF;
 798
 799             if(sUnicodeTagTable[ hiByte ])
 800                 // add quote Unicode tag
 801                 byteBuffer[bytePos++] = (byte) UQUOTEU;
 802
 803             byteBuffer[bytePos++] = (byte) hiByte;
 804             byteBuffer[bytePos++] = (byte) loByte;
 805             }
 806         }
 807         }
 808         }  // end switch
 809     }
 810
 811         // fill in output parameter
 812     if(charsRead != null)
 813         charsRead [0] = (ucPos - charBufferStart);
 814
 815         // return # of bytes written
 816         return (bytePos - byteBufferStart);
 817     }
 818
 819     /**
 820      * Reset the compressor to its initial state.
 821      * @stable ICU 2.4
 822      */
 823     public void reset()
 824     {
 825     int i;
 826
 827         // reset dynamic windows
 828         fOffsets[0] = 0x0080;    // Latin-1
 829         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
 830         fOffsets[2] = 0x0400;    // Cyrillic
 831         fOffsets[3] = 0x0600;    // Arabic
 832         fOffsets[4] = 0x0900;    // Devanagari
 833         fOffsets[5] = 0x3040;    // Hiragana
 834         fOffsets[6] = 0x30A0;    // Katakana
 835         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
 836
 837
 838         // reset time stamps
 839         for(i = 0; i < NUMWINDOWS; i++) {
 840             fTimeStamps[i]          = 0;
 841         }
 842
 843         // reset count of seen indices
 844         for(i = 0; i <= MAXINDEX; i++ ) {
 845             fIndexCount[i] = 0;
 846         }
 847
 848         fTimeStamp      = 0;                // Reset current time stamp
 849         fCurrentWindow  = 0;                // Make current window Latin-1
 850         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
 851     }
 852
 853     //==========================
 854     // Determine the index for a character
 855     //==========================
 856
 857     /**
 858      * Create the index value for a character.
 859      * For more information on this function, refer to table X-3
 860      * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
 861      * @param c The character in question.
 862      * @return An index for c
 863      */
 864     private static int makeIndex(int c)
 865     {
 866         // check the predefined indices
 867         if(c >= 0x00C0 && c < 0x0140)
 868             return LATININDEX;
 869         else if(c >= 0x0250 && c < 0x02D0)
 870             return IPAEXTENSIONINDEX;
 871         else if(c >= 0x0370 && c < 0x03F0)
 872             return GREEKINDEX;
 873         else if(c >= 0x0530 && c < 0x0590)
 874             return ARMENIANINDEX;
 875         else if(c >= 0x3040 && c < 0x30A0)
 876             return HIRAGANAINDEX;
 877         else if(c >= 0x30A0 && c < 0x3120)
 878             return KATAKANAINDEX;
 879         else if(c >= 0xFF60 && c < 0xFF9F)
 880             return HALFWIDTHKATAKANAINDEX;
 881
 882         // calculate index
 883         else if(c >= 0x0080 && c < 0x3400)
 884             return (c / 0x80) & 0xFF;
 885         else if(c >= 0xE000 && c <= 0xFFFF)
 886             return ((c - 0xAC00) / 0x80) & 0xFF;
 887
 888         // should never happen
 889         else {
 890             return RESERVEDINDEX;
 891         }
 892     }
 893
 894     //==========================
 895     // Check if a given character fits in a window
 896     //==========================
 897
 898     /**
 899     * Determine if a character is in a dynamic window.
 900     * @param c The character to test
 901     * @param whichWindow The dynamic window the test
 902     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
 903     * false otherwise.
 904     */
 905     private boolean inDynamicWindow(int c,
 906                     int whichWindow)
 907     {
 908         return (c >= fOffsets[whichWindow]
 909         && c < (fOffsets[whichWindow] + 0x80));
 910     }
 911
 912     /**
 913      * Determine if a character is in a static window.
 914     * @param c The character to test
 915     * @param whichWindow The static window the test
 916     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
 917     * false otherwise.
 918     */
 919     private static boolean inStaticWindow(int c,
 920                       int whichWindow)
 921     {
 922         return (c >= sOffsets[whichWindow]
 923         && c < (sOffsets[whichWindow] + 0x80));
 924     }
 925
 926     //==========================
 927     // Check if a given character is compressible
 928     //==========================
 929
 930     /**
 931     * Determine if a character is compressible.
 932     * @param c The character to test.
 933     * @return true if the <TT>c</TT> is compressible, false otherwise.
 934     */
 935     private static boolean isCompressible(int c)
 936     {
 937         return (c < 0x3400 || c >= 0xE000);
 938     }
 939
 940     //==========================
 941     // Check if a window is defined for a given character
 942     //==========================
 943
 944     /**
 945      * Determine if a dynamic window for a certain character is defined
 946      * @param c The character in question
 947      * @return The dynamic window containing <TT>c</TT>, or
 948      * INVALIDWINDOW if not defined.
 949      */
 950     private int findDynamicWindow(int c)
 951     {
 952     // supposedly faster to count down
 953         //for(int i = 0; i < NUMWINDOWS; i++) {
 954     for(int i = NUMWINDOWS - 1; i >= 0; --i) {
 955         if(inDynamicWindow(c, i)) {
 956         ++fTimeStamps[i];
 957                 return i;
 958         }
 959     }
 960
 961         return INVALIDWINDOW;
 962     }
 963
 964     /**
 965      * Determine if a static window for a certain character is defined
 966      * @param c The character in question
 967      * @return The static window containing <TT>c</TT>, or
 968      * INVALIDWINDOW if not defined.
 969      */
 970     private static int findStaticWindow(int c)
 971     {
 972     // supposedly faster to count down
 973         //for(int i = 0; i < NUMSTATICWINDOWS; i++) {
 974     for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
 975         if(inStaticWindow(c, i)) {
 976                 return i;
 977         }
 978     }
 979
 980         return INVALIDWINDOW;
 981     }
 982
 983     //==========================
 984     // Find the least-recently used window
 985     //==========================
 986
 987     /** Find the least-recently defined window */
 988     private int getLRDefinedWindow()
 989     {
 990         int leastRU         = Integer.MAX_VALUE;
 991         int whichWindow     = INVALIDWINDOW;
 992
 993         // find least recently used window
 994         // supposedly faster to count down
 995         //for( int i = 0; i < NUMWINDOWS; i++ ) {
 996         for(int i = NUMWINDOWS - 1; i >= 0; --i ) {
 997             if( fTimeStamps[i] < leastRU ) {
 998                 leastRU   = fTimeStamps[i];
 999                 whichWindow  = i;
1000             }
1001         }
1002
1003         return whichWindow;
1004     }
1005
1006 }