2 *******************************************************************************
3 * Copyright (C) 1996-2009, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
10 * A compression engine implementing the Standard Compression Scheme
11 * for Unicode (SCSU) as outlined in <A
12 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
15 * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
16 * consisting of 128 consecutive characters in Unicode. During compression,
17 * characters within a window are encoded in the compressed stream as the bytes
 * <TT>0x80 - 0xFF</TT>. The SCSU provides transparency for the characters
19 * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the
20 * storage size of traditional character sets, for example 1 byte per
21 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
24 * <P><STRONG>USAGE</STRONG></P>
26 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
27 * straightforward manner to compress simple strings:</P>
30 * String s = ... ; // get string from somewhere
31 * byte [] compressed = UnicodeCompressor.compress(s);
34 * <P>The static methods have a fairly large memory footprint.
35 * For finer-grained control over memory usage,
36 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
37 * iterative compression:</P>
40 * // Compress an array "chars" of length "len" using a buffer of 512 bytes
41 * // to the OutputStream "out"
43 * UnicodeCompressor myCompressor = new UnicodeCompressor();
44 * final static int BUFSIZE = 512;
45 * byte [] byteBuffer = new byte [ BUFSIZE ];
46 * int bytesWritten = 0;
47 * int [] unicharsRead = new int [1];
48 * int totalCharsCompressed = 0;
49 * int totalBytesWritten = 0;
 * do {
 *   // do the compression
 *   bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
 *                                        len, unicharsRead,
 *                                        byteBuffer, 0, BUFSIZE);
57 * // do something with the current set of bytes
58 * out.write(byteBuffer, 0, bytesWritten);
60 * // update the no. of characters compressed
61 * totalCharsCompressed += unicharsRead[0];
63 * // update the no. of bytes written
64 * totalBytesWritten += bytesWritten;
66 * } while(totalCharsCompressed < len);
68 * myCompressor.reset(); // reuse compressor
71 * @see UnicodeDecompressor
73 * @author Stephen F. Booth
79 * COMPRESSION STRATEGY
83 * There are three relevant cases.
84 * If the character is in the current window or is Latin-1 (U+0000,
85 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
86 * directly in the stream as a single byte.
88 * 1. Current character is in defined, inactive window.
89 * 2. Current character is in undefined window.
90 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
92 * 1. Current character is in defined, inactive window
93 * A. Look ahead two characters
94 * B. If both following characters in same window as current character,
95 * switch to defined window
96 * C. If only next character is in same window as current character,
97 * quote defined window
98 * D. If neither of following characters is in same window as current,
99 * quote defined window
101 * 2. Current character is in undefined window
102 * A. Look ahead two characters
 * B. If both following characters in same window as current character,
 * define new window
105 * C. If only next character in same window as current character,
106 * switch to Unicode mode
107 * NOTE: This costs us one extra byte. However,
108 * since we have a limited number of windows to work with, it is
109 * assumed the cost will pay off later in savings from a window with
110 * more characters in it.
111 * D. If neither of following characters in same window as current,
112 * switch to Unicode mode. Alternative to above: just quote
113 * Unicode (same byte cost)
115 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
116 * A. Look ahead one character
 * B. If next character in non-compressible region, switch to
 * Unicode mode
119 * C. If next character not in non-compressible region, quote Unicode
122 * The following chart illustrates the bytes required for encoding characters
123 * in each possible way
127 * Characters in a row with same index
128 * tag encountered 1 2 3 4
129 * ---------------------------------------------------------------
130 * none (in current window) 1 2 3 4
132 * quote Unicode 3 6 9 12
134 * window not switch to Unicode 3 5 7 9 byte
135 * defined define window 3 4 5 6 cost
137 * window switch to window 2 3 4 5
138 * defined quote window 2 4 6 8
142 * There are two relevant cases.
143 * If the character is in the non-compressible region
144 * (U+3400 - U+DFFF), the character is simply written to the
145 * stream as a pair of bytes.
147 * 1. Current character is in defined, inactive window.
148 * 2. Current character is in undefined window.
150 * 1.Current character is in defined, inactive window
151 * A. Look ahead one character
152 * B. If next character has same index as current character,
153 * switch to defined window (and switch to single-byte mode)
154 * C. If not, just put bytes in stream
157 * 2. Current character is in undefined window
158 * A. Look ahead two characters
159 * B. If both in same window as current character, define window
160 * (and switch to single-byte mode)
161 * C. If only next character in same window, just put bytes in stream
162 * NOTE: This costs us one extra byte. However,
163 * since we have a limited number of windows to work with, it is
164 * assumed the cost will pay off later in savings from a window with
165 * more characters in it.
166 * D. If neither in same window, put bytes in stream
169 * The following chart illustrates the bytes required for encoding characters
170 * in each possible way
174 * Characters in a row with same index
175 * tag encountered 1 2 3 4
176 * ---------------------------------------------------------------
179 * quote Unicode 3 6 9 12
181 * window not define window 3 4 5 6 byte
183 * window switch to window 2 3 4 5
186 public final class UnicodeCompressor implements SCSU
188 //==========================
190 //==========================
192 /** For quick identification of a byte as a single-byte mode tag */
193 private static boolean [] sSingleTagTable = {
194 // table generated by CompressionTableGenerator
195 false, true, true, true, true, true, true, true, true, false,
196 false, true, true, false, true, true, true, true, true, true,
197 true, true, true, true, true, true, true, true, true, true,
198 true, true, false, false, false, false, false, false,false,
199 false, false, false, false, false, false, false, false, false,
200 false, false, false, false, false, false, false, false, false,
201 false, false, false, false, false, false, false, false, false,
202 false, false, false, false, false, false, false, false, false,
203 false, false, false, false, false, false, false, false, false,
204 false, false, false, false, false, false, false, false, false,
205 false, false, false, false, false, false, false, false, false,
206 false, false, false, false, false, false, false, false, false,
207 false, false, false, false, false, false, false, false, false,
208 false, false, false, false, false, false, false, false, false,
209 false, false, false, false, false, false, false, false, false,
210 false, false, false, false, false, false, false, false, false,
211 false, false, false, false, false, false, false, false, false,
212 false, false, false, false, false, false, false, false, false,
213 false, false, false, false, false, false, false, false, false,
214 false, false, false, false, false, false, false, false, false,
215 false, false, false, false, false, false, false, false, false,
216 false, false, false, false, false, false, false, false, false,
217 false, false, false, false, false, false, false, false, false,
218 false, false, false, false, false, false, false, false, false,
219 false, false, false, false, false, false, false, false, false,
220 false, false, false, false, false, false, false, false, false,
221 false, false, false, false, false, false, false, false, false,
222 false, false, false, false, false, false, false, false, false,
226 /** For quick identification of a byte as a unicode mode tag */
227 private static boolean [] sUnicodeTagTable = {
228 // table generated by CompressionTableGenerator
229 false, false, false, false, false, false, false, false, false,
230 false, false, false, false, false, false, false, false, false,
231 false, false, false, false, false, false, false, false, false,
232 false, false, false, false, false, false, false, false, false,
233 false, false, false, false, false, false, false, false, false,
234 false, false, false, false, false, false, false, false, false,
235 false, false, false, false, false, false, false, false, false,
236 false, false, false, false, false, false, false, false, false,
237 false, false, false, false, false, false, false, false, false,
238 false, false, false, false, false, false, false, false, false,
239 false, false, false, false, false, false, false, false, false,
240 false, false, false, false, false, false, false, false, false,
241 false, false, false, false, false, false, false, false, false,
242 false, false, false, false, false, false, false, false, false,
243 false, false, false, false, false, false, false, false, false,
244 false, false, false, false, false, false, false, false, false,
245 false, false, false, false, false, false, false, false, false,
246 false, false, false, false, false, false, false, false, false,
247 false, false, false, false, false, false, false, false, false,
248 false, false, false, false, false, false, false, false, false,
249 false, false, false, false, false, false, false, false, false,
250 false, false, false, false, false, false, false, false, false,
251 false, false, false, false, false, false, false, false, false,
252 false, false, false, false, false, false, false, false, false,
253 false, false, false, false, false, false, false, false, true,
254 true, true, true, true, true, true, true, true, true, true,
255 true, true, true, true, true, true, true, true, false, false,
256 false, false, false, false, false, false, false, false, false,
260 //==========================
261 // Instance variables
262 //==========================
264 /** Alias to current dynamic window */
265 private int fCurrentWindow = 0;
267 /** Dynamic compression window offsets */
268 private int [] fOffsets = new int [ NUMWINDOWS ];
270 /** Current compression mode */
271 private int fMode = SINGLEBYTEMODE;
273 /** Keeps count of times character indices are encountered */
274 private int [] fIndexCount = new int [ MAXINDEX + 1 ];
276 /** The time stamps indicate when a window was last defined */
277 private int [] fTimeStamps = new int [ NUMWINDOWS ];
279 /** The current time stamp */
280 private int fTimeStamp = 0;
/**
 * Create a UnicodeCompressor.
 * Sets all windows to their default values.
 *
 * @see #reset()
 */
public UnicodeCompressor()
{
    reset(); // initialize to defaults
}
295 * Compress a string into a byte array.
296 * @param buffer The string to compress.
297 * @return A byte array containing the compressed characters.
298 * @see #compress(char [], int, int)
301 public static byte [] compress(String buffer)
303 return compress(buffer.toCharArray(), 0, buffer.length());
307 * Compress a Unicode character array into a byte array.
308 * @param buffer The character buffer to compress.
309 * @param start The start of the character run to compress.
310 * @param limit The limit of the character run to compress.
311 * @return A byte array containing the compressed characters.
312 * @see #compress(String)
315 public static byte [] compress(char [] buffer,
319 UnicodeCompressor comp = new UnicodeCompressor();
321 // use a buffer that we know will never overflow
322 // in the worst case, each character will take 3 bytes
323 // to encode: UQU, hibyte, lobyte. In this case, the
324 // compressed data will look like: SCU, UQU, hibyte, lobyte, ...
325 // buffer must be at least 4 bytes in size
326 int len = Math.max(4, 3 * (limit - start) + 1);
327 byte [] temp = new byte [len];
329 int byteCount = comp.compress(buffer, start, limit, null,
332 byte [] result = new byte [byteCount];
333 System.arraycopy(temp, 0, result, 0, byteCount);
338 * Compress a Unicode character array into a byte array.
340 * This function will only consume input that can be completely
343 * @param charBuffer The character buffer to compress.
344 * @param charBufferStart The start of the character run to compress.
345 * @param charBufferLimit The limit of the character run to compress.
346 * @param charsRead A one-element array. If not null, on return
347 * the number of characters read from charBuffer.
348 * @param byteBuffer A buffer to receive the compressed data. This
349 * buffer must be at minimum four bytes in size.
350 * @param byteBufferStart The starting offset to which to write
352 * @param byteBufferLimit The limiting offset for writing compressed data.
353 * @return The number of bytes written to byteBuffer.
356 public int compress(char [] charBuffer,
364 // the current position in the target byte buffer
365 int bytePos = byteBufferStart;
367 // the current position in the source unicode character buffer
368 int ucPos = charBufferStart;
370 // the current unicode character from the source buffer
371 int curUC = INVALIDCHAR;
373 // the index for the current character
377 int nextUC = INVALIDCHAR;
378 int forwardUC = INVALIDCHAR;
380 // temporary for window searching
383 // high and low bytes of the current unicode character
388 // byteBuffer must be at least 4 bytes in size
389 if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
390 throw new IllegalArgumentException("byteBuffer.length < 4");
393 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
395 // main single byte mode compression loop
398 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
400 curUC = charBuffer[ucPos++];
403 if(ucPos < charBufferLimit)
404 nextUC = charBuffer[ucPos];
406 nextUC = INVALIDCHAR;
408 // chars less than 0x0080 (excluding tags) go straight
411 loByte = curUC & 0xFF;
413 // we need to check and make sure we don't
414 // accidentally write a single byte mode tag to
415 // the stream unless it's quoted
416 if(sSingleTagTable[loByte]) {
417 // make sure there is enough room to
418 // write both bytes if not, rewind the
419 // source stream and break out
420 if( (bytePos + 1) >= byteBufferLimit)
421 { --ucPos; break mainLoop; }
423 // since we know the byte is less than 0x80, SQUOTE0
424 // will use static window 0, or ASCII
425 byteBuffer[bytePos++] = (byte) SQUOTE0;
428 byteBuffer[bytePos++] = (byte) loByte;
431 // if the char belongs to current window, convert it
432 // to a byte by adding the generic compression offset
433 // and subtracting the window's offset
434 else if(inDynamicWindow(curUC, fCurrentWindow) ) {
435 byteBuffer[bytePos++] = (byte)
436 (curUC - fOffsets[ fCurrentWindow ]
437 + COMPRESSIONOFFSET);
440 // if char is not in compressible range, either switch to or
441 // quote from unicode
442 else if( ! isCompressible(curUC) ) {
443 // only check next character if it is valid
444 if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
445 // make sure there is enough room to
446 // write all three bytes if not,
447 // rewind the source stream and break
449 if( (bytePos + 2) >= byteBufferLimit)
450 { --ucPos; break mainLoop; }
452 byteBuffer[bytePos++] = (byte) SQUOTEU;
453 byteBuffer[bytePos++] = (byte) (curUC >>> 8);
454 byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
457 // make sure there is enough room to
458 // write all four bytes if not, rewind
459 // the source stream and break out
460 if((bytePos + 3) >= byteBufferLimit)
461 { --ucPos; break mainLoop; }
463 byteBuffer[bytePos++] = (byte) SCHANGEU;
465 hiByte = curUC >>> 8;
466 loByte = curUC & 0xFF;
468 if(sUnicodeTagTable[hiByte])
469 // add quote Unicode tag
470 byteBuffer[bytePos++] = (byte) UQUOTEU;
472 byteBuffer[bytePos++] = (byte) hiByte;
473 byteBuffer[bytePos++] = (byte) loByte;
476 break singleByteModeLoop;
480 // if the char is in a currently defined dynamic
481 // window, figure out which one, and either switch to
482 // it or quote from it
483 else if((whichWindow = findDynamicWindow(curUC))
486 if( (ucPos + 1) < charBufferLimit )
487 forwardUC = charBuffer[ucPos + 1];
489 forwardUC = INVALIDCHAR;
491 // all three chars in same window, switch to that
492 // window inDynamicWindow will return false for
494 if(inDynamicWindow(nextUC, whichWindow)
495 && inDynamicWindow(forwardUC, whichWindow)) {
496 // make sure there is enough room to
497 // write both bytes if not, rewind the
498 // source stream and break out
499 if( (bytePos + 1) >= byteBufferLimit)
500 { --ucPos; break mainLoop; }
502 byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
503 byteBuffer[bytePos++] = (byte)
504 (curUC - fOffsets[whichWindow]
505 + COMPRESSIONOFFSET);
506 fTimeStamps [ whichWindow ] = ++fTimeStamp;
507 fCurrentWindow = whichWindow;
510 // either only next char or neither in same
513 // make sure there is enough room to
514 // write both bytes if not, rewind the
515 // source stream and break out
516 if((bytePos + 1) >= byteBufferLimit)
517 { --ucPos; break mainLoop; }
519 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
520 byteBuffer[bytePos++] = (byte)
521 (curUC - fOffsets[whichWindow]
522 + COMPRESSIONOFFSET);
526 // if a static window is defined, and the following
527 // character is not in that static window, quote from
528 // the static window Note: to quote from a static
529 // window, don't add 0x80
530 else if((whichWindow = findStaticWindow(curUC))
532 && ! inStaticWindow(nextUC, whichWindow) ) {
533 // make sure there is enough room to write both
534 // bytes if not, rewind the source stream and
536 if((bytePos + 1) >= byteBufferLimit)
537 { --ucPos; break mainLoop; }
539 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
540 byteBuffer[bytePos++] = (byte)
541 (curUC - sOffsets[whichWindow]);
544 // if a window is not defined, decide if we want to
545 // define a new one or switch to unicode mode
547 // determine index for current char (char is compressible)
548 curIndex = makeIndex(curUC);
549 fIndexCount[curIndex]++;
552 if((ucPos + 1) < charBufferLimit)
553 forwardUC = charBuffer[ucPos + 1];
555 forwardUC = INVALIDCHAR;
557 // if we have encountered this index at least once
558 // before, define a new window
560 // three chars in a row with same index, define a
561 // new window (makeIndex will return RESERVEDINDEX
563 if((fIndexCount[curIndex] > 1) ||
564 (curIndex == makeIndex(nextUC)
565 && curIndex == makeIndex(forwardUC))) {
566 // make sure there is enough room to write all
567 // three bytes if not, rewind the source
568 // stream and break out
569 if( (bytePos + 2) >= byteBufferLimit)
570 { --ucPos; break mainLoop; }
572 // get least recently defined window
573 whichWindow = getLRDefinedWindow();
575 byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
576 byteBuffer[bytePos++] = (byte) curIndex;
577 byteBuffer[bytePos++] = (byte)
578 (curUC - sOffsetTable[curIndex]
579 + COMPRESSIONOFFSET);
581 fOffsets[whichWindow] = sOffsetTable[curIndex];
582 fCurrentWindow = whichWindow;
583 fTimeStamps [whichWindow] = ++fTimeStamp;
586 // only two chars in a row with same index, so
587 // switch to unicode mode (makeIndex will return
588 // RESERVEDINDEX for INVALIDCHAR)
590 // three chars have different indices, so switch
593 // make sure there is enough room to write all
594 // four bytes if not, rewind the source stream
596 if((bytePos + 3) >= byteBufferLimit)
597 { --ucPos; break mainLoop; }
599 byteBuffer[bytePos++] = (byte) SCHANGEU;
601 hiByte = curUC >>> 8;
602 loByte = curUC & 0xFF;
604 if(sUnicodeTagTable[hiByte])
605 // add quote Unicode tag
606 byteBuffer[bytePos++] = (byte) UQUOTEU;
608 byteBuffer[bytePos++] = (byte) hiByte;
609 byteBuffer[bytePos++] = (byte) loByte;
612 break singleByteModeLoop;
619 // main unicode mode compression loop
621 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
623 curUC = charBuffer[ucPos++];
626 if( ucPos < charBufferLimit )
627 nextUC = charBuffer[ucPos];
629 nextUC = INVALIDCHAR;
631 // if we have two uncompressible chars in a row,
632 // put the current char's bytes in the stream
633 if( ! isCompressible(curUC)
634 || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
635 // make sure there is enough room to write all three bytes
636 // if not, rewind the source stream and break out
637 if( (bytePos + 2) >= byteBufferLimit)
638 { --ucPos; break mainLoop; }
640 hiByte = curUC >>> 8;
641 loByte = curUC & 0xFF;
643 if(sUnicodeTagTable[ hiByte ])
644 // add quote Unicode tag
645 byteBuffer[bytePos++] = (byte) UQUOTEU;
647 byteBuffer[bytePos++] = (byte) hiByte;
648 byteBuffer[bytePos++] = (byte) loByte;
651 // bytes less than 0x80 can go straight in the stream,
652 // but in single-byte mode
653 else if(curUC < 0x0080) {
654 loByte = curUC & 0xFF;
656 // if two chars in a row below 0x80 and the
657 // current char is not a single-byte mode tag,
658 // switch to single-byte mode
659 if(nextUC != INVALIDCHAR
660 && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
661 // make sure there is enough room to
662 // write both bytes if not, rewind the
663 // source stream and break out
664 if( (bytePos + 1) >= byteBufferLimit)
665 { --ucPos; break mainLoop; }
667 // use the last-active window
668 whichWindow = fCurrentWindow;
669 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
670 byteBuffer[bytePos++] = (byte) loByte;
672 //fCurrentWindow = 0;
673 fTimeStamps [whichWindow] = ++fTimeStamp;
674 fMode = SINGLEBYTEMODE;
675 break unicodeModeLoop;
678 // otherwise, just write the bytes to the stream
679 // (this will cover the case of only 1 char less than 0x80
680 // and single-byte mode tags)
682 // make sure there is enough room to
683 // write both bytes if not, rewind the
684 // source stream and break out
685 if((bytePos + 1) >= byteBufferLimit)
686 { --ucPos; break mainLoop; }
688 // since the character is less than 0x80, the
689 // high byte is always 0x00 - no need for
691 byteBuffer[bytePos++] = (byte) 0x00;
692 byteBuffer[bytePos++] = (byte) loByte;
696 // figure out if the current char is in a defined window
697 else if((whichWindow = findDynamicWindow(curUC))
699 // if two chars in a row in the same window,
700 // switch to that window and go to single-byte mode
701 // inDynamicWindow will return false for INVALIDCHAR
702 if(inDynamicWindow(nextUC, whichWindow)) {
703 // make sure there is enough room to
704 // write both bytes if not, rewind the
705 // source stream and break out
706 if((bytePos + 1) >= byteBufferLimit)
707 { --ucPos; break mainLoop; }
709 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
710 byteBuffer[bytePos++] = (byte)
711 (curUC - fOffsets[whichWindow]
712 + COMPRESSIONOFFSET);
714 fTimeStamps [ whichWindow ] = ++fTimeStamp;
715 fCurrentWindow = whichWindow;
716 fMode = SINGLEBYTEMODE;
717 break unicodeModeLoop;
720 // otherwise, just quote the unicode for the char
722 // make sure there is enough room to
723 // write all three bytes if not,
724 // rewind the source stream and break
726 if((bytePos + 2) >= byteBufferLimit)
727 { --ucPos; break mainLoop; }
729 hiByte = curUC >>> 8;
730 loByte = curUC & 0xFF;
732 if(sUnicodeTagTable[ hiByte ])
733 // add quote Unicode tag
734 byteBuffer[bytePos++] = (byte) UQUOTEU;
736 byteBuffer[bytePos++] = (byte) hiByte;
737 byteBuffer[bytePos++] = (byte) loByte;
741 // char is not in a defined window
743 // determine index for current char (char is compressible)
744 curIndex = makeIndex(curUC);
745 fIndexCount[curIndex]++;
748 if( (ucPos + 1) < charBufferLimit )
749 forwardUC = charBuffer[ucPos + 1];
751 forwardUC = INVALIDCHAR;
753 // if we have encountered this index at least once
754 // before, define a new window for it that hasn't
755 // previously been redefined
757 // if three chars in a row with the same index,
758 // define a new window (makeIndex will return
759 // RESERVEDINDEX for INVALIDCHAR)
760 if((fIndexCount[curIndex] > 1) ||
761 (curIndex == makeIndex(nextUC)
762 && curIndex == makeIndex(forwardUC))) {
763 // make sure there is enough room to
764 // write all three bytes if not,
765 // rewind the source stream and break
767 if((bytePos + 2) >= byteBufferLimit)
768 { --ucPos; break mainLoop; }
770 // get least recently defined window
771 whichWindow = getLRDefinedWindow();
773 byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
774 byteBuffer[bytePos++] = (byte) curIndex;
775 byteBuffer[bytePos++] = (byte)
776 (curUC - sOffsetTable[curIndex]
777 + COMPRESSIONOFFSET);
779 fOffsets[whichWindow] = sOffsetTable[curIndex];
780 fCurrentWindow = whichWindow;
781 fTimeStamps [whichWindow] = ++fTimeStamp;
782 fMode = SINGLEBYTEMODE;
783 break unicodeModeLoop;
786 // otherwise just quote the unicode, and save our
787 // windows for longer runs
789 // make sure there is enough room to
790 // write all three bytes if not,
791 // rewind the source stream and break
793 if((bytePos + 2) >= byteBufferLimit)
794 { --ucPos; break mainLoop; }
796 hiByte = curUC >>> 8;
797 loByte = curUC & 0xFF;
799 if(sUnicodeTagTable[ hiByte ])
800 // add quote Unicode tag
801 byteBuffer[bytePos++] = (byte) UQUOTEU;
803 byteBuffer[bytePos++] = (byte) hiByte;
804 byteBuffer[bytePos++] = (byte) loByte;
811 // fill in output parameter
812 if(charsRead != null)
813 charsRead [0] = (ucPos - charBufferStart);
815 // return # of bytes written
816 return (bytePos - byteBufferStart);
820 * Reset the compressor to its initial state.
827 // reset dynamic windows
828 fOffsets[0] = 0x0080; // Latin-1
829 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A
830 fOffsets[2] = 0x0400; // Cyrillic
831 fOffsets[3] = 0x0600; // Arabic
832 fOffsets[4] = 0x0900; // Devanagari
833 fOffsets[5] = 0x3040; // Hiragana
834 fOffsets[6] = 0x30A0; // Katakana
835 fOffsets[7] = 0xFF00; // Fullwidth ASCII
839 for(i = 0; i < NUMWINDOWS; i++) {
843 // reset count of seen indices
844 for(i = 0; i <= MAXINDEX; i++ ) {
848 fTimeStamp = 0; // Reset current time stamp
849 fCurrentWindow = 0; // Make current window Latin-1
850 fMode = SINGLEBYTEMODE; // Always start in single-byte mode
853 //==========================
854 // Determine the index for a character
855 //==========================
858 * Create the index value for a character.
859 * For more information on this function, refer to table X-3
860 * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
861 * @param c The character in question.
862 * @return An index for c
864 private static int makeIndex(int c)
866 // check the predefined indices
867 if(c >= 0x00C0 && c < 0x0140)
869 else if(c >= 0x0250 && c < 0x02D0)
870 return IPAEXTENSIONINDEX;
871 else if(c >= 0x0370 && c < 0x03F0)
873 else if(c >= 0x0530 && c < 0x0590)
874 return ARMENIANINDEX;
875 else if(c >= 0x3040 && c < 0x30A0)
876 return HIRAGANAINDEX;
877 else if(c >= 0x30A0 && c < 0x3120)
878 return KATAKANAINDEX;
879 else if(c >= 0xFF60 && c < 0xFF9F)
880 return HALFWIDTHKATAKANAINDEX;
883 else if(c >= 0x0080 && c < 0x3400)
884 return (c / 0x80) & 0xFF;
885 else if(c >= 0xE000 && c <= 0xFFFF)
886 return ((c - 0xAC00) / 0x80) & 0xFF;
888 // should never happen
890 return RESERVEDINDEX;
894 //==========================
895 // Check if a given character fits in a window
896 //==========================
899 * Determine if a character is in a dynamic window.
900 * @param c The character to test
901 * @param whichWindow The dynamic window the test
902 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
905 private boolean inDynamicWindow(int c,
908 return (c >= fOffsets[whichWindow]
909 && c < (fOffsets[whichWindow] + 0x80));
913 * Determine if a character is in a static window.
914 * @param c The character to test
915 * @param whichWindow The static window the test
916 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
919 private static boolean inStaticWindow(int c,
922 return (c >= sOffsets[whichWindow]
923 && c < (sOffsets[whichWindow] + 0x80));
926 //==========================
927 // Check if a given character is compressible
928 //==========================
931 * Determine if a character is compressible.
932 * @param c The character to test.
933 * @return true if the <TT>c</TT> is compressible, false otherwise.
935 private static boolean isCompressible(int c)
937 return (c < 0x3400 || c >= 0xE000);
940 //==========================
941 // Check if a window is defined for a given character
942 //==========================
945 * Determine if a dynamic window for a certain character is defined
946 * @param c The character in question
947 * @return The dynamic window containing <TT>c</TT>, or
948 * INVALIDWINDOW if not defined.
950 private int findDynamicWindow(int c)
952 // supposedly faster to count down
953 //for(int i = 0; i < NUMWINDOWS; i++) {
954 for(int i = NUMWINDOWS - 1; i >= 0; --i) {
955 if(inDynamicWindow(c, i)) {
961 return INVALIDWINDOW;
965 * Determine if a static window for a certain character is defined
966 * @param c The character in question
967 * @return The static window containing <TT>c</TT>, or
968 * INVALIDWINDOW if not defined.
970 private static int findStaticWindow(int c)
972 // supposedly faster to count down
973 //for(int i = 0; i < NUMSTATICWINDOWS; i++) {
974 for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
975 if(inStaticWindow(c, i)) {
980 return INVALIDWINDOW;
983 //==========================
984 // Find the least-recently used window
985 //==========================
987 /** Find the least-recently defined window */
988 private int getLRDefinedWindow()
990 int leastRU = Integer.MAX_VALUE;
991 int whichWindow = INVALIDWINDOW;
993 // find least recently used window
994 // supposedly faster to count down
995 //for( int i = 0; i < NUMWINDOWS; i++ ) {
996 for(int i = NUMWINDOWS - 1; i >= 0; --i ) {
997 if( fTimeStamps[i] < leastRU ) {
998 leastRU = fTimeStamps[i];