2 *******************************************************************************
\r
3 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
10 * A compression engine implementing the Standard Compression Scheme
\r
11 * for Unicode (SCSU) as outlined in <A
\r
12 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
\r
15 * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
\r
16 * consisting of 128 consecutive characters in Unicode. During compression,
\r
17 * characters within a window are encoded in the compressed stream as the bytes
\r
18 * <TT>0x7F - 0xFF</TT>. The SCSU provides transparency for the characters
\r
19 * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the
\r
20 * storage size of traditional character sets, for example 1 byte per
\r
21 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
\r
24 * <P><STRONG>USAGE</STRONG></P>
\r
26 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
\r
27 * straightforward manner to compress simple strings:</P>
\r
30 * String s = ... ; // get string from somewhere
\r
31 * byte [] compressed = UnicodeCompressor.compress(s);
\r
34 * <P>The static methods have a fairly large memory footprint.
\r
35 * For finer-grained control over memory usage,
\r
36 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
\r
37 * iterative compression:</P>
\r
40 * // Compress an array "chars" of length "len" using a buffer of 512 bytes
\r
41 * // to the OutputStream "out"
\r
43 * UnicodeCompressor myCompressor = new UnicodeCompressor();
\r
44 * final static int BUFSIZE = 512;
\r
45 * byte [] byteBuffer = new byte [ BUFSIZE ];
\r
46 * int bytesWritten = 0;
\r
47 * int [] unicharsRead = new int [1];
\r
48 * int totalCharsCompressed = 0;
\r
49 * int totalBytesWritten = 0;
\r
52 * // do the compression
\r
53 * bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
\r
54 * len, unicharsRead,
\r
55 * byteBuffer, 0, BUFSIZE);
\r
57 * // do something with the current set of bytes
\r
58 * out.write(byteBuffer, 0, bytesWritten);
\r
60 * // update the no. of characters compressed
\r
61 * totalCharsCompressed += unicharsRead[0];
\r
63 * // update the no. of bytes written
\r
64 * totalBytesWritten += bytesWritten;
\r
66 * } while(totalCharsCompressed < len);
\r
68 * myCompressor.reset(); // reuse compressor
\r
71 * @see UnicodeDecompressor
\r
73 * @author Stephen F. Booth
\r
79 * COMPRESSION STRATEGY
\r
83 * There are three relevant cases.
\r
84 * If the character is in the current window or is Latin-1 (U+0000,
\r
85 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
\r
86 * directly in the stream as a single byte.
\r
88 * 1. Current character is in defined, inactive window.
\r
89 * 2. Current character is in undefined window.
\r
90 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
\r
92 * 1. Current character is in defined, inactive window
\r
93 * A. Look ahead two characters
\r
94 * B. If both following characters in same window as current character,
\r
95 * switch to defined window
\r
96 * C. If only next character is in same window as current character,
\r
97 * quote defined window
\r
98 * D. If neither of following characters is in same window as current,
\r
99 * quote defined window
\r
101 * 2. Current character is in undefined window
\r
102 * A. Look ahead two characters
\r
103 * B. If both following characters in same window as current character,
\r
104 * define new window
\r
105 * C. If only next character in same window as current character,
\r
106 * switch to Unicode mode
\r
107 * NOTE: This costs us one extra byte. However,
\r
108 * since we have a limited number of windows to work with, it is
\r
109 * assumed the cost will pay off later in savings from a window with
\r
110 * more characters in it.
\r
111 * D. If neither of following characters in same window as current,
\r
112 * switch to Unicode mode. Alternative to above: just quote
\r
113 * Unicode (same byte cost)
\r
115 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
\r
116 * A. Look ahead one character
\r
117 * B. If next character in non-compressible region, switch to
\r
119 * C. If next character not in non-compressible region, quote Unicode
\r
122 * The following chart illustrates the bytes required for encoding characters
\r
123 * in each possible way
\r
127 * Characters in a row with same index
\r
128 * tag encountered 1 2 3 4
\r
129 * ---------------------------------------------------------------
\r
130 * none (in current window) 1 2 3 4
\r
132 * quote Unicode 3 6 9 12
\r
134 * window not switch to Unicode 3 5 7 9 byte
\r
135 * defined define window 3 4 5 6 cost
\r
137 * window switch to window 2 3 4 5
\r
138 * defined quote window 2 4 6 8
\r
142 * There are two relevant cases.
\r
143 * If the character is in the non-compressible region
\r
144 * (U+3400 - U+DFFF), the character is simply written to the
\r
145 * stream as a pair of bytes.
\r
147 * 1. Current character is in defined, inactive window.
\r
148 * 2. Current character is in undefined window.
\r
150 * 1.Current character is in defined, inactive window
\r
151 * A. Look ahead one character
\r
152 * B. If next character has same index as current character,
\r
153 * switch to defined window (and switch to single-byte mode)
\r
154 * C. If not, just put bytes in stream
\r
157 * 2. Current character is in undefined window
\r
158 * A. Look ahead two characters
\r
159 * B. If both in same window as current character, define window
\r
160 * (and switch to single-byte mode)
\r
161 * C. If only next character in same window, just put bytes in stream
\r
162 * NOTE: This costs us one extra byte. However,
\r
163 * since we have a limited number of windows to work with, it is
\r
164 * assumed the cost will pay off later in savings from a window with
\r
165 * more characters in it.
\r
166 * D. If neither in same window, put bytes in stream
\r
169 * The following chart illustrates the bytes required for encoding characters
\r
170 * in each possible way
\r
174 * Characters in a row with same index
\r
175 * tag encountered 1 2 3 4
\r
176 * ---------------------------------------------------------------
\r
179 * quote Unicode 3 6 9 12
\r
181 * window not define window 3 4 5 6 byte
\r
183 * window switch to window 2 3 4 5
\r
186 public final class UnicodeCompressor implements SCSU
\r
188 //==========================
\r
190 //==========================
\r
192 /** For quick identification of a byte as a single-byte mode tag */
\r
193 private static boolean [] sSingleTagTable = {
\r
194 // table generated by CompressionTableGenerator
\r
195 false, true, true, true, true, true, true, true, true, false,
\r
196 false, true, true, false, true, true, true, true, true, true,
\r
197 true, true, true, true, true, true, true, true, true, true,
\r
198 true, true, false, false, false, false, false, false,false,
\r
199 false, false, false, false, false, false, false, false, false,
\r
200 false, false, false, false, false, false, false, false, false,
\r
201 false, false, false, false, false, false, false, false, false,
\r
202 false, false, false, false, false, false, false, false, false,
\r
203 false, false, false, false, false, false, false, false, false,
\r
204 false, false, false, false, false, false, false, false, false,
\r
205 false, false, false, false, false, false, false, false, false,
\r
206 false, false, false, false, false, false, false, false, false,
\r
207 false, false, false, false, false, false, false, false, false,
\r
208 false, false, false, false, false, false, false, false, false,
\r
209 false, false, false, false, false, false, false, false, false,
\r
210 false, false, false, false, false, false, false, false, false,
\r
211 false, false, false, false, false, false, false, false, false,
\r
212 false, false, false, false, false, false, false, false, false,
\r
213 false, false, false, false, false, false, false, false, false,
\r
214 false, false, false, false, false, false, false, false, false,
\r
215 false, false, false, false, false, false, false, false, false,
\r
216 false, false, false, false, false, false, false, false, false,
\r
217 false, false, false, false, false, false, false, false, false,
\r
218 false, false, false, false, false, false, false, false, false,
\r
219 false, false, false, false, false, false, false, false, false,
\r
220 false, false, false, false, false, false, false, false, false,
\r
221 false, false, false, false, false, false, false, false, false,
\r
222 false, false, false, false, false, false, false, false, false,
\r
226 /** For quick identification of a byte as a unicode mode tag */
\r
227 private static boolean [] sUnicodeTagTable = {
\r
228 // table generated by CompressionTableGenerator
\r
229 false, false, false, false, false, false, false, false, false,
\r
230 false, false, false, false, false, false, false, false, false,
\r
231 false, false, false, false, false, false, false, false, false,
\r
232 false, false, false, false, false, false, false, false, false,
\r
233 false, false, false, false, false, false, false, false, false,
\r
234 false, false, false, false, false, false, false, false, false,
\r
235 false, false, false, false, false, false, false, false, false,
\r
236 false, false, false, false, false, false, false, false, false,
\r
237 false, false, false, false, false, false, false, false, false,
\r
238 false, false, false, false, false, false, false, false, false,
\r
239 false, false, false, false, false, false, false, false, false,
\r
240 false, false, false, false, false, false, false, false, false,
\r
241 false, false, false, false, false, false, false, false, false,
\r
242 false, false, false, false, false, false, false, false, false,
\r
243 false, false, false, false, false, false, false, false, false,
\r
244 false, false, false, false, false, false, false, false, false,
\r
245 false, false, false, false, false, false, false, false, false,
\r
246 false, false, false, false, false, false, false, false, false,
\r
247 false, false, false, false, false, false, false, false, false,
\r
248 false, false, false, false, false, false, false, false, false,
\r
249 false, false, false, false, false, false, false, false, false,
\r
250 false, false, false, false, false, false, false, false, false,
\r
251 false, false, false, false, false, false, false, false, false,
\r
252 false, false, false, false, false, false, false, false, false,
\r
253 false, false, false, false, false, false, false, false, true,
\r
254 true, true, true, true, true, true, true, true, true, true,
\r
255 true, true, true, true, true, true, true, true, false, false,
\r
256 false, false, false, false, false, false, false, false, false,
\r
260 //==========================
\r
261 // Instance variables
\r
262 //==========================
\r
264 /** Alias to current dynamic window */
\r
265 private int fCurrentWindow = 0;
\r
267 /** Dynamic compression window offsets */
\r
268 private int [] fOffsets = new int [ NUMWINDOWS ];
\r
270 /** Current compression mode */
\r
271 private int fMode = SINGLEBYTEMODE;
\r
273 /** Keeps count of times character indices are encountered */
\r
274 private int [] fIndexCount = new int [ MAXINDEX + 1 ];
\r
276 /** The time stamps indicate when a window was last defined */
\r
277 private int [] fTimeStamps = new int [ NUMWINDOWS ];
\r
279 /** The current time stamp */
\r
280 private int fTimeStamp = 0;
\r
284 * Create a UnicodeCompressor.
\r
285 * Sets all windows to their default values.
\r
289 public UnicodeCompressor()
\r
291 reset(); // initialize to defaults
\r
295 * Compress a string into a byte array.
\r
296 * @param buffer The string to compress.
\r
297 * @return A byte array containing the compressed characters.
\r
298 * @see #compress(char [], int, int)
\r
301 public static byte [] compress(String buffer)
\r
303 return compress(buffer.toCharArray(), 0, buffer.length());
\r
307 * Compress a Unicode character array into a byte array.
\r
308 * @param buffer The character buffer to compress.
\r
309 * @param start The start of the character run to compress.
\r
310 * @param limit The limit of the character run to compress.
\r
311 * @return A byte array containing the compressed characters.
\r
312 * @see #compress(String)
\r
315 public static byte [] compress(char [] buffer,
\r
319 UnicodeCompressor comp = new UnicodeCompressor();
\r
321 // use a buffer that we know will never overflow
\r
322 // in the worst case, each character will take 3 bytes
\r
323 // to encode: UQU, hibyte, lobyte. In this case, the
\r
324 // compressed data will look like: SCU, UQU, hibyte, lobyte, ...
\r
325 // buffer must be at least 4 bytes in size
\r
326 int len = Math.max(4, 3 * (limit - start) + 1);
\r
327 byte [] temp = new byte [len];
\r
329 int byteCount = comp.compress(buffer, start, limit, null,
\r
332 byte [] result = new byte [byteCount];
\r
333 System.arraycopy(temp, 0, result, 0, byteCount);
\r
338 * Compress a Unicode character array into a byte array.
\r
340 * This function will only consume input that can be completely
\r
343 * @param charBuffer The character buffer to compress.
\r
344 * @param charBufferStart The start of the character run to compress.
\r
345 * @param charBufferLimit The limit of the character run to compress.
\r
346 * @param charsRead A one-element array. If not null, on return
\r
347 * the number of characters read from charBuffer.
\r
348 * @param byteBuffer A buffer to receive the compressed data. This
\r
349 * buffer must be at minimum four bytes in size.
\r
350 * @param byteBufferStart The starting offset to which to write
\r
352 * @param byteBufferLimit The limiting offset for writing compressed data.
\r
353 * @return The number of bytes written to byteBuffer.
\r
356 public int compress(char [] charBuffer,
\r
357 int charBufferStart,
\r
358 int charBufferLimit,
\r
360 byte [] byteBuffer,
\r
361 int byteBufferStart,
\r
362 int byteBufferLimit)
\r
364 // the current position in the target byte buffer
\r
365 int bytePos = byteBufferStart;
\r
367 // the current position in the source unicode character buffer
\r
368 int ucPos = charBufferStart;
\r
370 // the current unicode character from the source buffer
\r
371 int curUC = INVALIDCHAR;
\r
373 // the index for the current character
\r
377 int nextUC = INVALIDCHAR;
\r
378 int forwardUC = INVALIDCHAR;
\r
380 // temporary for window searching
\r
381 int whichWindow = 0;
\r
383 // high and low bytes of the current unicode character
\r
388 // byteBuffer must be at least 4 bytes in size
\r
389 if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
\r
390 throw new IllegalArgumentException("byteBuffer.length < 4");
\r
393 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
\r
395 // main single byte mode compression loop
\r
396 case SINGLEBYTEMODE:
\r
397 singleByteModeLoop:
\r
398 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
\r
399 // get current char
\r
400 curUC = charBuffer[ucPos++];
\r
403 if(ucPos < charBufferLimit)
\r
404 nextUC = charBuffer[ucPos];
\r
406 nextUC = INVALIDCHAR;
\r
408 // chars less than 0x0080 (excluding tags) go straight
\r
410 if(curUC < 0x0080) {
\r
411 loByte = curUC & 0xFF;
\r
413 // we need to check and make sure we don't
\r
414 // accidentally write a single byte mode tag to
\r
415 // the stream unless it's quoted
\r
416 if(sSingleTagTable[loByte]) {
\r
417 // make sure there is enough room to
\r
418 // write both bytes if not, rewind the
\r
419 // source stream and break out
\r
420 if( (bytePos + 1) >= byteBufferLimit)
\r
421 { --ucPos; break mainLoop; }
\r
423 // since we know the byte is less than 0x80, SQUOTE0
\r
424 // will use static window 0, or ASCII
\r
425 byteBuffer[bytePos++] = (byte) SQUOTE0;
\r
428 byteBuffer[bytePos++] = (byte) loByte;
\r
431 // if the char belongs to current window, convert it
\r
432 // to a byte by adding the generic compression offset
\r
433 // and subtracting the window's offset
\r
434 else if(inDynamicWindow(curUC, fCurrentWindow) ) {
\r
435 byteBuffer[bytePos++] = (byte)
\r
436 (curUC - fOffsets[ fCurrentWindow ]
\r
437 + COMPRESSIONOFFSET);
\r
440 // if char is not in compressible range, either switch to or
\r
441 // quote from unicode
\r
442 else if( ! isCompressible(curUC) ) {
\r
443 // only check next character if it is valid
\r
444 if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
\r
445 // make sure there is enough room to
\r
446 // write all three bytes if not,
\r
447 // rewind the source stream and break
\r
449 if( (bytePos + 2) >= byteBufferLimit)
\r
450 { --ucPos; break mainLoop; }
\r
452 byteBuffer[bytePos++] = (byte) SQUOTEU;
\r
453 byteBuffer[bytePos++] = (byte) (curUC >>> 8);
\r
454 byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
\r
457 // make sure there is enough room to
\r
458 // write all four bytes if not, rewind
\r
459 // the source stream and break out
\r
460 if((bytePos + 3) >= byteBufferLimit)
\r
461 { --ucPos; break mainLoop; }
\r
463 byteBuffer[bytePos++] = (byte) SCHANGEU;
\r
465 hiByte = curUC >>> 8;
\r
466 loByte = curUC & 0xFF;
\r
468 if(sUnicodeTagTable[hiByte])
\r
469 // add quote Unicode tag
\r
470 byteBuffer[bytePos++] = (byte) UQUOTEU;
\r
472 byteBuffer[bytePos++] = (byte) hiByte;
\r
473 byteBuffer[bytePos++] = (byte) loByte;
\r
475 fMode = UNICODEMODE;
\r
476 break singleByteModeLoop;
\r
480 // if the char is in a currently defined dynamic
\r
481 // window, figure out which one, and either switch to
\r
482 // it or quote from it
\r
483 else if((whichWindow = findDynamicWindow(curUC))
\r
484 != INVALIDWINDOW ) {
\r
486 if( (ucPos + 1) < charBufferLimit )
\r
487 forwardUC = charBuffer[ucPos + 1];
\r
489 forwardUC = INVALIDCHAR;
\r
491 // all three chars in same window, switch to that
\r
492 // window inDynamicWindow will return false for
\r
494 if(inDynamicWindow(nextUC, whichWindow)
\r
495 && inDynamicWindow(forwardUC, whichWindow)) {
\r
496 // make sure there is enough room to
\r
497 // write both bytes if not, rewind the
\r
498 // source stream and break out
\r
499 if( (bytePos + 1) >= byteBufferLimit)
\r
500 { --ucPos; break mainLoop; }
\r
502 byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
\r
503 byteBuffer[bytePos++] = (byte)
\r
504 (curUC - fOffsets[whichWindow]
\r
505 + COMPRESSIONOFFSET);
\r
506 fTimeStamps [ whichWindow ] = ++fTimeStamp;
\r
507 fCurrentWindow = whichWindow;
\r
510 // either only next char or neither in same
\r
511 // window, so quote
\r
513 // make sure there is enough room to
\r
514 // write both bytes if not, rewind the
\r
515 // source stream and break out
\r
516 if((bytePos + 1) >= byteBufferLimit)
\r
517 { --ucPos; break mainLoop; }
\r
519 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
\r
520 byteBuffer[bytePos++] = (byte)
\r
521 (curUC - fOffsets[whichWindow]
\r
522 + COMPRESSIONOFFSET);
\r
526 // if a static window is defined, and the following
\r
527 // character is not in that static window, quote from
\r
528 // the static window Note: to quote from a static
\r
529 // window, don't add 0x80
\r
530 else if((whichWindow = findStaticWindow(curUC))
\r
532 && ! inStaticWindow(nextUC, whichWindow) ) {
\r
533 // make sure there is enough room to write both
\r
534 // bytes if not, rewind the source stream and
\r
536 if((bytePos + 1) >= byteBufferLimit)
\r
537 { --ucPos; break mainLoop; }
\r
539 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
\r
540 byteBuffer[bytePos++] = (byte)
\r
541 (curUC - sOffsets[whichWindow]);
\r
544 // if a window is not defined, decide if we want to
\r
545 // define a new one or switch to unicode mode
\r
547 // determine index for current char (char is compressible)
\r
548 curIndex = makeIndex(curUC);
\r
549 fIndexCount[curIndex]++;
\r
552 if((ucPos + 1) < charBufferLimit)
\r
553 forwardUC = charBuffer[ucPos + 1];
\r
555 forwardUC = INVALIDCHAR;
\r
557 // if we have encountered this index at least once
\r
558 // before, define a new window
\r
560 // three chars in a row with same index, define a
\r
561 // new window (makeIndex will return RESERVEDINDEX
\r
562 // for INVALIDCHAR)
\r
563 if((fIndexCount[curIndex] > 1) ||
\r
564 (curIndex == makeIndex(nextUC)
\r
565 && curIndex == makeIndex(forwardUC))) {
\r
566 // make sure there is enough room to write all
\r
567 // three bytes if not, rewind the source
\r
568 // stream and break out
\r
569 if( (bytePos + 2) >= byteBufferLimit)
\r
570 { --ucPos; break mainLoop; }
\r
572 // get least recently defined window
\r
573 whichWindow = getLRDefinedWindow();
\r
575 byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
\r
576 byteBuffer[bytePos++] = (byte) curIndex;
\r
577 byteBuffer[bytePos++] = (byte)
\r
578 (curUC - sOffsetTable[curIndex]
\r
579 + COMPRESSIONOFFSET);
\r
581 fOffsets[whichWindow] = sOffsetTable[curIndex];
\r
582 fCurrentWindow = whichWindow;
\r
583 fTimeStamps [whichWindow] = ++fTimeStamp;
\r
586 // only two chars in a row with same index, so
\r
587 // switch to unicode mode (makeIndex will return
\r
588 // RESERVEDINDEX for INVALIDCHAR)
\r
590 // three chars have different indices, so switch
\r
593 // make sure there is enough room to write all
\r
594 // four bytes if not, rewind the source stream
\r
596 if((bytePos + 3) >= byteBufferLimit)
\r
597 { --ucPos; break mainLoop; }
\r
599 byteBuffer[bytePos++] = (byte) SCHANGEU;
\r
601 hiByte = curUC >>> 8;
\r
602 loByte = curUC & 0xFF;
\r
604 if(sUnicodeTagTable[hiByte])
\r
605 // add quote Unicode tag
\r
606 byteBuffer[bytePos++] = (byte) UQUOTEU;
\r
608 byteBuffer[bytePos++] = (byte) hiByte;
\r
609 byteBuffer[bytePos++] = (byte) loByte;
\r
611 fMode = UNICODEMODE;
\r
612 break singleByteModeLoop;
\r
619 // main unicode mode compression loop
\r
621 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
\r
622 // get current char
\r
623 curUC = charBuffer[ucPos++];
\r
626 if( ucPos < charBufferLimit )
\r
627 nextUC = charBuffer[ucPos];
\r
629 nextUC = INVALIDCHAR;
\r
631 // if we have two uncompressible chars in a row,
\r
632 // put the current char's bytes in the stream
\r
633 if( ! isCompressible(curUC)
\r
634 || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
\r
635 // make sure there is enough room to write all three bytes
\r
636 // if not, rewind the source stream and break out
\r
637 if( (bytePos + 2) >= byteBufferLimit)
\r
638 { --ucPos; break mainLoop; }
\r
640 hiByte = curUC >>> 8;
\r
641 loByte = curUC & 0xFF;
\r
643 if(sUnicodeTagTable[ hiByte ])
\r
644 // add quote Unicode tag
\r
645 byteBuffer[bytePos++] = (byte) UQUOTEU;
\r
647 byteBuffer[bytePos++] = (byte) hiByte;
\r
648 byteBuffer[bytePos++] = (byte) loByte;
\r
651 // bytes less than 0x80 can go straight in the stream,
\r
652 // but in single-byte mode
\r
653 else if(curUC < 0x0080) {
\r
654 loByte = curUC & 0xFF;
\r
656 // if two chars in a row below 0x80 and the
\r
657 // current char is not a single-byte mode tag,
\r
658 // switch to single-byte mode
\r
659 if(nextUC != INVALIDCHAR
\r
660 && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
\r
661 // make sure there is enough room to
\r
662 // write both bytes if not, rewind the
\r
663 // source stream and break out
\r
664 if( (bytePos + 1) >= byteBufferLimit)
\r
665 { --ucPos; break mainLoop; }
\r
667 // use the last-active window
\r
668 whichWindow = fCurrentWindow;
\r
669 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
\r
670 byteBuffer[bytePos++] = (byte) loByte;
\r
672 //fCurrentWindow = 0;
\r
673 fTimeStamps [whichWindow] = ++fTimeStamp;
\r
674 fMode = SINGLEBYTEMODE;
\r
675 break unicodeModeLoop;
\r
678 // otherwise, just write the bytes to the stream
\r
679 // (this will cover the case of only 1 char less than 0x80
\r
680 // and single-byte mode tags)
\r
682 // make sure there is enough room to
\r
683 // write both bytes if not, rewind the
\r
684 // source stream and break out
\r
685 if((bytePos + 1) >= byteBufferLimit)
\r
686 { --ucPos; break mainLoop; }
\r
688 // since the character is less than 0x80, the
\r
689 // high byte is always 0x00 - no need for
\r
691 byteBuffer[bytePos++] = (byte) 0x00;
\r
692 byteBuffer[bytePos++] = (byte) loByte;
\r
696 // figure out if the current char is in a defined window
\r
697 else if((whichWindow = findDynamicWindow(curUC))
\r
698 != INVALIDWINDOW ) {
\r
699 // if two chars in a row in the same window,
\r
700 // switch to that window and go to single-byte mode
\r
701 // inDynamicWindow will return false for INVALIDCHAR
\r
702 if(inDynamicWindow(nextUC, whichWindow)) {
\r
703 // make sure there is enough room to
\r
704 // write both bytes if not, rewind the
\r
705 // source stream and break out
\r
706 if((bytePos + 1) >= byteBufferLimit)
\r
707 { --ucPos; break mainLoop; }
\r
709 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
\r
710 byteBuffer[bytePos++] = (byte)
\r
711 (curUC - fOffsets[whichWindow]
\r
712 + COMPRESSIONOFFSET);
\r
714 fTimeStamps [ whichWindow ] = ++fTimeStamp;
\r
715 fCurrentWindow = whichWindow;
\r
716 fMode = SINGLEBYTEMODE;
\r
717 break unicodeModeLoop;
\r
720 // otherwise, just quote the unicode for the char
\r
722 // make sure there is enough room to
\r
723 // write all three bytes if not,
\r
724 // rewind the source stream and break
\r
726 if((bytePos + 2) >= byteBufferLimit)
\r
727 { --ucPos; break mainLoop; }
\r
729 hiByte = curUC >>> 8;
\r
730 loByte = curUC & 0xFF;
\r
732 if(sUnicodeTagTable[ hiByte ])
\r
733 // add quote Unicode tag
\r
734 byteBuffer[bytePos++] = (byte) UQUOTEU;
\r
736 byteBuffer[bytePos++] = (byte) hiByte;
\r
737 byteBuffer[bytePos++] = (byte) loByte;
\r
741 // char is not in a defined window
\r
743 // determine index for current char (char is compressible)
\r
744 curIndex = makeIndex(curUC);
\r
745 fIndexCount[curIndex]++;
\r
748 if( (ucPos + 1) < charBufferLimit )
\r
749 forwardUC = charBuffer[ucPos + 1];
\r
751 forwardUC = INVALIDCHAR;
\r
753 // if we have encountered this index at least once
\r
754 // before, define a new window for it that hasn't
\r
755 // previously been redefined
\r
757 // if three chars in a row with the same index,
\r
758 // define a new window (makeIndex will return
\r
759 // RESERVEDINDEX for INVALIDCHAR)
\r
760 if((fIndexCount[curIndex] > 1) ||
\r
761 (curIndex == makeIndex(nextUC)
\r
762 && curIndex == makeIndex(forwardUC))) {
\r
763 // make sure there is enough room to
\r
764 // write all three bytes if not,
\r
765 // rewind the source stream and break
\r
767 if((bytePos + 2) >= byteBufferLimit)
\r
768 { --ucPos; break mainLoop; }
\r
770 // get least recently defined window
\r
771 whichWindow = getLRDefinedWindow();
\r
773 byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
\r
774 byteBuffer[bytePos++] = (byte) curIndex;
\r
775 byteBuffer[bytePos++] = (byte)
\r
776 (curUC - sOffsetTable[curIndex]
\r
777 + COMPRESSIONOFFSET);
\r
779 fOffsets[whichWindow] = sOffsetTable[curIndex];
\r
780 fCurrentWindow = whichWindow;
\r
781 fTimeStamps [whichWindow] = ++fTimeStamp;
\r
782 fMode = SINGLEBYTEMODE;
\r
783 break unicodeModeLoop;
\r
786 // otherwise just quote the unicode, and save our
\r
787 // windows for longer runs
\r
789 // make sure there is enough room to
\r
790 // write all three bytes if not,
\r
791 // rewind the source stream and break
\r
793 if((bytePos + 2) >= byteBufferLimit)
\r
794 { --ucPos; break mainLoop; }
\r
796 hiByte = curUC >>> 8;
\r
797 loByte = curUC & 0xFF;
\r
799 if(sUnicodeTagTable[ hiByte ])
\r
800 // add quote Unicode tag
\r
801 byteBuffer[bytePos++] = (byte) UQUOTEU;
\r
803 byteBuffer[bytePos++] = (byte) hiByte;
\r
804 byteBuffer[bytePos++] = (byte) loByte;
\r
811 // fill in output parameter
\r
812 if(charsRead != null)
\r
813 charsRead [0] = (ucPos - charBufferStart);
\r
815 // return # of bytes written
\r
816 return (bytePos - byteBufferStart);
\r
820 * Reset the compressor to its initial state.
\r
823 public void reset()
\r
827 // reset dynamic windows
\r
828 fOffsets[0] = 0x0080; // Latin-1
\r
829 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A
\r
830 fOffsets[2] = 0x0400; // Cyrillic
\r
831 fOffsets[3] = 0x0600; // Arabic
\r
832 fOffsets[4] = 0x0900; // Devanagari
\r
833 fOffsets[5] = 0x3040; // Hiragana
\r
834 fOffsets[6] = 0x30A0; // Katakana
\r
835 fOffsets[7] = 0xFF00; // Fullwidth ASCII
\r
838 // reset time stamps
\r
839 for(i = 0; i < NUMWINDOWS; i++) {
\r
840 fTimeStamps[i] = 0;
\r
843 // reset count of seen indices
\r
844 for(i = 0; i <= MAXINDEX; i++ ) {
\r
845 fIndexCount[i] = 0;
\r
848 fTimeStamp = 0; // Reset current time stamp
\r
849 fCurrentWindow = 0; // Make current window Latin-1
\r
850 fMode = SINGLEBYTEMODE; // Always start in single-byte mode
\r
853 //==========================
\r
854 // Determine the index for a character
\r
855 //==========================
\r
858 * Create the index value for a character.
\r
859 * For more information on this function, refer to table X-3
\r
860 * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
\r
861 * @param c The character in question.
\r
862 * @return An index for c
\r
864 private static int makeIndex(int c)
\r
866 // check the predefined indices
\r
867 if(c >= 0x00C0 && c < 0x0140)
\r
869 else if(c >= 0x0250 && c < 0x02D0)
\r
870 return IPAEXTENSIONINDEX;
\r
871 else if(c >= 0x0370 && c < 0x03F0)
\r
873 else if(c >= 0x0530 && c < 0x0590)
\r
874 return ARMENIANINDEX;
\r
875 else if(c >= 0x3040 && c < 0x30A0)
\r
876 return HIRAGANAINDEX;
\r
877 else if(c >= 0x30A0 && c < 0x3120)
\r
878 return KATAKANAINDEX;
\r
879 else if(c >= 0xFF60 && c < 0xFF9F)
\r
880 return HALFWIDTHKATAKANAINDEX;
\r
883 else if(c >= 0x0080 && c < 0x3400)
\r
884 return (c / 0x80) & 0xFF;
\r
885 else if(c >= 0xE000 && c <= 0xFFFF)
\r
886 return ((c - 0xAC00) / 0x80) & 0xFF;
\r
888 // should never happen
\r
890 return RESERVEDINDEX;
\r
894 //==========================
\r
895 // Check if a given character fits in a window
\r
896 //==========================
\r
899 * Determine if a character is in a dynamic window.
\r
900 * @param c The character to test
\r
901 * @param whichWindow The dynamic window the test
\r
902 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
\r
905 private boolean inDynamicWindow(int c,
\r
908 return (c >= fOffsets[whichWindow]
\r
909 && c < (fOffsets[whichWindow] + 0x80));
\r
913 * Determine if a character is in a static window.
\r
914 * @param c The character to test
\r
915 * @param whichWindow The static window the test
\r
916 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
\r
919 private static boolean inStaticWindow(int c,
\r
922 return (c >= sOffsets[whichWindow]
\r
923 && c < (sOffsets[whichWindow] + 0x80));
\r
926 //==========================
\r
927 // Check if a given character is compressible
\r
928 //==========================
\r
931 * Determine if a character is compressible.
\r
932 * @param c The character to test.
\r
933 * @return true if the <TT>c</TT> is compressible, false otherwise.
\r
935 private static boolean isCompressible(int c)
\r
937 return (c < 0x3400 || c >= 0xE000);
\r
940 //==========================
\r
941 // Check if a window is defined for a given character
\r
942 //==========================
\r
945 * Determine if a dynamic window for a certain character is defined
\r
946 * @param c The character in question
\r
947 * @return The dynamic window containing <TT>c</TT>, or
\r
948 * INVALIDWINDOW if not defined.
\r
950 private int findDynamicWindow(int c)
\r
952 // supposedly faster to count down
\r
953 //for(int i = 0; i < NUMWINDOWS; i++) {
\r
954 for(int i = NUMWINDOWS - 1; i >= 0; --i) {
\r
955 if(inDynamicWindow(c, i)) {
\r
961 return INVALIDWINDOW;
\r
965 * Determine if a static window for a certain character is defined
\r
966 * @param c The character in question
\r
967 * @return The static window containing <TT>c</TT>, or
\r
968 * INVALIDWINDOW if not defined.
\r
970 private static int findStaticWindow(int c)
\r
972 // supposedly faster to count down
\r
973 //for(int i = 0; i < NUMSTATICWINDOWS; i++) {
\r
974 for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
\r
975 if(inStaticWindow(c, i)) {
\r
980 return INVALIDWINDOW;
\r
983 //==========================
\r
984 // Find the least-recently used window
\r
985 //==========================
\r
987 /** Find the least-recently defined window */
\r
988 private int getLRDefinedWindow()
\r
990 int leastRU = Integer.MAX_VALUE;
\r
991 int whichWindow = INVALIDWINDOW;
\r
993 // find least recently used window
\r
994 // supposedly faster to count down
\r
995 //for( int i = 0; i < NUMWINDOWS; i++ ) {
\r
996 for(int i = NUMWINDOWS - 1; i >= 0; --i ) {
\r
997 if( fTimeStamps[i] < leastRU ) {
\r
998 leastRU = fTimeStamps[i];
\r
1003 return whichWindow;
\r