2 *******************************************************************************
\r
3 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
11 * A decompression engine implementing the Standard Compression Scheme
\r
12 * for Unicode (SCSU) as outlined in <A
\r
13 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
\r
16 * <P><STRONG>USAGE</STRONG></P>
\r
18 * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
\r
19 * straightforward manner to decompress simple strings:</P>
\r
22 * byte [] compressed = ... ; // get compressed bytes from somewhere
\r
23 * String result = UnicodeDecompressor.decompress(compressed);
\r
26 * <P>The static methods have a fairly large memory footprint.
\r
27 * For finer-grained control over memory usage,
\r
28 * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
\r
29 * iterative decompression:</P>
\r
32 * // Decompress an array "bytes" of length "len" using a buffer of 512 chars
\r
33 * // to the Writer "out"
\r
35 * UnicodeDecompressor myDecompressor = new UnicodeDecompressor();
\r
36 * final static int BUFSIZE = 512;
\r
37 * char [] charBuffer = new char [ BUFSIZE ];
\r
38 * int charsWritten = 0;
\r
39 * int [] bytesRead = new int [1];
\r
40 * int totalBytesDecompressed = 0;
\r
41 * int totalCharsWritten = 0;
\r
44 * // do the decompression
\r
45 * charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed,
\r
47 * charBuffer, 0, BUFSIZE);
\r
49 * // do something with the current set of chars
\r
50 * out.write(charBuffer, 0, charsWritten);
\r
52 * // update the no. of bytes decompressed
\r
53 * totalBytesDecompressed += bytesRead[0];
\r
55 * // update the no. of chars written
\r
56 * totalCharsWritten += charsWritten;
\r
58 * } while(totalBytesDecompressed < len);
\r
60 * myDecompressor.reset(); // reuse decompressor
\r
63 * <P>Decompression is performed according to the standard set forth in
\r
64 * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
\r
67 * @see UnicodeCompressor
\r
69 * @author Stephen F. Booth
\r
72 public final class UnicodeDecompressor implements SCSU
\r
74 //==========================
\r
75 // Instance variables
\r
76 //==========================
\r
// NOTE(review): this listing carries embedded line numbers and has dropped
// lines; the comments below describe only what the visible code shows.
78 /** Index of the currently selected dynamic window (used to index fOffsets) */
\r
79 private int fCurrentWindow = 0;
\r
81 /** Dynamic compression window offsets, one per window; reset() restores the defaults */
\r
82 private int [] fOffsets = new int [ NUMWINDOWS ];
\r
84 /** Current compression mode (SINGLEBYTEMODE or UNICODEMODE) */
\r
85 private int fMode = SINGLEBYTEMODE;
\r
87 /** Size, in bytes, of our internal buffer (presumably one tag plus two operand bytes) */
\r
88 private final static int BUFSIZE = 3;
\r
90 /** Internal buffer for saving state: input bytes carried over between decompress() calls */
\r
91 private byte [] fBuffer = new byte [BUFSIZE];
\r
93 /** Number of bytes currently held in our internal buffer */
\r
94 private int fBufferLength = 0;
\r
98 * Create a UnicodeDecompressor.
\r
99 * Sets all windows to their default values.
\r
103 public UnicodeDecompressor(){
\r
// reset() assigns the eight default window offsets, selects window 0,
// switches to single-byte mode and marks the internal buffer as empty.
104 reset(); // initialize to defaults
\r
108 * Decompress a byte array into a String.
\r
109 * @param buffer The byte array to decompress.
\r
110 * @return A String containing the decompressed characters.
\r
111 * @see #decompress(byte [], int, int)
\r
114 public static String decompress(byte [] buffer){
\r
// Convenience wrapper: decompress the whole array via decompress(byte[], int, int)
// and wrap the resulting chars in a String. A null buffer fails on buffer.length.
115 char [] buf = decompress(buffer, 0, buffer.length);
\r
116 return new String(buf);
\r
120 * Decompress a byte array into a Unicode character array.
\r
121 * @param buffer The byte array to decompress.
\r
122 * @param start The start of the byte run to decompress.
\r
123 * @param limit The limit of the byte run to decompress.
\r
124 * @return A character array containing the decompressed bytes.
\r
125 * @see #decompress(byte [])
\r
128 public static char [] decompress(byte [] buffer, int start, int limit) {
\r
// One-shot decompression: a fresh decompressor is created per call, so no
// window or mode state is shared between invocations.
129 UnicodeDecompressor comp = new UnicodeDecompressor();
\r
131 // use a buffer we know will never overflow
\r
132 // in the worst case, each byte will decompress
\r
133 // to a surrogate pair (buffer must be at least 2 chars)
\r
// Math.max keeps the scratch buffer at 2 chars even for an empty byte run,
// because the instance decompress() rejects a char range shorter than 2.
134 int len = Math.max(2, 2 * (limit - start));
\r
135 char [] temp = new char [len];
\r
// bytesRead is passed as null: the caller does not need the consumed-byte count.
// NOTE(review): the remaining arguments of this call and the method's return
// statement are not visible in this listing (embedded numbers jump 137 -> 140
// and stop at 141) -- confirm against the full file.
137 int charCount = comp.decompress(buffer, start, limit, null,
\r
// trim the oversized scratch buffer down to the chars actually produced
140 char [] result = new char [charCount];
\r
141 System.arraycopy(temp, 0, result, 0, charCount);
\r
146 * Decompress a byte array into a Unicode character array.
\r
148 * This function will either completely fill the output buffer,
\r
149 * or consume the entire input.
\r
151 * @param byteBuffer The byte buffer to decompress.
\r
152 * @param byteBufferStart The start of the byte run to decompress.
\r
153 * @param byteBufferLimit The limit of the byte run to decompress.
\r
154 * @param bytesRead A one-element array. If not null, on return
\r
155 * the number of bytes read from byteBuffer.
\r
156 * @param charBuffer A buffer to receive the decompressed data.
\r
157 * This buffer must be at minimum two characters in size.
\r
158 * @param charBufferStart The starting offset to which to write
\r
159 * decompressed data.
\r
160 * @param charBufferLimit The limiting offset for writing
\r
161 * decompressed data.
\r
162 * @return The number of Unicode characters written to charBuffer.
\r
165 public int decompress(byte [] byteBuffer,
\r
166 int byteBufferStart,
\r
167 int byteBufferLimit,
\r
169 char [] charBuffer,
\r
170 int charBufferStart,
\r
171 int charBufferLimit)
\r
// Incremental decoder. Consumes bytes from byteBuffer in
// [byteBufferStart, byteBufferLimit) and writes chars to charBuffer in
// [charBufferStart, charBufferLimit); the loops stop as soon as either range
// is exhausted. fMode, fCurrentWindow, fOffsets and fBufferLength persist
// across calls, so a stream can be decoded in several pieces.
// NOTE(review): this listing has dropped lines (the embedded numbering skips,
// e.g. 167 -> 169 and 220 -> 222). The bytesRead parameter used at the end of
// the method, the declarations of aByte/newBytes, the switch headers and the
// break statements are presumably on those lines -- confirm against the full file.
173 // the current position in the source byte buffer
\r
174 int bytePos = byteBufferStart;
\r
176 // the current position in the target char buffer
\r
177 int ucPos = charBufferStart;
\r
179 // the current byte from the source buffer
\r
183 // charBuffer must be at least 2 chars in size
\r
// (both the array itself and the usable range; a surrogate pair needs two
// slots. Note the message below mentions only charBuffer.length.)
184 if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2)
\r
185 throw new IllegalArgumentException("charBuffer.length < 2");
\r
187 // if our internal buffer isn't empty, flush its contents
\r
188 // to the output buffer before doing any more decompression
\r
189 if(fBufferLength > 0) {
\r
193 // fill the buffer completely, to guarantee one full character
\r
194 if(fBufferLength != BUFSIZE) {
\r
195 newBytes = fBuffer.length - fBufferLength;
\r
197 // verify there are newBytes bytes in byteBuffer
\r
198 if(byteBufferLimit - byteBufferStart < newBytes)
\r
199 newBytes = byteBufferLimit - byteBufferStart;
\r
// append the fresh input bytes after the bytes saved by the previous call
201 System.arraycopy(byteBuffer, byteBufferStart,
\r
202 fBuffer, fBufferLength, newBytes);
\r
205 // reset buffer length to 0 before recursive call
\r
208 // call self recursively to decompress the buffer
\r
// NOTE(review): when the input supplied fewer than the requested bytes,
// fBuffer is only partly refilled, yet fBuffer.length is still passed as the
// limit here -- confirm that trailing stale bytes cannot be decoded.
209 int count = decompress(fBuffer, 0, fBuffer.length, null,
\r
210 charBuffer, charBufferStart,
\r
213 // update the positions into the arrays
\r
215 bytePos += newBytes;
\r
218 // the main decompression loop
\r
220 while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
\r
222 case SINGLEBYTEMODE:
\r
223 // single-byte mode decompression loop
\r
224 singleByteModeLoop:
\r
225 while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
\r
// mask to 0..255 so the byte can be matched against the case labels below
226 aByte = byteBuffer[bytePos++] & 0xFF;
\r
228 // All bytes from 0x80 through 0xFF are remapped
\r
229 // to chars or surrogate pairs according to the
\r
230 // currently active window
\r
231 case 0x80: case 0x81: case 0x82: case 0x83: case 0x84:
\r
232 case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
\r
233 case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
\r
234 case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
\r
235 case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
\r
236 case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
\r
237 case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
\r
238 case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
\r
239 case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
\r
240 case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
\r
241 case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
\r
242 case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
\r
243 case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
\r
244 case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
\r
245 case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
\r
246 case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
\r
247 case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
\r
248 case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
\r
249 case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
\r
250 case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
\r
251 case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
\r
252 case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
\r
253 case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
\r
254 case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
\r
255 case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
\r
256 case 0xFD: case 0xFE: case 0xFF:
\r
257 // For offsets <= 0xFFFF, convert to a single char
\r
258 // by adding the window's offset and subtracting
\r
259 // the generic compression offset
\r
260 if(fOffsets[ fCurrentWindow ] <= 0xFFFF) {
\r
261 charBuffer[ucPos++] = (char)
\r
262 (aByte + fOffsets[ fCurrentWindow ]
\r
263 - COMPRESSIONOFFSET);
\r
265 // For offsets >= 0x10000 (i.e. not <= 0xFFFF), convert to a surrogate pair by
\r
266 // normBase = window's offset - 0x10000
\r
267 // high surr. = 0xD800 + (normBase >> 10)
\r
268 // low surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
\r
270 // make sure there is enough room to write
\r
271 // both characters
\r
272 // if not, save state and break out
\r
273 if((ucPos + 1) >= charBufferLimit) {
\r
// Save-state pattern (repeated at every "save state" site below): the unread
// input is copied away, fBufferLength records how many bytes were saved, and
// bytePos is advanced to byteBufferLimit so the loops end and all input is
// reported as consumed.
// NOTE(review): the arraycopy destination (presumably fBuffer) and any
// bytePos adjustment made before the copy are on lines missing from this listing.
275 System.arraycopy(byteBuffer, bytePos,
\r
277 byteBufferLimit - bytePos);
\r
278 fBufferLength = byteBufferLimit - bytePos;
\r
279 bytePos += fBufferLength;
\r
283 int normalizedBase = fOffsets[ fCurrentWindow ]
\r
285 charBuffer[ucPos++] = (char)
\r
286 (0xD800 + (normalizedBase >> 10));
\r
287 charBuffer[ucPos++] = (char)
\r
288 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F));
\r
292 // bytes from 0x20 through 0x7F are treated as ASCII and
\r
293 // are remapped to chars by padding the high byte
\r
294 // (this is the same as quoting from static window 0)
\r
295 // NUL (0x00), HT (0x09), LF (0x0A), CR (0x0D)
\r
296 // are treated as ASCII as well
\r
297 case 0x00: case 0x09: case 0x0A: case 0x0D:
\r
298 case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
\r
299 case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
\r
300 case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
\r
301 case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
\r
302 case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
\r
303 case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
\r
304 case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
\r
305 case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
\r
306 case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
\r
307 case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
\r
308 case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
\r
309 case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
\r
310 case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
\r
311 case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
\r
312 case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
\r
313 case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
\r
314 case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
\r
315 case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
\r
316 case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
\r
318 charBuffer[ucPos++] = (char) aByte;
\r
// NOTE(review): the case label for the following branch is not visible in this listing.
323 // verify we have two bytes following tag
\r
324 // if not, save state and break out
\r
325 if( (bytePos + 1) >= byteBufferLimit ) {
\r
327 System.arraycopy(byteBuffer, bytePos,
\r
329 byteBufferLimit - bytePos);
\r
330 fBufferLength = byteBufferLimit - bytePos;
\r
331 bytePos += fBufferLength;
\r
// high byte is read unmasked (sign-extended); this is harmless because the
// (char) cast below keeps only the low 16 bits of the combined value
335 aByte = byteBuffer[bytePos++];
\r
336 charBuffer[ucPos++] = (char)
\r
337 (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
\r
340 // switch to Unicode mode
\r
// leaving the inner loop lets the outer loop re-dispatch on the new fMode
342 fMode = UNICODEMODE;
\r
343 break singleByteModeLoop;
\r
346 // handle all quote tags
\r
347 case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
\r
348 case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
\r
349 // verify there is a byte following the tag
\r
350 // if not, save state and break out
\r
351 if(bytePos >= byteBufferLimit) {
\r
353 System.arraycopy(byteBuffer, bytePos,
\r
355 byteBufferLimit - bytePos);
\r
356 fBufferLength = byteBufferLimit - bytePos;
\r
357 bytePos += fBufferLength;
\r
361 // if the byte is in the range 0x00 - 0x7F, use
\r
362 // static window n otherwise, use dynamic window n
\r
// (dByte is already masked to 0..255, so the ">= 0x00" test below is always
// true; only "< 0x80" selects between the two windows. fCurrentWindow is
// not modified by a quote.)
363 int dByte = byteBuffer[bytePos++] & 0xFF;
\r
364 charBuffer[ucPos++] = (char)
\r
365 (dByte+ (dByte >= 0x00 && dByte < 0x80
\r
366 ? sOffsets[aByte - SQUOTE0]
\r
367 : (fOffsets[aByte - SQUOTE0]
\r
368 - COMPRESSIONOFFSET)));
\r
371 // handle all change tags
\r
372 case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
\r
373 case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
\r
// the window number is the tag's distance from SCHANGE0
374 fCurrentWindow = aByte - SCHANGE0;
\r
377 // handle all define tags
\r
378 case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
\r
379 case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
\r
380 // verify there is a byte following the tag
\r
381 // if not, save state and break out
\r
382 if(bytePos >= byteBufferLimit) {
\r
384 System.arraycopy(byteBuffer, bytePos,
\r
386 byteBufferLimit - bytePos);
\r
387 fBufferLength = byteBufferLimit - bytePos;
\r
388 bytePos += fBufferLength;
\r
// select window n and set its offset from sOffsetTable, indexed by the
// operand byte (sOffsetTable is declared outside this listing)
392 fCurrentWindow = aByte - SDEFINE0;
\r
393 fOffsets[fCurrentWindow] =
\r
394 sOffsetTable[byteBuffer[bytePos++] & 0xFF];
\r
397 // handle define extended tag
\r
399 // verify we have two bytes following tag
\r
400 // if not, save state and break out
\r
401 if((bytePos + 1) >= byteBufferLimit ) {
\r
403 System.arraycopy(byteBuffer, bytePos,
\r
405 byteBufferLimit - bytePos);
\r
406 fBufferLength = byteBufferLimit - bytePos;
\r
407 bytePos += fBufferLength;
\r
// top 3 bits of the first operand byte pick the window; its low 5 bits and
// the second operand byte form a 13-bit index, scaled by 0x80 above 0x10000
411 aByte = byteBuffer[bytePos++] & 0xFF;
\r
412 fCurrentWindow = (aByte & 0xE0) >> 5;
\r
413 fOffsets[fCurrentWindow] = 0x10000 +
\r
414 (0x80 * (((aByte & 0x1F) << 8)
\r
415 | (byteBuffer[bytePos++] & 0xFF)));
\r
418 // reserved, shouldn't happen
\r
427 // unicode mode decompression loop
\r
429 while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
\r
430 aByte = byteBuffer[bytePos++] & 0xFF;
\r
432 // handle all define tags
\r
433 case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
\r
434 case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
\r
435 // verify there is a byte following tag
\r
436 // if not, save state and break out
\r
437 if(bytePos >= byteBufferLimit ) {
\r
439 System.arraycopy(byteBuffer, bytePos,
\r
441 byteBufferLimit - bytePos);
\r
442 fBufferLength = byteBufferLimit - bytePos;
\r
443 bytePos += fBufferLength;
\r
// in Unicode mode, defining a window also switches back to single-byte mode
447 fCurrentWindow = aByte - UDEFINE0;
\r
448 fOffsets[fCurrentWindow] =
\r
449 sOffsetTable[byteBuffer[bytePos++] & 0xFF];
\r
450 fMode = SINGLEBYTEMODE;
\r
451 break unicodeModeLoop;
\r
454 // handle define extended tag
\r
456 // verify we have two bytes following tag
\r
457 // if not, save state and break out
\r
458 if((bytePos + 1) >= byteBufferLimit ) {
\r
460 System.arraycopy(byteBuffer, bytePos,
\r
462 byteBufferLimit - bytePos);
\r
463 fBufferLength = byteBufferLimit - bytePos;
\r
464 bytePos += fBufferLength;
\r
// same window/offset computation as the single-byte define-extended branch
468 aByte = byteBuffer[bytePos++] & 0xFF;
\r
469 fCurrentWindow = (aByte & 0xE0) >> 5;
\r
470 fOffsets[fCurrentWindow] = 0x10000 +
\r
471 (0x80 * (((aByte & 0x1F) << 8)
\r
472 | (byteBuffer[bytePos++] & 0xFF)));
\r
473 fMode = SINGLEBYTEMODE;
\r
474 break unicodeModeLoop;
\r
477 // handle all change tags
\r
478 case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
\r
479 case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
\r
480 fCurrentWindow = aByte - UCHANGE0;
\r
481 fMode = SINGLEBYTEMODE;
\r
482 break unicodeModeLoop;
\r
// NOTE(review): the case label for the following branch is not visible in this listing.
487 // verify we have two bytes following tag
\r
488 // if not, save state and break out
\r
489 if(bytePos >= byteBufferLimit - 1) {
\r
491 System.arraycopy(byteBuffer, bytePos,
\r
493 byteBufferLimit - bytePos);
\r
494 fBufferLength = byteBufferLimit - bytePos;
\r
495 bytePos += fBufferLength;
\r
// unmasked high byte again; the (char) cast discards the sign-extension bits
499 aByte = byteBuffer[bytePos++];
\r
500 charBuffer[ucPos++] = (char)
\r
501 (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
\r
505 // verify there is a byte following tag
\r
506 // if not, save state and break out
\r
507 if(bytePos >= byteBufferLimit ) {
\r
509 System.arraycopy(byteBuffer, bytePos,
\r
511 byteBufferLimit - bytePos);
\r
512 fBufferLength = byteBufferLimit - bytePos;
\r
513 bytePos += fBufferLength;
\r
// the byte already read is the high half, the next byte the low half
517 charBuffer[ucPos++] = (char)
\r
518 (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
\r
525 } // end switch( fMode )
\r
528 // fill in output parameter
\r
// (bytes consumed by this call, relative to byteBufferStart; skipped when
// the caller passed null)
529 if(bytesRead != null)
\r
530 bytesRead [0] = (bytePos - byteBufferStart);
\r
532 // return # of chars written
\r
533 return (ucPos - charBufferStart);
\r
537 * Reset the decompressor to its initial state.
\r
540 public void reset()
\r
// Restores every piece of decoder state that decompress() mutates, so the
// instance can be reused for a new, unrelated byte stream.
542 // reset dynamic windows
\r
543 fOffsets[0] = 0x0080; // Latin-1
\r
544 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A
\r
545 fOffsets[2] = 0x0400; // Cyrillic
\r
546 fOffsets[3] = 0x0600; // Arabic
\r
547 fOffsets[4] = 0x0900; // Devanagari
\r
548 fOffsets[5] = 0x3040; // Hiragana
\r
549 fOffsets[6] = 0x30A0; // Katakana
\r
550 fOffsets[7] = 0xFF00; // Fullwidth ASCII
\r
// setting fBufferLength to 0 discards any bytes saved from an unfinished
// sequence; the contents of fBuffer itself are not cleared
553 fCurrentWindow = 0; // Make current window Latin-1
\r
554 fMode = SINGLEBYTEMODE; // Always start in single-byte mode
\r
555 fBufferLength = 0; // Empty buffer
\r