2 *******************************************************************************
\r
3 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import com.ibm.icu.text.UCharacterIterator;
\r
12 * <p>Binary Ordered Compression for Unicode</p>
\r
14 * <p>Users are strongly encouraged to read the ICU paper on
\r
15 * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html">
\r
16 * BOCU</a> before attempting to use this class.</p>
\r
18 * <p>BOCU is used to compress unicode text into a stream of unsigned
\r
19 * bytes. For many kinds of text the compression compares favorably
\r
20 * to UTF-8, and for some kinds of text (such as CJK) it does better.
\r
21 * The resulting bytes will compare in the same order as the original
\r
22 * code points. The byte stream does not contain the values 0, 1, or
\r
25 * <p>One example of a use of BOCU is in
\r
26 * com.ibm.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with
\r
27 * collation strength IDENTICAL. The result CollationKey will consist of the
\r
28 * collation order of the source string followed by the BOCU result of the
\r
32 * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for
\r
33 * random access.</p>
\r
35 * <p>Method: Slope Detection<br> Remember the previous code point
\r
36 * (initial 0). For each code point in the string, encode the
\r
37 * difference with the previous one. Similar to a UTF, the length of
\r
38 * the byte sequence is encoded in the lead bytes. Unlike a UTF, the
\r
39 * trail byte values may overlap with lead/single byte values. The
\r
40 * signedness of the difference must be encoded as the most
\r
41 * significant part.</p>
\r
43 * <p>We encode differences with few bytes if their absolute values
\r
44 * are small. For correct ordering, we must treat the entire value
\r
45 * range -10ffff..+10ffff in ascending order, which forbids encoding
\r
46 * the sign and the absolute value separately. Instead, we split the
\r
47 * lead byte range in the middle and encode non-negative values going
\r
48 * up and negative values going down.</p>
\r
50 * <p>For very small absolute values, the difference is added to a
\r
51 * middle byte value for single-byte encoded differences. For
\r
52 * somewhat larger absolute values, the difference is divided by the
\r
53 * number of byte values available, the modulo is used for one trail
\r
54 * byte, and the remainder is added to a lead byte avoiding the
\r
55 * single-byte range. For large absolute values, the difference is
\r
56 * similarly encoded in three bytes. (Syn Wee, I need examples
\r
59 * <p>BOCU does not use byte values 0, 1, or 2, but uses all other
\r
60 * byte values for lead and single bytes, so that the middle range of
\r
61 * single bytes is as large as possible.</p>
\r
63 * <p>Note that the lead byte ranges overlap some, but that the
\r
64 * sequences as a whole are well ordered. I.e., even if the lead byte
\r
65 * is the same for sequences of different lengths, the trail bytes
\r
66 * establish correct order. It would be possible to encode slightly
\r
67 * larger ranges for each length (>1) by subtracting the lower bound
\r
68 * of the range. However, that would also slow down the calculation.
\r
69 * (Syn Wee, need an example).</p>
\r
71 * <p>For the actual string encoding, an optimization moves the
\r
72 * previous code point value to the middle of its Unicode script block
\r
73 * to minimize the differences in same-script text runs. (Syn Wee,
\r
74 * need an example.)</p>
\r
76 * @author Syn Wee Quek
\r
77 * @since release 2.2, May 3rd 2002
\r
81 // public constructors --------------------------------------------------
\r
83 // public methods -------------------------------------------------------
\r
86 * <p>Encode the code points of a string as a sequence of bytes,
\r
87 * preserving lexical order.</p>
\r
88 * <p>The minimum size of buffer required for the compression can be
\r
89 * preflighted by getCompressionLength(String).</p>
\r
90 * @param source text source
\r
91 * @param buffer output buffer
\r
92 * @param offset to start writing to
\r
93 * @return end offset where the writing stopped
\r
94 * @see #getCompressionLength(String)
\r
95 * @exception ArrayIndexOutOfBoundsException thrown if size of buffer is
\r
96 * too small for the output.
\r
98 public static int compress(String source, byte buffer[], int offset)
\r
101 UCharacterIterator iterator = UCharacterIterator.getInstance(source);
\r
102 int codepoint = iterator.nextCodePoint();
\r
103 while (codepoint != UCharacterIterator.DONE) {
\r
104 if (prev < 0x4e00 || prev >= 0xa000) {
\r
105 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
\r
108 // Unihan U+4e00..U+9fa5:
\r
109 // double-bytes down from the upper end
\r
110 prev = 0x9fff - SLOPE_REACH_POS_2_;
\r
113 offset = writeDiff(codepoint - prev, buffer, offset);
\r
115 codepoint = iterator.nextCodePoint();
\r
121 * Return the number of bytes that compress() would write.
\r
122 * @param source text source string
\r
123 * @return the length of the BOCU result
\r
124 * @see #compress(String, byte[], int)
\r
126 public static int getCompressionLength(String source)
\r
130 UCharacterIterator iterator = UCharacterIterator.getInstance(source);
\r
131 int codepoint = iterator.nextCodePoint();
\r
132 while (codepoint != UCharacterIterator.DONE) {
\r
133 if (prev < 0x4e00 || prev >= 0xa000) {
\r
134 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
\r
137 // Unihan U+4e00..U+9fa5:
\r
138 // double-bytes down from the upper end
\r
139 prev = 0x9fff - SLOPE_REACH_POS_2_;
\r
142 codepoint = iterator.nextCodePoint();
\r
143 result += lengthOfDiff(codepoint - prev);
\r
149 // public setter methods -------------------------------------------------
\r
151 // public getter methods ------------------------------------------------
\r
153 // public other methods -------------------------------------------------
\r
155 // protected constructor ------------------------------------------------
\r
157 // protected data members ------------------------------------------------
\r
159 // protected methods -----------------------------------------------------
\r
161 // private data members --------------------------------------------------
\r
164 * Do not use byte values 0, 1, 2 because they are separators in sort keys.
\r
166 private static final int SLOPE_MIN_ = 3;
\r
167 private static final int SLOPE_MAX_ = 0xff;
\r
168 private static final int SLOPE_MIDDLE_ = 0x81;
\r
169 private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
\r
170 //private static final int SLOPE_MAX_BYTES_ = 4;
\r
173 * Number of lead bytes:
\r
174 * 1 middle byte for 0
\r
175 * 2*80=160 single bytes for !=0
\r
176 * 2*42=84 for double-byte values
\r
177 * 2*3=6 for 3-byte values
\r
178 * 2*1=2 for 4-byte values
\r
180 * The sum must be <=SLOPE_TAIL_COUNT.
\r
182 * Why these numbers?
\r
183 * - There should be >=128 single-byte values to cover 128-blocks
\r
184 * with small scripts.
\r
185 * - There should be >=20902 single/double-byte values to cover Unihan.
\r
186 * - It helps CJK Extension B some if there are 3-byte values that cover
\r
187 * the distance between them and Unihan.
\r
188 * This also helps to jump among distant places in the BMP.
\r
189 * - Four-byte values are necessary to cover the rest of Unicode.
\r
191 * Symmetrical lead byte counts are for convenience.
\r
192 * With an equal distribution of even and odd differences there is also
\r
193 * no advantage to asymmetrical lead byte counts.
\r
195 private static final int SLOPE_SINGLE_ = 80;
\r
196 private static final int SLOPE_LEAD_2_ = 42;
\r
197 private static final int SLOPE_LEAD_3_ = 3;
\r
198 //private static final int SLOPE_LEAD_4_ = 1;
\r
201 * The difference value range for single-byters.
\r
203 private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
\r
204 private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);
\r
207 * The difference value range for double-byters.
\r
209 private static final int SLOPE_REACH_POS_2_ =
\r
210 SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
\r
211 private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);
\r
214 * The difference value range for 3-byters.
\r
216 private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_
\r
217 * SLOPE_TAIL_COUNT_
\r
218 * SLOPE_TAIL_COUNT_
\r
219 + (SLOPE_LEAD_3_ - 1)
\r
220 * SLOPE_TAIL_COUNT_ +
\r
221 (SLOPE_TAIL_COUNT_ - 1);
\r
222 private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);
\r
225 * The lead byte start values.
\r
227 private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
\r
228 + SLOPE_SINGLE_ + 1;
\r
229 private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
\r
231 private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ +
\r
232 SLOPE_REACH_NEG_1_;
\r
233 private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
\r
236 // private constructor ---------------------------------------------------
\r
239 * Constructor private to prevent initialization
\r
247 // private methods -------------------------------------------------------
\r
250 * Integer division and modulo with negative numerators
\r
251 * yields negative modulo results and quotients that are one more than
\r
252 * what we need here.
\r
253 * @param number which operations are to be performed on
\r
254 * @param factor the factor to use for division
\r
255 * @return (result of division) << 32 | modulo
\r
257 private static final long getNegDivMod(int number, int factor)
\r
259 int modulo = number % factor;
\r
260 long result = number / factor;
\r
265 return (result << 32) | modulo;
\r
269 * Encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes,
\r
270 * preserving lexical order
\r
272 * @param buffer byte buffer to append to
\r
273 * @param offset to the byte buffer to start appending
\r
274 * @return end offset where the appending stops
\r
276 private static final int writeDiff(int diff, byte buffer[], int offset)
\r
278 if (diff >= SLOPE_REACH_NEG_1_) {
\r
279 if (diff <= SLOPE_REACH_POS_1_) {
\r
280 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
\r
282 else if (diff <= SLOPE_REACH_POS_2_) {
\r
283 buffer[offset ++] = (byte)(SLOPE_START_POS_2_
\r
284 + (diff / SLOPE_TAIL_COUNT_));
\r
285 buffer[offset ++] = (byte)(SLOPE_MIN_ +
\r
286 (diff % SLOPE_TAIL_COUNT_));
\r
288 else if (diff <= SLOPE_REACH_POS_3_) {
\r
289 buffer[offset + 2] = (byte)(SLOPE_MIN_
\r
290 + (diff % SLOPE_TAIL_COUNT_));
\r
291 diff /= SLOPE_TAIL_COUNT_;
\r
292 buffer[offset + 1] = (byte)(SLOPE_MIN_
\r
293 + (diff % SLOPE_TAIL_COUNT_));
\r
294 buffer[offset] = (byte)(SLOPE_START_POS_3_
\r
295 + (diff / SLOPE_TAIL_COUNT_));
\r
299 buffer[offset + 3] = (byte)(SLOPE_MIN_
\r
300 + diff % SLOPE_TAIL_COUNT_);
\r
301 diff /= SLOPE_TAIL_COUNT_;
\r
302 buffer[offset] = (byte)(SLOPE_MIN_
\r
303 + diff % SLOPE_TAIL_COUNT_);
\r
304 diff /= SLOPE_TAIL_COUNT_;
\r
305 buffer[offset + 1] = (byte)(SLOPE_MIN_
\r
306 + diff % SLOPE_TAIL_COUNT_);
\r
307 buffer[offset] = (byte)SLOPE_MAX_;
\r
312 long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
\r
313 int modulo = (int)division;
\r
314 if (diff >= SLOPE_REACH_NEG_2_) {
\r
315 diff = (int)(division >> 32);
\r
316 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
\r
317 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
\r
319 else if (diff >= SLOPE_REACH_NEG_3_) {
\r
320 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
\r
321 diff = (int)(division >> 32);
\r
322 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
\r
323 modulo = (int)division;
\r
324 diff = (int)(division >> 32);
\r
325 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
\r
326 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
\r
330 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
\r
331 diff = (int)(division >> 32);
\r
332 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
\r
333 modulo = (int)division;
\r
334 diff = (int)(division >> 32);
\r
335 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
\r
336 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
\r
337 modulo = (int)division;
\r
338 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
\r
339 buffer[offset] = SLOPE_MIN_;
\r
347 * How many bytes would writeDiff() write?
\r
350 private static final int lengthOfDiff(int diff)
\r
352 if (diff >= SLOPE_REACH_NEG_1_) {
\r
353 if (diff <= SLOPE_REACH_POS_1_) {
\r
356 else if (diff <= SLOPE_REACH_POS_2_) {
\r
359 else if(diff <= SLOPE_REACH_POS_3_) {
\r
367 if (diff >= SLOPE_REACH_NEG_2_) {
\r
370 else if (diff >= SLOPE_REACH_NEG_3_) {
\r