jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/impl/BOCU.java

   1 /**\r
   2 *******************************************************************************\r
   3 * Copyright (C) 1996-2009, International Business Machines Corporation and    *\r
   4 * others. All Rights Reserved.                                                *\r
   5 *******************************************************************************\r
   6 */\r
   7 package com.ibm.icu.impl;\r
   8 \r
   9 import com.ibm.icu.text.UCharacterIterator;\r
  10 \r
  11 /**\r
  12  * <p>Binary Ordered Compression for Unicode</p>\r
  13  * \r
  14  * <p>Users are strongly encouraged to read the ICU paper on \r
  15  * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html">\r
  16  * BOCU</a> before attempting to use this class.</p>\r
  17  * \r
  18  * <p>BOCU is used to compress unicode text into a stream of unsigned\r
  19  * bytes.  For many kinds of text the compression compares favorably\r
  20  * to UTF-8, and for some kinds of text (such as CJK) it does better.\r
  21  * The resulting bytes will compare in the same order as the original\r
  22  * code points.  The byte stream does not contain the values 0, 1, or\r
  23  * 2.</p>\r
  24  * \r
  25  * <p>One example of a use of BOCU is in \r
  26  * com.ibm.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with \r
  27  * collation strength IDENTICAL. The result CollationKey will consist of the \r
  28  * collation order of the source string followed by the BOCU result of the \r
  29  * source string. \r
  30  * </p>\r
  31  *\r
  32  * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for\r
  33  * random access.</p>\r
  34  * \r
  35  * <p>Method: Slope Detection<br> Remember the previous code point\r
  36  * (initial 0).  For each code point in the string, encode the\r
  37  * difference with the previous one.  Similar to a UTF, the length of\r
  38  * the byte sequence is encoded in the lead bytes.  Unlike a UTF, the\r
  39  * trail byte values may overlap with lead/single byte values.  The\r
  40  * signedness of the difference must be encoded as the most\r
  41  * significant part.</p>\r
  42  *\r
  43  * <p>We encode differences with few bytes if their absolute values\r
  44  * are small.  For correct ordering, we must treat the entire value\r
  45  * range -10ffff..+10ffff in ascending order, which forbids encoding\r
  46  * the sign and the absolute value separately. Instead, we split the\r
  47  * lead byte range in the middle and encode non-negative values going\r
  48  * up and negative values going down.</p>\r
  49  *\r
  50  * <p>For very small absolute values, the difference is added to a\r
  51  * middle byte value for single-byte encoded differences.  For\r
  52  * somewhat larger absolute values, the difference is divided by the\r
  53  * number of byte values available, the modulo is used for one trail\r
  54  * byte, and the remainder is added to a lead byte avoiding the\r
  55  * single-byte range.  For large absolute values, the difference is\r
  56  * similarly encoded in three bytes. (Syn Wee, I need examples\r
  57  * here.)</p>\r
  58  *\r
  59  * <p>BOCU does not use byte values 0, 1, or 2, but uses all other\r
  60  * byte values for lead and single bytes, so that the middle range of\r
  61  * single bytes is as large as possible.</p>\r
  62  *\r
  63  * <p>Note that the lead byte ranges overlap some, but that the\r
  64  * sequences as a whole are well ordered. I.e., even if the lead byte\r
  65  * is the same for sequences of different lengths, the trail bytes\r
  66  * establish correct order.  It would be possible to encode slightly\r
  67  * larger ranges for each length (>1) by subtracting the lower bound\r
  68  * of the range. However, that would also slow down the calculation.\r
  69  * (Syn Wee, need an example).</p>\r
  70  *\r
  71  * <p>For the actual string encoding, an optimization moves the\r
  72  * previous code point value to the middle of its Unicode script block\r
  73  * to minimize the differences in same-script text runs.  (Syn Wee,\r
  74  * need an example.)</p>\r
  75  *\r
  76  * @author Syn Wee Quek\r
  77  * @since release 2.2, May 3rd 2002\r
  78  */\r
  79 public class BOCU \r
  80 {      \r
  81     // public constructors --------------------------------------------------\r
  82     \r
  83     // public methods -------------------------------------------------------\r
  84         \r
  85     /**\r
  86      * <p>Encode the code points of a string as a sequence of bytes,\r
  87      * preserving lexical order.</p>\r
  88      * <p>The minimum size of buffer required for the compression can be \r
  89      * preflighted by getCompressionLength(String).</p>\r
  90      * @param source text source\r
  91      * @param buffer output buffer\r
  92      * @param offset to start writing to\r
  93      * @return end offset where the writing stopped\r
  94      * @see #getCompressionLength(String)\r
  95      * @exception ArrayIndexOutOfBoundsException thrown if size of buffer is \r
  96      *            too small for the output.\r
  97      */\r
  98     public static int compress(String source, byte buffer[], int offset) \r
  99     {\r
 100         int prev = 0;\r
 101         UCharacterIterator iterator = UCharacterIterator.getInstance(source);\r
 102         int codepoint = iterator.nextCodePoint();\r
 103         while (codepoint != UCharacterIterator.DONE) {\r
 104             if (prev < 0x4e00 || prev >= 0xa000) {\r
 105                 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;\r
 106             } \r
 107             else {\r
 108                 // Unihan U+4e00..U+9fa5:\r
 109                 // double-bytes down from the upper end\r
 110                 prev = 0x9fff - SLOPE_REACH_POS_2_;\r
 111             }\r
 112         \r
 113             offset = writeDiff(codepoint - prev, buffer, offset);\r
 114             prev = codepoint;\r
 115             codepoint = iterator.nextCodePoint();\r
 116         }\r
 117         return offset;\r
 118     }\r
 119         \r
 120     /** \r
 121      * Return the number of  bytes that compress() would write.\r
 122      * @param source text source string\r
 123      * @return the length of the BOCU result \r
 124      * @see #compress(String, byte[], int)\r
 125      */\r
 126     public static int getCompressionLength(String source) \r
 127     {\r
 128         int prev = 0;\r
 129         int result = 0;\r
 130         UCharacterIterator iterator =  UCharacterIterator.getInstance(source);\r
 131         int codepoint = iterator.nextCodePoint();\r
 132         while (codepoint != UCharacterIterator.DONE) {\r
 133             if (prev < 0x4e00 || prev >= 0xa000) {\r
 134                 prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;\r
 135             } \r
 136             else {\r
 137                 // Unihan U+4e00..U+9fa5:\r
 138                 // double-bytes down from the upper end\r
 139                 prev = 0x9fff - SLOPE_REACH_POS_2_;\r
 140             }\r
 141         \r
 142             codepoint = iterator.nextCodePoint();\r
 143             result += lengthOfDiff(codepoint - prev);\r
 144             prev = codepoint;\r
 145         }\r
 146         return result;\r
 147     }\r
 148 \r
 149     // public setter methods -------------------------------------------------\r
 150         \r
 151     // public getter methods ------------------------------------------------\r
 152             \r
 153     // public other methods -------------------------------------------------\r
 154     \r
 155     // protected constructor ------------------------------------------------\r
 156       \r
 157     // protected data members ------------------------------------------------\r
 158     \r
 159     // protected methods -----------------------------------------------------\r
 160  \r
 161     // private data members --------------------------------------------------\r
 162     \r
 163     /** \r
 164      * Do not use byte values 0, 1, 2 because they are separators in sort keys.\r
 165      */\r
 166     private static final int SLOPE_MIN_ = 3;\r
 167     private static final int SLOPE_MAX_ = 0xff;\r
 168     private static final int SLOPE_MIDDLE_ = 0x81;\r
 169     private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;\r
 170     //private static final int SLOPE_MAX_BYTES_ = 4;\r
 171 \r
 172     /**\r
 173      * Number of lead bytes:\r
 174      * 1        middle byte for 0\r
 175      * 2*80=160 single bytes for !=0\r
 176      * 2*42=84  for double-byte values\r
 177      * 2*3=6    for 3-byte values\r
 178      * 2*1=2    for 4-byte values\r
 179      *\r
 180      * The sum must be <=SLOPE_TAIL_COUNT.\r
 181      *\r
 182      * Why these numbers?\r
 183      * - There should be >=128 single-byte values to cover 128-blocks\r
 184      *   with small scripts.\r
 185      * - There should be >=20902 single/double-byte values to cover Unihan.\r
 186      * - It helps CJK Extension B some if there are 3-byte values that cover\r
 187      *   the distance between them and Unihan.\r
 188      *   This also helps to jump among distant places in the BMP.\r
 189      * - Four-byte values are necessary to cover the rest of Unicode.\r
 190      *\r
 191      * Symmetrical lead byte counts are for convenience.\r
 192      * With an equal distribution of even and odd differences there is also\r
 193      * no advantage to asymmetrical lead byte counts.\r
 194      */\r
 195     private static final int SLOPE_SINGLE_ = 80;\r
 196     private static final int SLOPE_LEAD_2_ = 42;\r
 197     private static final int SLOPE_LEAD_3_ = 3;\r
 198     //private static final int SLOPE_LEAD_4_ = 1;\r
 199 \r
 200     /** \r
 201      * The difference value range for single-byters.\r
 202      */\r
 203     private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;\r
 204     private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);\r
 205 \r
 206     /** \r
 207      * The difference value range for double-byters.\r
 208      */\r
 209     private static final int SLOPE_REACH_POS_2_ = \r
 210         SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;\r
 211     private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);\r
 212 \r
 213     /** \r
 214      * The difference value range for 3-byters.\r
 215      */\r
 216     private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_ \r
 217         * SLOPE_TAIL_COUNT_ \r
 218         * SLOPE_TAIL_COUNT_ \r
 219         + (SLOPE_LEAD_3_ - 1)\r
 220         * SLOPE_TAIL_COUNT_ +\r
 221         (SLOPE_TAIL_COUNT_ - 1);\r
 222     private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);\r
 223 \r
 224     /** \r
 225      * The lead byte start values.\r
 226      */\r
 227     private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_ \r
 228         + SLOPE_SINGLE_ + 1;\r
 229     private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_ \r
 230         + SLOPE_LEAD_2_;\r
 231     private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ + \r
 232         SLOPE_REACH_NEG_1_;\r
 233     private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_\r
 234         - SLOPE_LEAD_2_;\r
 235                                                                                                         \r
 236     // private constructor ---------------------------------------------------\r
 237         \r
 238     /**\r
 239      * Constructor private to prevent initialization\r
 240      */\r
 241     ///CLOVER:OFF\r
 242     private BOCU()\r
 243     {\r
 244     }            \r
 245     ///CLOVER:ON                                                                                       \r
 246     \r
 247     // private methods -------------------------------------------------------\r
 248     \r
 249     /**\r
 250      * Integer division and modulo with negative numerators\r
 251      * yields negative modulo results and quotients that are one more than\r
 252      * what we need here.\r
 253      * @param number which operations are to be performed on\r
 254      * @param factor the factor to use for division\r
 255      * @return (result of division) << 32 | modulo \r
 256      */\r
 257     private static final long getNegDivMod(int number, int factor) \r
 258     {\r
 259         int modulo = number % factor; \r
 260         long result = number / factor;\r
 261         if (modulo < 0) { \r
 262             -- result; \r
 263             modulo += factor; \r
 264         } \r
 265         return (result << 32) | modulo;\r
 266     }\r
 267         \r
 268     /**\r
 269      * Encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes,\r
 270      * preserving lexical order\r
 271      * @param diff\r
 272      * @param buffer byte buffer to append to\r
 273      * @param offset to the byte buffer to start appending\r
 274      * @return end offset where the appending stops\r
 275      */\r
 276     private static final int writeDiff(int diff, byte buffer[], int offset) \r
 277     {\r
 278         if (diff >= SLOPE_REACH_NEG_1_) {\r
 279             if (diff <= SLOPE_REACH_POS_1_) {\r
 280                 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);\r
 281             } \r
 282             else if (diff <= SLOPE_REACH_POS_2_) {\r
 283                 buffer[offset ++] = (byte)(SLOPE_START_POS_2_ \r
 284                                            + (diff / SLOPE_TAIL_COUNT_));\r
 285                 buffer[offset ++] = (byte)(SLOPE_MIN_ + \r
 286                                            (diff % SLOPE_TAIL_COUNT_));\r
 287             } \r
 288             else if (diff <= SLOPE_REACH_POS_3_) {\r
 289                 buffer[offset + 2] = (byte)(SLOPE_MIN_ \r
 290                                             + (diff % SLOPE_TAIL_COUNT_));\r
 291                 diff /= SLOPE_TAIL_COUNT_;\r
 292                 buffer[offset + 1] = (byte)(SLOPE_MIN_ \r
 293                                             + (diff % SLOPE_TAIL_COUNT_));\r
 294                 buffer[offset] = (byte)(SLOPE_START_POS_3_ \r
 295                                         + (diff / SLOPE_TAIL_COUNT_));\r
 296                 offset += 3;\r
 297             } \r
 298             else {\r
 299                 buffer[offset + 3] = (byte)(SLOPE_MIN_ \r
 300                                             + diff % SLOPE_TAIL_COUNT_);\r
 301                 diff /= SLOPE_TAIL_COUNT_;\r
 302                 buffer[offset] = (byte)(SLOPE_MIN_ \r
 303                                         + diff % SLOPE_TAIL_COUNT_);\r
 304                 diff /= SLOPE_TAIL_COUNT_;\r
 305                 buffer[offset + 1] = (byte)(SLOPE_MIN_ \r
 306                                             + diff % SLOPE_TAIL_COUNT_);\r
 307                 buffer[offset] = (byte)SLOPE_MAX_;\r
 308                 offset += 4;\r
 309             }\r
 310         } \r
 311         else {\r
 312             long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);\r
 313             int modulo = (int)division;\r
 314             if (diff >= SLOPE_REACH_NEG_2_) {\r
 315                 diff = (int)(division >> 32);\r
 316                 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);\r
 317                 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);\r
 318             } \r
 319             else if (diff >= SLOPE_REACH_NEG_3_) {\r
 320                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);\r
 321                 diff = (int)(division >> 32);\r
 322                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);\r
 323                 modulo = (int)division;\r
 324                 diff = (int)(division >> 32);\r
 325                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);\r
 326                 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);\r
 327                 offset += 3;\r
 328             } \r
 329             else {\r
 330                 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);\r
 331                 diff = (int)(division >> 32);\r
 332                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);\r
 333                 modulo = (int)division;\r
 334                 diff = (int)(division >> 32);\r
 335                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);\r
 336                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);\r
 337                 modulo = (int)division;\r
 338                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);\r
 339                 buffer[offset] = SLOPE_MIN_;\r
 340                 offset += 4;\r
 341             }\r
 342         }\r
 343         return offset;\r
 344     }\r
 345         \r
 346     /**\r
 347      * How many bytes would writeDiff() write? \r
 348      * @param diff\r
 349      */\r
 350     private static final int lengthOfDiff(int diff) \r
 351     {\r
 352         if (diff >= SLOPE_REACH_NEG_1_) {\r
 353             if (diff <= SLOPE_REACH_POS_1_) {\r
 354                 return 1;\r
 355             } \r
 356             else if (diff <= SLOPE_REACH_POS_2_) {\r
 357                 return 2;\r
 358             } \r
 359             else if(diff <= SLOPE_REACH_POS_3_) {\r
 360                 return 3;\r
 361             } \r
 362             else {\r
 363                 return 4;\r
 364             }\r
 365         } \r
 366         else {\r
 367             if (diff >= SLOPE_REACH_NEG_2_) {\r
 368                 return 2;\r
 369             } \r
 370             else if (diff >= SLOPE_REACH_NEG_3_) {\r
 371                 return 3;\r
 372             } \r
 373             else {\r
 374                 return 4;\r
 375             }\r
 376         }\r
 377     }\r
 378 }\r