jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/impl/IntTrieBuilder.java

   1 /*\r
   2 ******************************************************************************\r
   3 * Copyright (C) 1996-2010, International Business Machines Corporation and   *\r
   4 * others. All Rights Reserved.                                               *\r
   5 ******************************************************************************\r
   6 */\r
   7 \r
   8 package com.ibm.icu.impl;\r
   9 \r
  10 import java.io.DataOutputStream;\r
  11 import java.io.IOException;\r
  12 import java.io.OutputStream;\r
  13 import java.util.Arrays;\r
  14 \r
  15 import com.ibm.icu.lang.UCharacter;\r
  16 import com.ibm.icu.text.UTF16;\r
  17 \r
  18 /**\r
  19  * Builder class to manipulate and generate a trie.\r
  20  * This is useful for ICU data in primitive types.\r
  21  * Provides a compact way to store information that is indexed by Unicode \r
  22  * values, such as character properties, types, keyboard values, etc. This is \r
  23  * very useful when you have a block of Unicode data that contains significant \r
  24  * values while the rest of the Unicode data is unused in the application or \r
  25  * when you have a lot of redundance, such as where all 21,000 Han ideographs \r
  26  * have the same value.  However, lookup is much faster than a hash table.\r
  27  * A trie of any primitive data type serves two purposes:\r
  28  * <UL type = round>\r
  29  *     <LI>Fast access of the indexed values.\r
  30  *     <LI>Smaller memory footprint.\r
  31  * </UL>\r
  32  * This is a direct port from the ICU4C version\r
  33  * @author             Syn Wee Quek\r
  34  */\r
  35 public class IntTrieBuilder extends TrieBuilder\r
  36 {\r
  37     // public constructor ----------------------------------------------\r
  38                 \r
  39     /**\r
  40      * Copy constructor\r
  41      */\r
  42     public IntTrieBuilder(IntTrieBuilder table)\r
  43     {\r
  44         super(table);\r
  45         m_data_ = new int[m_dataCapacity_];\r
  46         System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);\r
  47         m_initialValue_ = table.m_initialValue_;\r
  48         m_leadUnitValue_ = table.m_leadUnitValue_;\r
  49     }\r
  50     \r
  51     /**\r
  52      * Constructs a build table\r
  53      * @param aliasdata data to be filled into table\r
  54      * @param maxdatalength maximum data length allowed in table\r
  55      * @param initialvalue inital data value\r
  56      * @param latin1linear is latin 1 to be linear\r
  57      */\r
  58     public IntTrieBuilder(int aliasdata[], int maxdatalength, \r
  59                           int initialvalue, int leadunitvalue, \r
  60                           boolean latin1linear) \r
  61     {\r
  62         super();\r
  63         if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear \r
  64                                                   && maxdatalength < 1024)) {\r
  65             throw new IllegalArgumentException(\r
  66                                                "Argument maxdatalength is too small");\r
  67         }\r
  68             \r
  69         if (aliasdata != null) {\r
  70             m_data_ = aliasdata;\r
  71         } \r
  72         else {\r
  73             m_data_ = new int[maxdatalength];\r
  74         }\r
  75         \r
  76         // preallocate and reset the first data block (block index 0)\r
  77         int j = DATA_BLOCK_LENGTH;\r
  78         \r
  79         if (latin1linear) {\r
  80             // preallocate and reset the first block (number 0) and Latin-1 \r
  81             // (U+0000..U+00ff) after that made sure above that \r
  82             // maxDataLength >= 1024\r
  83             // set indexes to point to consecutive data blocks\r
  84             int i = 0;\r
  85             do {\r
  86                 // do this at least for trie->index[0] even if that block is \r
  87                 // only partly used for Latin-1\r
  88                 m_index_[i ++] = j;\r
  89                 j += DATA_BLOCK_LENGTH;\r
  90             } while (i < (256 >> SHIFT_));\r
  91         }\r
  92         \r
  93         m_dataLength_ = j;\r
  94         // reset the initially allocated blocks to the initial value\r
  95         Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);\r
  96         m_initialValue_ = initialvalue;\r
  97         m_leadUnitValue_ = leadunitvalue;\r
  98         m_dataCapacity_ = maxdatalength;\r
  99         m_isLatin1Linear_ = latin1linear;\r
 100         m_isCompacted_ = false;\r
 101     }\r
 102 \r
 103     // public methods -------------------------------------------------------\r
 104      \r
 105     /*public final void print()\r
 106       {\r
 107       int i = 0;\r
 108       int oldvalue = m_index_[i];\r
 109       int count = 0;\r
 110       System.out.println("index length " + m_indexLength_ \r
 111       + " --------------------------");\r
 112       while (i < m_indexLength_) {\r
 113       if (m_index_[i] != oldvalue) {\r
 114       System.out.println("index has " + count + " counts of " \r
 115       + Integer.toHexString(oldvalue));\r
 116       count = 0;\r
 117       oldvalue = m_index_[i];\r
 118       }\r
 119       count ++;\r
 120       i ++;\r
 121       }\r
 122       System.out.println("index has " + count + " counts of " \r
 123       + Integer.toHexString(oldvalue));\r
 124       i = 0;\r
 125       oldvalue = m_data_[i];\r
 126       count = 0;\r
 127       System.out.println("data length " + m_dataLength_ \r
 128       + " --------------------------");\r
 129       while (i < m_dataLength_) {\r
 130       if (m_data_[i] != oldvalue) {\r
 131       if ((oldvalue & 0xf1000000) == 0xf1000000) {\r
 132       int temp = oldvalue & 0xffffff; \r
 133       temp += 0x320;\r
 134       oldvalue = 0xf1000000 | temp;\r
 135       }\r
 136       if ((oldvalue & 0xf2000000) == 0xf2000000) {\r
 137       int temp = oldvalue & 0xffffff; \r
 138       temp += 0x14a;\r
 139       oldvalue = 0xf2000000 | temp;\r
 140       }\r
 141       System.out.println("data has " + count + " counts of " \r
 142       + Integer.toHexString(oldvalue));\r
 143       count = 0;\r
 144       oldvalue = m_data_[i];\r
 145       }\r
 146       count ++;\r
 147       i ++;\r
 148       }\r
 149       if ((oldvalue & 0xf1000000) == 0xf1000000) {\r
 150       int temp = oldvalue & 0xffffff; \r
 151       temp += 0x320;\r
 152       oldvalue = 0xf1000000 | temp;\r
 153       }\r
 154       if ((oldvalue & 0xf2000000) == 0xf2000000) {\r
 155       int temp = oldvalue & 0xffffff; \r
 156       temp += 0x14a;\r
 157       oldvalue = 0xf2000000 | temp;\r
 158       }\r
 159       System.out.println("data has " + count + " counts of " \r
 160       + Integer.toHexString(oldvalue));\r
 161       }\r
 162     */   \r
 163     /**\r
 164      * Gets a 32 bit data from the table data\r
 165      * @param ch codepoint which data is to be retrieved\r
 166      * @return the 32 bit data\r
 167      */\r
 168     public int getValue(int ch) \r
 169     {\r
 170         // valid, uncompacted trie and valid c?\r
 171         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {\r
 172             return 0;\r
 173         }\r
 174     \r
 175         int block = m_index_[ch >> SHIFT_];\r
 176         return m_data_[Math.abs(block) + (ch & MASK_)];\r
 177     }\r
 178     \r
 179     /**\r
 180      * Get a 32 bit data from the table data\r
 181      * @param ch  code point for which data is to be retrieved.\r
 182      * @param inBlockZero  Output parameter, inBlockZero[0] returns true if the\r
 183      *                      char maps into block zero, otherwise false.\r
 184      * @return the 32 bit data value.\r
 185      */\r
 186     public int getValue(int ch, boolean [] inBlockZero) \r
 187     {\r
 188         // valid, uncompacted trie and valid c?\r
 189         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {\r
 190             if (inBlockZero != null) {\r
 191                 inBlockZero[0] = true;\r
 192             }\r
 193             return 0;\r
 194         }\r
 195     \r
 196         int block = m_index_[ch >> SHIFT_];\r
 197         if (inBlockZero != null) {\r
 198             inBlockZero[0] = (block == 0);\r
 199         }\r
 200         return m_data_[Math.abs(block) + (ch & MASK_)];\r
 201     }\r
 202     \r
 203     \r
 204     /**\r
 205      * Sets a 32 bit data in the table data\r
 206      * @param ch codepoint which data is to be set\r
 207      * @param value to set\r
 208      * @return true if the set is successful, otherwise \r
 209      *              if the table has been compacted return false\r
 210      */\r
 211     public boolean setValue(int ch, int value) \r
 212     {\r
 213         // valid, uncompacted trie and valid c? \r
 214         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {\r
 215             return false;\r
 216         }\r
 217     \r
 218         int block = getDataBlock(ch);\r
 219         if (block < 0) {\r
 220             return false;\r
 221         }\r
 222     \r
 223         m_data_[block + (ch & MASK_)] = value;\r
 224         return true;\r
 225     }\r
 226     \r
 227     /**\r
 228      * Serializes the build table with 32 bit data\r
 229      * @param datamanipulate builder raw fold method implementation\r
 230      * @param triedatamanipulate result trie fold method\r
 231      * @return a new trie\r
 232      */\r
 233     public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate, \r
 234                              Trie.DataManipulate triedatamanipulate)\r
 235     {\r
 236         if (datamanipulate == null) {\r
 237             throw new IllegalArgumentException("Parameters can not be null");\r
 238         }\r
 239         // fold and compact if necessary, also checks that indexLength is \r
 240         // within limits \r
 241         if (!m_isCompacted_) {\r
 242             // compact once without overlap to improve folding\r
 243             compact(false);\r
 244             // fold the supplementary part of the index array\r
 245             fold(datamanipulate);\r
 246             // compact again with overlap for minimum data array length\r
 247             compact(true);\r
 248             m_isCompacted_ = true;\r
 249         }\r
 250         // is dataLength within limits? \r
 251         if (m_dataLength_ >= MAX_DATA_LENGTH_) {\r
 252             throw new ArrayIndexOutOfBoundsException("Data length too small");\r
 253         }\r
 254     \r
 255         char index[] = new char[m_indexLength_];\r
 256         int data[] = new int[m_dataLength_];\r
 257         // write the index (stage 1) array and the 32-bit data (stage 2) array\r
 258         // write 16-bit index values shifted right by INDEX_SHIFT_ \r
 259         for (int i = 0; i < m_indexLength_; i ++) {\r
 260             index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);\r
 261         }\r
 262         // write 32-bit data values\r
 263         System.arraycopy(m_data_, 0, data, 0, m_dataLength_);\r
 264         \r
 265         int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);\r
 266         options |= OPTIONS_DATA_IS_32_BIT_;\r
 267         if (m_isLatin1Linear_) {\r
 268             options |= OPTIONS_LATIN1_IS_LINEAR_;\r
 269         }\r
 270         return new IntTrie(index, data, m_initialValue_, options, \r
 271                            triedatamanipulate);\r
 272     }\r
 273     \r
 274     \r
 275     /**\r
 276      * Serializes the build table to an output stream.\r
 277      * \r
 278      * Compacts the build-time trie after all values are set, and then\r
 279      * writes the serialized form onto an output stream.\r
 280      * \r
 281      * After this, this build-time Trie can only be serialized again and/or closed;\r
 282      * no further values can be added.\r
 283      * \r
 284      * This function is the rough equivalent of utrie_seriaize() in ICU4C.\r
 285      * \r
 286      * @param os the output stream to which the seriaized trie will be written.\r
 287      *         If nul, the function still returns the size of the serialized Trie.\r
 288      * @param reduceTo16Bits If true, reduce the data size to 16 bits.  The resulting\r
 289      *         serialized form can then be used to create a CharTrie.\r
 290      * @param datamanipulate builder raw fold method implementation\r
 291      * @return the number of bytes written to the output stream.\r
 292      */\r
 293      public int serialize(OutputStream os, boolean reduceTo16Bits,\r
 294             TrieBuilder.DataManipulate datamanipulate)  throws IOException {\r
 295          if (datamanipulate == null) {\r
 296              throw new IllegalArgumentException("Parameters can not be null");\r
 297          }\r
 298 \r
 299          // fold and compact if necessary, also checks that indexLength is \r
 300          // within limits \r
 301          if (!m_isCompacted_) {\r
 302              // compact once without overlap to improve folding\r
 303              compact(false);\r
 304              // fold the supplementary part of the index array\r
 305              fold(datamanipulate);\r
 306              // compact again with overlap for minimum data array length\r
 307              compact(true);\r
 308              m_isCompacted_ = true;\r
 309          }\r
 310          \r
 311          // is dataLength within limits? \r
 312          int length;\r
 313          if (reduceTo16Bits) {\r
 314              length = m_dataLength_ + m_indexLength_;\r
 315          } else {\r
 316              length = m_dataLength_;\r
 317          }\r
 318          if (length >= MAX_DATA_LENGTH_) {\r
 319              throw new ArrayIndexOutOfBoundsException("Data length too small");\r
 320          }\r
 321          \r
 322          //  struct UTrieHeader {\r
 323          //      int32_t   signature;\r
 324          //      int32_t   options  (a bit field)\r
 325          //      int32_t   indexLength\r
 326          //      int32_t   dataLength\r
 327          length = Trie.HEADER_LENGTH_ + 2*m_indexLength_;\r
 328          if(reduceTo16Bits) {\r
 329              length+=2*m_dataLength_;\r
 330          } else {\r
 331              length+=4*m_dataLength_;\r
 332          }\r
 333          \r
 334          if (os == null) {\r
 335              // No output stream.  Just return the length of the serialized Trie, in bytes.\r
 336              return length;\r
 337          }\r
 338 \r
 339          DataOutputStream dos = new DataOutputStream(os);\r
 340          dos.writeInt(Trie.HEADER_SIGNATURE_);  \r
 341          \r
 342          int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<<Trie.HEADER_OPTIONS_INDEX_SHIFT_);\r
 343          if(!reduceTo16Bits) {\r
 344              options |= Trie.HEADER_OPTIONS_DATA_IS_32_BIT_;\r
 345          }\r
 346          if(m_isLatin1Linear_) {\r
 347              options |= Trie.HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_;\r
 348          }\r
 349          dos.writeInt(options);\r
 350          \r
 351          dos.writeInt(m_indexLength_);\r
 352          dos.writeInt(m_dataLength_);\r
 353          \r
 354          /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */\r
 355          if(reduceTo16Bits) {\r
 356              /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */\r
 357              for (int i=0; i<m_indexLength_; i++) {\r
 358                  int v = (m_index_[i] + m_indexLength_) >>> Trie.INDEX_STAGE_2_SHIFT_;\r
 359                  dos.writeChar(v);\r
 360              }\r
 361              \r
 362              /* write 16-bit data values */\r
 363              for(int i=0; i<m_dataLength_; i++) {\r
 364                  int v = m_data_[i] & 0x0000ffff;\r
 365                  dos.writeChar(v);\r
 366              }\r
 367          } else {\r
 368              /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */\r
 369              for (int i=0; i<m_indexLength_; i++) {\r
 370                  int v = (m_index_[i]) >>> Trie.INDEX_STAGE_2_SHIFT_;\r
 371                  dos.writeChar(v);\r
 372              }\r
 373              \r
 374              /* write 32-bit data values */\r
 375              for(int i=0; i<m_dataLength_; i++) {\r
 376                  dos.writeInt(m_data_[i]);\r
 377              }\r
 378         }\r
 379 \r
 380          return length;\r
 381 \r
 382     }\r
 383     \r
 384     \r
 385     /**\r
 386      * Set a value in a range of code points [start..limit].\r
 387      * All code points c with start &lt;= c &lt; limit will get the value if\r
 388      * overwrite is true or if the old value is 0.\r
 389      * @param start the first code point to get the value\r
 390      * @param limit one past the last code point to get the value\r
 391      * @param value the value\r
 392      * @param overwrite flag for whether old non-initial values are to be \r
 393      *        overwritten\r
 394      * @return false if a failure occurred (illegal argument or data array \r
 395      *               overrun)\r
 396      */\r
 397     public boolean setRange(int start, int limit, int value, \r
 398                             boolean overwrite) \r
 399     {\r
 400         // repeat value in [start..limit[\r
 401         // mark index values for repeat-data blocks by setting bit 31 of the \r
 402         // index values fill around existing values if any, if(overwrite)\r
 403             \r
 404         // valid, uncompacted trie and valid indexes?\r
 405         if (m_isCompacted_ || start < UCharacter.MIN_VALUE \r
 406             || start > UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE\r
 407             || limit > (UCharacter.MAX_VALUE + 1) || start > limit) {\r
 408             return false;\r
 409         }\r
 410             \r
 411         if (start == limit) {\r
 412             return true; // nothing to do\r
 413         }\r
 414         \r
 415         if ((start & MASK_) != 0) {\r
 416             // set partial block at [start..following block boundary[\r
 417             int block = getDataBlock(start);\r
 418             if (block < 0) {\r
 419                 return false;\r
 420             }\r
 421         \r
 422             int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;\r
 423             if (nextStart <= limit) {\r
 424                 fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH,\r
 425                           value, overwrite);\r
 426                 start = nextStart;\r
 427             } \r
 428             else {\r
 429                 fillBlock(block, start & MASK_, limit & MASK_,\r
 430                           value, overwrite);\r
 431                 return true;\r
 432             }\r
 433         }\r
 434         \r
 435         // number of positions in the last, partial block\r
 436         int rest = limit & MASK_;\r
 437         \r
 438         // round down limit to a block boundary \r
 439         limit &= ~MASK_;\r
 440         \r
 441         // iterate over all-value blocks \r
 442         int repeatBlock = 0;\r
 443         if (value == m_initialValue_) {\r
 444             // repeatBlock = 0; assigned above\r
 445         } \r
 446         else {\r
 447             repeatBlock = -1;\r
 448         }\r
 449         while (start < limit) {\r
 450             // get index value \r
 451             int block = m_index_[start >> SHIFT_];\r
 452             if (block > 0) {\r
 453                 // already allocated, fill in value\r
 454                 fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);\r
 455             } \r
 456             else if (m_data_[-block] != value && (block == 0 || overwrite)) {\r
 457                 // set the repeatBlock instead of the current block 0 or range \r
 458                 // block \r
 459                 if (repeatBlock >= 0) {\r
 460                     m_index_[start >> SHIFT_] = -repeatBlock;\r
 461                 } \r
 462                 else {\r
 463                     // create and set and fill the repeatBlock\r
 464                     repeatBlock = getDataBlock(start);\r
 465                     if (repeatBlock < 0) {\r
 466                         return false;\r
 467                     }\r
 468         \r
 469                     // set the negative block number to indicate that it is a \r
 470                     // repeat block\r
 471                     m_index_[start >> SHIFT_] = -repeatBlock;\r
 472                     fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);\r
 473                 }\r
 474             }\r
 475         \r
 476             start += DATA_BLOCK_LENGTH;\r
 477         }\r
 478         \r
 479         if (rest > 0) {\r
 480             // set partial block at [last block boundary..limit[\r
 481             int block = getDataBlock(start);\r
 482             if (block < 0) {\r
 483                 return false;\r
 484             }\r
 485             fillBlock(block, 0, rest, value, overwrite);\r
 486         }\r
 487         \r
 488         return true;\r
 489     }\r
 490     \r
 491     // protected data member ------------------------------------------------\r
 492                 \r
 493     protected int m_data_[];\r
 494     protected int m_initialValue_;  \r
 495     \r
 496     //  private data member ------------------------------------------------\r
 497         \r
 498     private int m_leadUnitValue_;  \r
 499      \r
 500     // private methods ------------------------------------------------------\r
 501    \r
 502     private int allocDataBlock() \r
 503     {\r
 504         int newBlock = m_dataLength_;\r
 505         int newTop = newBlock + DATA_BLOCK_LENGTH;\r
 506         if (newTop > m_dataCapacity_) {\r
 507             // out of memory in the data array\r
 508             return -1;\r
 509         }\r
 510         m_dataLength_ = newTop;\r
 511         return newBlock;\r
 512     }\r
 513 \r
 514     /**\r
 515      * No error checking for illegal arguments.\r
 516      * @param ch codepoint to look for\r
 517      * @return -1 if no new data block available (out of memory in data array)\r
 518      */\r
 519     private int getDataBlock(int ch) \r
 520     {\r
 521         ch >>= SHIFT_;\r
 522         int indexValue = m_index_[ch];\r
 523         if (indexValue > 0) {\r
 524             return indexValue;\r
 525         }\r
 526     \r
 527         // allocate a new data block\r
 528         int newBlock = allocDataBlock();\r
 529         if (newBlock < 0) {\r
 530             // out of memory in the data array \r
 531             return -1;\r
 532         }\r
 533         m_index_[ch] = newBlock;\r
 534     \r
 535         // copy-on-write for a block from a setRange()\r
 536         System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,  \r
 537                          DATA_BLOCK_LENGTH << 2);\r
 538         return newBlock;\r
 539     }\r
 540     \r
 541     /**\r
 542      * Compact a folded build-time trie.\r
 543      * The compaction\r
 544      * - removes blocks that are identical with earlier ones\r
 545      * - overlaps adjacent blocks as much as possible (if overlap == true)\r
 546      * - moves blocks in steps of the data granularity\r
 547      * - moves and overlaps blocks that overlap with multiple values in the overlap region\r
 548      *\r
 549      * It does not\r
 550      * - try to move and overlap blocks that are not already adjacent\r
 551      * @param overlap flag\r
 552      */\r
 553     private void compact(boolean overlap) \r
 554     {\r
 555         if (m_isCompacted_) {\r
 556             return; // nothing left to do\r
 557         }\r
 558     \r
 559         // compaction\r
 560         // initialize the index map with "block is used/unused" flags\r
 561         findUnusedBlocks();\r
 562         \r
 563         // if Latin-1 is preallocated and linear, then do not compact Latin-1 \r
 564         // data\r
 565         int overlapStart = DATA_BLOCK_LENGTH;\r
 566         if (m_isLatin1Linear_ && SHIFT_ <= 8) {\r
 567             overlapStart += 256;\r
 568         }\r
 569        \r
 570         int newStart = DATA_BLOCK_LENGTH;\r
 571         int i;\r
 572         for (int start = newStart; start < m_dataLength_;) {\r
 573             // start: index of first entry of current block\r
 574             // newStart: index where the current block is to be moved\r
 575             //           (right after current end of already-compacted data)\r
 576             // skip blocks that are not used \r
 577             if (m_map_[start >>> SHIFT_] < 0) {\r
 578                 // advance start to the next block \r
 579                 start += DATA_BLOCK_LENGTH;\r
 580                 // leave newStart with the previous block!\r
 581                 continue;\r
 582             }\r
 583             // search for an identical block\r
 584             if (start >= overlapStart) {\r
 585                 i = findSameDataBlock(m_data_, newStart, start,\r
 586                                           overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);\r
 587                 if (i >= 0) {\r
 588                     // found an identical block, set the other block's index \r
 589                     // value for the current block\r
 590                     m_map_[start >>> SHIFT_] = i;\r
 591                     // advance start to the next block\r
 592                     start += DATA_BLOCK_LENGTH;\r
 593                     // leave newStart with the previous block!\r
 594                     continue;\r
 595                 }\r
 596             }\r
 597             // see if the beginning of this block can be overlapped with the \r
 598             // end of the previous block\r
 599             if(overlap && start>=overlapStart) {\r
 600                 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */\r
 601                 for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_;\r
 602                     i>0 && !equal_int(m_data_, newStart-i, start, i);\r
 603                     i-=DATA_GRANULARITY_) {}\r
 604             } else {\r
 605                 i=0;\r
 606             }\r
 607             if (i > 0) {\r
 608                 // some overlap\r
 609                 m_map_[start >>> SHIFT_] = newStart - i;\r
 610                 // move the non-overlapping indexes to their new positions\r
 611                 start += i;\r
 612                 for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {\r
 613                     m_data_[newStart ++] = m_data_[start ++];\r
 614                 }\r
 615             } \r
 616             else if (newStart < start) {\r
 617                 // no overlap, just move the indexes to their new positions\r
 618                 m_map_[start >>> SHIFT_] = newStart;\r
 619                 for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {\r
 620                     m_data_[newStart ++] = m_data_[start ++];\r
 621                 }\r
 622             } \r
 623             else { // no overlap && newStart==start\r
 624                 m_map_[start >>> SHIFT_] = start;\r
 625                 newStart += DATA_BLOCK_LENGTH;\r
 626                 start = newStart;\r
 627             }\r
 628         }\r
 629         // now adjust the index (stage 1) table\r
 630         for (i = 0; i < m_indexLength_; ++ i) {\r
 631             m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];\r
 632         }\r
 633         m_dataLength_ = newStart;\r
 634     }\r
 635 \r
 636     /**\r
 637      * Find the same data block\r
 638      * @param data array\r
 639      * @param dataLength\r
 640      * @param otherBlock\r
 641      * @param step\r
 642      */\r
 643     private static final int findSameDataBlock(int data[], int dataLength,\r
 644                                                int otherBlock, int step) \r
 645     {\r
 646         // ensure that we do not even partially get past dataLength\r
 647         dataLength -= DATA_BLOCK_LENGTH;\r
 648 \r
 649         for (int block = 0; block <= dataLength; block += step) {\r
 650             if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) {\r
 651                 return block;\r
 652             }\r
 653         }\r
 654         return -1;\r
 655     }\r
 656     \r
 657     /**\r
 658      * Fold the normalization data for supplementary code points into\r
 659      * a compact area on top of the BMP-part of the trie index,\r
 660      * with the lead surrogates indexing this compact area.\r
 661      *\r
 662      * Duplicate the index values for lead surrogates:\r
 663      * From inside the BMP area, where some may be overridden with folded values,\r
 664      * to just after the BMP area, where they can be retrieved for\r
 665      * code point lookups.\r
 666      * @param manipulate fold implementation\r
 667      */\r
 668     private final void fold(DataManipulate manipulate) \r
 669     {\r
 670         int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];\r
 671         int index[] = m_index_;\r
 672         // copy the lead surrogate indexes into a temporary array\r
 673         System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0, \r
 674                          SURROGATE_BLOCK_COUNT_);\r
 675         \r
 676         // set all values for lead surrogate code *units* to leadUnitValue\r
 677         // so that by default runtime lookups will find no data for associated\r
 678         // supplementary code points, unless there is data for such code points\r
 679         // which will result in a non-zero folding value below that is set for\r
 680         // the respective lead units\r
 681         // the above saved the indexes for surrogate code *points*\r
 682         // fill the indexes with simplified code from utrie_setRange32()\r
 683         int block = 0;\r
 684         if (m_leadUnitValue_ == m_initialValue_) {\r
 685             // leadUnitValue == initialValue, use all-initial-value block\r
 686             // block = 0; if block here left empty\r
 687         } \r
 688         else {\r
 689             // create and fill the repeatBlock\r
 690             block = allocDataBlock();\r
 691             if (block < 0) {\r
 692                 // data table overflow\r
 693                 throw new IllegalStateException("Internal error: Out of memory space");\r
 694             }\r
 695             fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);\r
 696             // negative block number to indicate that it is a repeat block\r
 697             block = -block; \r
 698         }\r
 699         for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {\r
 700             m_index_[c] = block;\r
 701         }\r
 702 \r
 703         // Fold significant index values into the area just after the BMP \r
 704         // indexes.\r
 705         // In case the first lead surrogate has significant data,\r
 706         // its index block must be used first (in which case the folding is a \r
 707         // no-op).\r
 708         // Later all folded index blocks are moved up one to insert the copied\r
 709         // lead surrogate indexes.\r
 710         int indexLength = BMP_INDEX_LENGTH_;\r
 711         // search for any index (stage 1) entries for supplementary code points \r
 712         for (int c = 0x10000; c < 0x110000;) {\r
 713             if (index[c >> SHIFT_] != 0) {\r
 714                 // there is data, treat the full block for a lead surrogate\r
 715                 c &= ~0x3ff;\r
 716                 // is there an identical index block?\r
 717                 block = findSameIndexBlock(index, indexLength, c >> SHIFT_);\r
 718                 \r
 719                 // get a folded value for [c..c+0x400[ and,\r
 720                 // if different from the value for the lead surrogate code \r
 721                 // point, set it for the lead surrogate code unit\r
 722 \r
 723                 int value = manipulate.getFoldedValue(c, \r
 724                                                       block + SURROGATE_BLOCK_COUNT_);\r
 725                 if (value != getValue(UTF16.getLeadSurrogate(c))) {\r
 726                     if (!setValue(UTF16.getLeadSurrogate(c), value)) {\r
 727                         // data table overflow \r
 728                         throw new ArrayIndexOutOfBoundsException(\r
 729                                                                  "Data table overflow");\r
 730                     }\r
 731                     // if we did not find an identical index block...\r
 732                     if (block == indexLength) {\r
 733                         // move the actual index (stage 1) entries from the \r
 734                         // supplementary position to the new one\r
 735                         System.arraycopy(index, c >> SHIFT_, index, indexLength,\r
 736                                          SURROGATE_BLOCK_COUNT_);\r
 737                         indexLength += SURROGATE_BLOCK_COUNT_;\r
 738                     }\r
 739                 }\r
 740                 c += 0x400;\r
 741             } \r
 742             else {\r
 743                 c += DATA_BLOCK_LENGTH;\r
 744             }\r
 745         }\r
 746         \r
 747         // index array overflow?\r
 748         // This is to guarantee that a folding offset is of the form\r
 749         // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.\r
 750         // If the index is too large, then n>=1024 and more than 10 bits are \r
 751         // necessary.\r
 752         // In fact, it can only ever become n==1024 with completely unfoldable \r
 753         // data and the additional block of duplicated values for lead \r
 754         // surrogates.\r
 755         if (indexLength >= MAX_INDEX_LENGTH_) {\r
 756             throw new ArrayIndexOutOfBoundsException("Index table overflow");\r
 757         }\r
 758         // make space for the lead surrogate index block and insert it between \r
 759         // the BMP indexes and the folded ones\r
 760         System.arraycopy(index, BMP_INDEX_LENGTH_, index, \r
 761                          BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,\r
 762                          indexLength - BMP_INDEX_LENGTH_);\r
 763         System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,\r
 764                          SURROGATE_BLOCK_COUNT_);\r
 765         indexLength += SURROGATE_BLOCK_COUNT_;\r
 766         m_indexLength_ = indexLength;\r
 767     }\r
 768     \r
 769     /**\r
 770      * @internal\r
 771      */\r
 772     private void fillBlock(int block, int start, int limit, int value, \r
 773                            boolean overwrite) \r
 774     {\r
 775         limit += block;\r
 776         block += start;\r
 777         if (overwrite) {\r
 778             while (block < limit) {\r
 779                 m_data_[block ++] = value;\r
 780             }\r
 781         } \r
 782         else {\r
 783             while (block < limit) {\r
 784                 if (m_data_[block] == m_initialValue_) {\r
 785                     m_data_[block] = value;\r
 786                 }\r
 787                 ++ block;\r
 788             }\r
 789         }\r
 790     }\r
 791 }\r
 792 \r