2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
8 package com.ibm.icu.impl;
10 import java.io.DataOutputStream;
11 import java.io.IOException;
12 import java.io.OutputStream;
13 import java.util.Arrays;
15 import com.ibm.icu.lang.UCharacter;
16 import com.ibm.icu.text.UTF16;
19 * Builder class to manipulate and generate a trie.
20 * This is useful for ICU data in primitive types.
21 * Provides a compact way to store information that is indexed by Unicode
22 * values, such as character properties, types, keyboard values, etc. This is
23 * very useful when you have a block of Unicode data that contains significant
24 * values while the rest of the Unicode data is unused in the application or
25 * when you have a lot of redundance, such as where all 21,000 Han ideographs
26 * have the same value. However, lookup is much faster than a hash table.
27 * A trie of any primitive data type serves two purposes:
29 * <LI>Fast access of the indexed values.
30 * <LI>Smaller memory footprint.
32 * This is a direct port from the ICU4C version
33 * @author Syn Wee Quek
35 public class IntTrieBuilder extends TrieBuilder
37 // public constructor ----------------------------------------------
42 public IntTrieBuilder(IntTrieBuilder table)
45 m_data_ = new int[m_dataCapacity_];
46 System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
47 m_initialValue_ = table.m_initialValue_;
48 m_leadUnitValue_ = table.m_leadUnitValue_;
52 * Constructs a build table
53 * @param aliasdata data to be filled into table
54 * @param maxdatalength maximum data length allowed in table
55 * @param initialvalue inital data value
56 * @param latin1linear is latin 1 to be linear
58 public IntTrieBuilder(int aliasdata[], int maxdatalength,
59 int initialvalue, int leadunitvalue,
63 if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear
64 && maxdatalength < 1024)) {
65 throw new IllegalArgumentException(
66 "Argument maxdatalength is too small");
69 if (aliasdata != null) {
73 m_data_ = new int[maxdatalength];
76 // preallocate and reset the first data block (block index 0)
77 int j = DATA_BLOCK_LENGTH;
80 // preallocate and reset the first block (number 0) and Latin-1
81 // (U+0000..U+00ff) after that made sure above that
82 // maxDataLength >= 1024
83 // set indexes to point to consecutive data blocks
86 // do this at least for trie->index[0] even if that block is
87 // only partly used for Latin-1
89 j += DATA_BLOCK_LENGTH;
90 } while (i < (256 >> SHIFT_));
94 // reset the initially allocated blocks to the initial value
95 Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
96 m_initialValue_ = initialvalue;
97 m_leadUnitValue_ = leadunitvalue;
98 m_dataCapacity_ = maxdatalength;
99 m_isLatin1Linear_ = latin1linear;
100 m_isCompacted_ = false;
103 // public methods -------------------------------------------------------
105 /*public final void print()
108 int oldvalue = m_index_[i];
110 System.out.println("index length " + m_indexLength_
111 + " --------------------------");
112 while (i < m_indexLength_) {
113 if (m_index_[i] != oldvalue) {
114 System.out.println("index has " + count + " counts of "
115 + Integer.toHexString(oldvalue));
117 oldvalue = m_index_[i];
122 System.out.println("index has " + count + " counts of "
123 + Integer.toHexString(oldvalue));
125 oldvalue = m_data_[i];
127 System.out.println("data length " + m_dataLength_
128 + " --------------------------");
129 while (i < m_dataLength_) {
130 if (m_data_[i] != oldvalue) {
131 if ((oldvalue & 0xf1000000) == 0xf1000000) {
132 int temp = oldvalue & 0xffffff;
134 oldvalue = 0xf1000000 | temp;
136 if ((oldvalue & 0xf2000000) == 0xf2000000) {
137 int temp = oldvalue & 0xffffff;
139 oldvalue = 0xf2000000 | temp;
141 System.out.println("data has " + count + " counts of "
142 + Integer.toHexString(oldvalue));
144 oldvalue = m_data_[i];
149 if ((oldvalue & 0xf1000000) == 0xf1000000) {
150 int temp = oldvalue & 0xffffff;
152 oldvalue = 0xf1000000 | temp;
154 if ((oldvalue & 0xf2000000) == 0xf2000000) {
155 int temp = oldvalue & 0xffffff;
157 oldvalue = 0xf2000000 | temp;
159 System.out.println("data has " + count + " counts of "
160 + Integer.toHexString(oldvalue));
164 * Gets a 32 bit data from the table data
165 * @param ch codepoint which data is to be retrieved
166 * @return the 32 bit data
168 public int getValue(int ch)
170 // valid, uncompacted trie and valid c?
171 if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
175 int block = m_index_[ch >> SHIFT_];
176 return m_data_[Math.abs(block) + (ch & MASK_)];
180 * Get a 32 bit data from the table data
181 * @param ch code point for which data is to be retrieved.
182 * @param inBlockZero Output parameter, inBlockZero[0] returns true if the
183 * char maps into block zero, otherwise false.
184 * @return the 32 bit data value.
186 public int getValue(int ch, boolean [] inBlockZero)
188 // valid, uncompacted trie and valid c?
189 if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
190 if (inBlockZero != null) {
191 inBlockZero[0] = true;
196 int block = m_index_[ch >> SHIFT_];
197 if (inBlockZero != null) {
198 inBlockZero[0] = (block == 0);
200 return m_data_[Math.abs(block) + (ch & MASK_)];
205 * Sets a 32 bit data in the table data
206 * @param ch codepoint which data is to be set
207 * @param value to set
208 * @return true if the set is successful, otherwise
209 * if the table has been compacted return false
211 public boolean setValue(int ch, int value)
213 // valid, uncompacted trie and valid c?
214 if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
218 int block = getDataBlock(ch);
223 m_data_[block + (ch & MASK_)] = value;
228 * Serializes the build table with 32 bit data
229 * @param datamanipulate builder raw fold method implementation
230 * @param triedatamanipulate result trie fold method
233 public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate,
234 Trie.DataManipulate triedatamanipulate)
236 if (datamanipulate == null) {
237 throw new IllegalArgumentException("Parameters can not be null");
239 // fold and compact if necessary, also checks that indexLength is
241 if (!m_isCompacted_) {
242 // compact once without overlap to improve folding
244 // fold the supplementary part of the index array
245 fold(datamanipulate);
246 // compact again with overlap for minimum data array length
248 m_isCompacted_ = true;
250 // is dataLength within limits?
251 if (m_dataLength_ >= MAX_DATA_LENGTH_) {
252 throw new ArrayIndexOutOfBoundsException("Data length too small");
255 char index[] = new char[m_indexLength_];
256 int data[] = new int[m_dataLength_];
257 // write the index (stage 1) array and the 32-bit data (stage 2) array
258 // write 16-bit index values shifted right by INDEX_SHIFT_
259 for (int i = 0; i < m_indexLength_; i ++) {
260 index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
262 // write 32-bit data values
263 System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
265 int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
266 options |= OPTIONS_DATA_IS_32_BIT_;
267 if (m_isLatin1Linear_) {
268 options |= OPTIONS_LATIN1_IS_LINEAR_;
270 return new IntTrie(index, data, m_initialValue_, options,
276 * Serializes the build table to an output stream.
278 * Compacts the build-time trie after all values are set, and then
279 * writes the serialized form onto an output stream.
281 * After this, this build-time Trie can only be serialized again and/or closed;
282 * no further values can be added.
284 * This function is the rough equivalent of utrie_seriaize() in ICU4C.
286 * @param os the output stream to which the seriaized trie will be written.
287 * If nul, the function still returns the size of the serialized Trie.
288 * @param reduceTo16Bits If true, reduce the data size to 16 bits. The resulting
289 * serialized form can then be used to create a CharTrie.
290 * @param datamanipulate builder raw fold method implementation
291 * @return the number of bytes written to the output stream.
293 public int serialize(OutputStream os, boolean reduceTo16Bits,
294 TrieBuilder.DataManipulate datamanipulate) throws IOException {
295 if (datamanipulate == null) {
296 throw new IllegalArgumentException("Parameters can not be null");
299 // fold and compact if necessary, also checks that indexLength is
301 if (!m_isCompacted_) {
302 // compact once without overlap to improve folding
304 // fold the supplementary part of the index array
305 fold(datamanipulate);
306 // compact again with overlap for minimum data array length
308 m_isCompacted_ = true;
311 // is dataLength within limits?
313 if (reduceTo16Bits) {
314 length = m_dataLength_ + m_indexLength_;
316 length = m_dataLength_;
318 if (length >= MAX_DATA_LENGTH_) {
319 throw new ArrayIndexOutOfBoundsException("Data length too small");
322 // struct UTrieHeader {
323 // int32_t signature;
324 // int32_t options (a bit field)
325 // int32_t indexLength
326 // int32_t dataLength
327 length = Trie.HEADER_LENGTH_ + 2*m_indexLength_;
329 length+=2*m_dataLength_;
331 length+=4*m_dataLength_;
335 // No output stream. Just return the length of the serialized Trie, in bytes.
339 DataOutputStream dos = new DataOutputStream(os);
340 dos.writeInt(Trie.HEADER_SIGNATURE_);
342 int options = Trie.INDEX_STAGE_1_SHIFT_ | (Trie.INDEX_STAGE_2_SHIFT_<<Trie.HEADER_OPTIONS_INDEX_SHIFT_);
343 if(!reduceTo16Bits) {
344 options |= Trie.HEADER_OPTIONS_DATA_IS_32_BIT_;
346 if(m_isLatin1Linear_) {
347 options |= Trie.HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_;
349 dos.writeInt(options);
351 dos.writeInt(m_indexLength_);
352 dos.writeInt(m_dataLength_);
354 /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
356 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
357 for (int i=0; i<m_indexLength_; i++) {
358 int v = (m_index_[i] + m_indexLength_) >>> Trie.INDEX_STAGE_2_SHIFT_;
362 /* write 16-bit data values */
363 for(int i=0; i<m_dataLength_; i++) {
364 int v = m_data_[i] & 0x0000ffff;
368 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
369 for (int i=0; i<m_indexLength_; i++) {
370 int v = (m_index_[i]) >>> Trie.INDEX_STAGE_2_SHIFT_;
374 /* write 32-bit data values */
375 for(int i=0; i<m_dataLength_; i++) {
376 dos.writeInt(m_data_[i]);
386 * Set a value in a range of code points [start..limit].
387 * All code points c with start <= c < limit will get the value if
388 * overwrite is true or if the old value is 0.
389 * @param start the first code point to get the value
390 * @param limit one past the last code point to get the value
391 * @param value the value
392 * @param overwrite flag for whether old non-initial values are to be
394 * @return false if a failure occurred (illegal argument or data array
397 public boolean setRange(int start, int limit, int value,
400 // repeat value in [start..limit[
401 // mark index values for repeat-data blocks by setting bit 31 of the
402 // index values fill around existing values if any, if(overwrite)
404 // valid, uncompacted trie and valid indexes?
405 if (m_isCompacted_ || start < UCharacter.MIN_VALUE
406 || start > UCharacter.MAX_VALUE || limit < UCharacter.MIN_VALUE
407 || limit > (UCharacter.MAX_VALUE + 1) || start > limit) {
411 if (start == limit) {
412 return true; // nothing to do
415 if ((start & MASK_) != 0) {
416 // set partial block at [start..following block boundary[
417 int block = getDataBlock(start);
422 int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;
423 if (nextStart <= limit) {
424 fillBlock(block, start & MASK_, DATA_BLOCK_LENGTH,
429 fillBlock(block, start & MASK_, limit & MASK_,
435 // number of positions in the last, partial block
436 int rest = limit & MASK_;
438 // round down limit to a block boundary
441 // iterate over all-value blocks
443 if (value == m_initialValue_) {
444 // repeatBlock = 0; assigned above
449 while (start < limit) {
451 int block = m_index_[start >> SHIFT_];
453 // already allocated, fill in value
454 fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);
456 else if (m_data_[-block] != value && (block == 0 || overwrite)) {
457 // set the repeatBlock instead of the current block 0 or range
459 if (repeatBlock >= 0) {
460 m_index_[start >> SHIFT_] = -repeatBlock;
463 // create and set and fill the repeatBlock
464 repeatBlock = getDataBlock(start);
465 if (repeatBlock < 0) {
469 // set the negative block number to indicate that it is a
471 m_index_[start >> SHIFT_] = -repeatBlock;
472 fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);
476 start += DATA_BLOCK_LENGTH;
480 // set partial block at [last block boundary..limit[
481 int block = getDataBlock(start);
485 fillBlock(block, 0, rest, value, overwrite);
491 // protected data member ------------------------------------------------
493 protected int m_data_[];
494 protected int m_initialValue_;
496 // private data member ------------------------------------------------
498 private int m_leadUnitValue_;
500 // private methods ------------------------------------------------------
502 private int allocDataBlock()
504 int newBlock = m_dataLength_;
505 int newTop = newBlock + DATA_BLOCK_LENGTH;
506 if (newTop > m_dataCapacity_) {
507 // out of memory in the data array
510 m_dataLength_ = newTop;
515 * No error checking for illegal arguments.
516 * @param ch codepoint to look for
517 * @return -1 if no new data block available (out of memory in data array)
519 private int getDataBlock(int ch)
522 int indexValue = m_index_[ch];
523 if (indexValue > 0) {
527 // allocate a new data block
528 int newBlock = allocDataBlock();
530 // out of memory in the data array
533 m_index_[ch] = newBlock;
535 // copy-on-write for a block from a setRange()
536 System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,
537 DATA_BLOCK_LENGTH << 2);
542 * Compact a folded build-time trie.
544 * - removes blocks that are identical with earlier ones
545 * - overlaps adjacent blocks as much as possible (if overlap == true)
546 * - moves blocks in steps of the data granularity
547 * - moves and overlaps blocks that overlap with multiple values in the overlap region
550 * - try to move and overlap blocks that are not already adjacent
551 * @param overlap flag
553 private void compact(boolean overlap)
555 if (m_isCompacted_) {
556 return; // nothing left to do
560 // initialize the index map with "block is used/unused" flags
563 // if Latin-1 is preallocated and linear, then do not compact Latin-1
565 int overlapStart = DATA_BLOCK_LENGTH;
566 if (m_isLatin1Linear_ && SHIFT_ <= 8) {
570 int newStart = DATA_BLOCK_LENGTH;
572 for (int start = newStart; start < m_dataLength_;) {
573 // start: index of first entry of current block
574 // newStart: index where the current block is to be moved
575 // (right after current end of already-compacted data)
576 // skip blocks that are not used
577 if (m_map_[start >>> SHIFT_] < 0) {
578 // advance start to the next block
579 start += DATA_BLOCK_LENGTH;
580 // leave newStart with the previous block!
583 // search for an identical block
584 if (start >= overlapStart) {
585 i = findSameDataBlock(m_data_, newStart, start,
586 overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);
588 // found an identical block, set the other block's index
589 // value for the current block
590 m_map_[start >>> SHIFT_] = i;
591 // advance start to the next block
592 start += DATA_BLOCK_LENGTH;
593 // leave newStart with the previous block!
597 // see if the beginning of this block can be overlapped with the
598 // end of the previous block
599 if(overlap && start>=overlapStart) {
600 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
601 for(i=DATA_BLOCK_LENGTH-DATA_GRANULARITY_;
602 i>0 && !equal_int(m_data_, newStart-i, start, i);
603 i-=DATA_GRANULARITY_) {}
609 m_map_[start >>> SHIFT_] = newStart - i;
610 // move the non-overlapping indexes to their new positions
612 for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {
613 m_data_[newStart ++] = m_data_[start ++];
616 else if (newStart < start) {
617 // no overlap, just move the indexes to their new positions
618 m_map_[start >>> SHIFT_] = newStart;
619 for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {
620 m_data_[newStart ++] = m_data_[start ++];
623 else { // no overlap && newStart==start
624 m_map_[start >>> SHIFT_] = start;
625 newStart += DATA_BLOCK_LENGTH;
629 // now adjust the index (stage 1) table
630 for (i = 0; i < m_indexLength_; ++ i) {
631 m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];
633 m_dataLength_ = newStart;
637 * Find the same data block
643 private static final int findSameDataBlock(int data[], int dataLength,
644 int otherBlock, int step)
646 // ensure that we do not even partially get past dataLength
647 dataLength -= DATA_BLOCK_LENGTH;
649 for (int block = 0; block <= dataLength; block += step) {
650 if(equal_int(data, block, otherBlock, DATA_BLOCK_LENGTH)) {
658 * Fold the normalization data for supplementary code points into
659 * a compact area on top of the BMP-part of the trie index,
660 * with the lead surrogates indexing this compact area.
662 * Duplicate the index values for lead surrogates:
663 * From inside the BMP area, where some may be overridden with folded values,
664 * to just after the BMP area, where they can be retrieved for
665 * code point lookups.
666 * @param manipulate fold implementation
668 private final void fold(DataManipulate manipulate)
670 int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
671 int index[] = m_index_;
672 // copy the lead surrogate indexes into a temporary array
673 System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
674 SURROGATE_BLOCK_COUNT_);
676 // set all values for lead surrogate code *units* to leadUnitValue
677 // so that by default runtime lookups will find no data for associated
678 // supplementary code points, unless there is data for such code points
679 // which will result in a non-zero folding value below that is set for
680 // the respective lead units
681 // the above saved the indexes for surrogate code *points*
682 // fill the indexes with simplified code from utrie_setRange32()
684 if (m_leadUnitValue_ == m_initialValue_) {
685 // leadUnitValue == initialValue, use all-initial-value block
686 // block = 0; if block here left empty
689 // create and fill the repeatBlock
690 block = allocDataBlock();
692 // data table overflow
693 throw new IllegalStateException("Internal error: Out of memory space");
695 fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);
696 // negative block number to indicate that it is a repeat block
699 for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {
703 // Fold significant index values into the area just after the BMP
705 // In case the first lead surrogate has significant data,
706 // its index block must be used first (in which case the folding is a
708 // Later all folded index blocks are moved up one to insert the copied
709 // lead surrogate indexes.
710 int indexLength = BMP_INDEX_LENGTH_;
711 // search for any index (stage 1) entries for supplementary code points
712 for (int c = 0x10000; c < 0x110000;) {
713 if (index[c >> SHIFT_] != 0) {
714 // there is data, treat the full block for a lead surrogate
716 // is there an identical index block?
717 block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
719 // get a folded value for [c..c+0x400[ and,
720 // if different from the value for the lead surrogate code
721 // point, set it for the lead surrogate code unit
723 int value = manipulate.getFoldedValue(c,
724 block + SURROGATE_BLOCK_COUNT_);
725 if (value != getValue(UTF16.getLeadSurrogate(c))) {
726 if (!setValue(UTF16.getLeadSurrogate(c), value)) {
727 // data table overflow
728 throw new ArrayIndexOutOfBoundsException(
729 "Data table overflow");
731 // if we did not find an identical index block...
732 if (block == indexLength) {
733 // move the actual index (stage 1) entries from the
734 // supplementary position to the new one
735 System.arraycopy(index, c >> SHIFT_, index, indexLength,
736 SURROGATE_BLOCK_COUNT_);
737 indexLength += SURROGATE_BLOCK_COUNT_;
743 c += DATA_BLOCK_LENGTH;
747 // index array overflow?
748 // This is to guarantee that a folding offset is of the form
749 // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
750 // If the index is too large, then n>=1024 and more than 10 bits are
752 // In fact, it can only ever become n==1024 with completely unfoldable
753 // data and the additional block of duplicated values for lead
755 if (indexLength >= MAX_INDEX_LENGTH_) {
756 throw new ArrayIndexOutOfBoundsException("Index table overflow");
758 // make space for the lead surrogate index block and insert it between
759 // the BMP indexes and the folded ones
760 System.arraycopy(index, BMP_INDEX_LENGTH_, index,
761 BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
762 indexLength - BMP_INDEX_LENGTH_);
763 System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
764 SURROGATE_BLOCK_COUNT_);
765 indexLength += SURROGATE_BLOCK_COUNT_;
766 m_indexLength_ = indexLength;
772 private void fillBlock(int block, int start, int limit, int value,
778 while (block < limit) {
779 m_data_[block ++] = value;
783 while (block < limit) {
784 if (m_data_[block] == m_initialValue_) {
785 m_data_[block] = value;