2 ******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
10 import java.util.Arrays;
\r
12 import com.ibm.icu.lang.UCharacter;
\r
15 * Builder class to manipulate and generate a trie.
\r
16 * This is useful for ICU data in primitive types.
\r
17 * Provides a compact way to store information that is indexed by Unicode
\r
18 * values, such as character properties, types, keyboard values, etc. This is
\r
19 * very useful when you have a block of Unicode data that contains significant
\r
20 * values while the rest of the Unicode data is unused in the application or
\r
21 * when you have a lot of redundance, such as where all 21,000 Han ideographs
\r
22 * have the same value. However, lookup is much faster than a hash table.
\r
23 * A trie of any primitive data type serves two purposes:
\r
25 * <LI>Fast access of the indexed values.
\r
26 * <LI>Smaller memory footprint.
\r
28 * This is a direct port from the ICU4C version
\r
29 * @author Syn Wee Quek
\r
31 public class TrieBuilder
\r
33 // public data member ----------------------------------------------
\r
36 * Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
\r
39 public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
\r
41 // public class declaration ----------------------------------------
\r
44 * Character data in com.ibm.impl.Trie have different user-specified format
\r
45 * for different purposes.
\r
46 * This interface specifies methods to be implemented in order for
\r
47 * com.ibm.impl.Trie, to surrogate offset information encapsulated within
\r
50 public static interface DataManipulate
\r
53 * Build-time trie callback function, used with serialize().
\r
54 * This function calculates a lead surrogate's value including a
\r
55 * folding offset from the 1024 supplementary code points
\r
56 * [start..start+1024[ .
\r
57 * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
\r
58 * The folding offset is provided by the caller.
\r
59 * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT
\r
61 * Instead of the offset itself, n can be stored in 10 bits - or fewer
\r
62 * if it can be assumed that few lead surrogates have associated data.
\r
63 * The returned value must be
\r
64 * - not zero if and only if there is relevant data for the
\r
65 * corresponding 1024 supplementary code points
\r
66 * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(...,
\r
68 * @return a folded value, or 0 if there is no relevant data for the
\r
71 public int getFoldedValue(int start, int offset);
\r
74 // public methods ----------------------------------------------------
\r
77 * Checks if the character belongs to a zero block in the trie
\r
78 * @param ch codepoint which data is to be retrieved
\r
79 * @return true if ch is in the zero block
\r
81 public boolean isInZeroBlock(int ch)
\r
83 // valid, uncompacted trie and valid c?
\r
84 if (m_isCompacted_ || ch > UCharacter.MAX_VALUE
\r
85 || ch < UCharacter.MIN_VALUE) {
\r
89 return m_index_[ch >> SHIFT_] == 0;
\r
92 // package private method -----------------------------------------------
\r
94 // protected data member -----------------------------------------------
\r
97 * Index values at build-time are 32 bits wide for easier processing.
\r
98 * Bit 31 is set if the data block is used by multiple index values
\r
99 * (from setRange()).
\r
101 protected int m_index_[];
\r
102 protected int m_indexLength_;
\r
103 protected int m_dataCapacity_;
\r
104 protected int m_dataLength_;
\r
105 protected boolean m_isLatin1Linear_;
\r
106 protected boolean m_isCompacted_;
\r
108 * Map of adjusted indexes, used in utrie_compact().
\r
109 * Maps from original indexes to new ones.
\r
111 protected int m_map_[];
\r
114 * Shift size for shifting right the input index. 1..9
\r
116 protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;
\r
118 * Length of the index (stage 1) array before folding.
\r
119 * Maximum number of Unicode code points (0x110000) shifted right by
\r
122 protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);
\r
124 * Length of the BMP portion of the index (stage 1) array.
\r
126 protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;
\r
128 * Number of index (stage 1) entries per lead surrogate.
\r
129 * Same as number of indexe entries for 1024 trail surrogates,
\r
130 * ==0x400>>UTRIE_SHIFT
\r
131 * 10 - SHIFT == Number of bits of a trail surrogate that are used in
\r
132 * index table lookups.
\r
134 protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
\r
136 * Mask for getting the lower bits from the input index.
\r
137 * DATA_BLOCK_LENGTH - 1.
\r
139 protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
\r
141 * Shift size for shifting left the index array values.
\r
142 * Increases possible data size with 16-bit index values at the cost
\r
143 * of compactability.
\r
144 * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
\r
147 protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;
\r
149 * Maximum length of the runtime data (stage 2) array.
\r
150 * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_.
\r
152 protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);
\r
154 * Shifting to position the index value in options
\r
156 protected static final int OPTIONS_INDEX_SHIFT_ = 4;
\r
158 * If set, then the data (stage 2) array is 32 bits wide.
\r
160 protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;
\r
162 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data
\r
163 * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH.
\r
165 protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;
\r
167 * The alignment size of a stage 2 data block. Also the granularity for
\r
170 protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;
\r
172 // protected constructor ----------------------------------------------
\r
174 protected TrieBuilder()
\r
176 m_index_ = new int[MAX_INDEX_LENGTH_];
\r
177 m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
\r
178 m_isLatin1Linear_ = false;
\r
179 m_isCompacted_ = false;
\r
180 m_indexLength_ = MAX_INDEX_LENGTH_;
\r
183 protected TrieBuilder(TrieBuilder table)
\r
185 m_index_ = new int[MAX_INDEX_LENGTH_];
\r
186 m_indexLength_ = table.m_indexLength_;
\r
187 System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_);
\r
188 m_dataCapacity_ = table.m_dataCapacity_;
\r
189 m_dataLength_ = table.m_dataLength_;
\r
190 m_map_ = new int[table.m_map_.length];
\r
191 System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
\r
192 m_isLatin1Linear_ = table.m_isLatin1Linear_;
\r
193 m_isCompacted_ = table.m_isCompacted_;
\r
196 // protected functions ------------------------------------------------
\r
199 * Compare two sections of an array for equality.
\r
201 protected static final boolean equal_int(int[] array, int start1, int start2, int length) {
\r
202 while(length>0 && array[start1]==array[start2]) {
\r
211 * Set a value in the trie index map to indicate which data block
\r
212 * is referenced and which one is not.
\r
213 * utrie_compact() will remove data blocks that are not used at all.
\r
215 * - 0 if it is used
\r
216 * - -1 if it is not used
\r
218 protected void findUnusedBlocks()
\r
220 // fill the entire map with "not used"
\r
221 Arrays.fill(m_map_, 0xff);
\r
223 // mark each block that _is_ used with 0
\r
224 for (int i = 0; i < m_indexLength_; ++ i) {
\r
225 m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
\r
228 // never move the all-initial-value block 0
\r
233 * Finds the same index block as the otherBlock
\r
234 * @param index array
\r
235 * @param indexLength size of index
\r
236 * @param otherBlock
\r
237 * @return same index block
\r
239 protected static final int findSameIndexBlock(int index[], int indexLength,
\r
242 for (int block = BMP_INDEX_LENGTH_; block < indexLength;
\r
243 block += SURROGATE_BLOCK_COUNT_) {
\r
244 if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) {
\r
248 return indexLength;
\r
251 // private data member ------------------------------------------------
\r
254 * Maximum length of the build-time data (stage 2) array.
\r
255 * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
\r
256 * (Number of Unicode code points + one all-initial-value block +
\r
257 * possible duplicate entries for 1024 lead surrogates.)
\r
259 private static final int MAX_BUILD_TIME_DATA_LENGTH_ =
\r
260 0x110000 + DATA_BLOCK_LENGTH + 0x400;
\r