2 ******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
10 import java.io.InputStream;
\r
11 import java.io.DataInputStream;
\r
12 import java.io.IOException;
\r
13 import java.util.Arrays;
\r
14 import com.ibm.icu.text.UTF16;
\r
17 * Trie implementation which stores data in int, 32 bits.
\r
19 * @see com.ibm.icu.impl.Trie
\r
20 * @since release 2.1, Jan 01 2002
\r
22 public class IntTrie extends Trie
\r
24 // public constructors ---------------------------------------------
\r
27 * <p>Creates a new Trie with the settings for the trie data.</p>
\r
28 * <p>Unserialize the 32-bit-aligned input stream and use the data for the
\r
30 * @param inputStream file input stream to a ICU data file, containing
\r
32 * @param dataManipulate object which provides methods to parse the char
\r
34 * @throws IOException thrown when data reading fails
\r
36 public IntTrie(InputStream inputStream, DataManipulate dataManipulate)
\r
39 super(inputStream, dataManipulate);
\r
41 throw new IllegalArgumentException(
\r
42 "Data given does not belong to a int trie.");
\r
47 * Make a dummy IntTrie.
\r
48 * A dummy trie is an empty runtime trie, used when a real data trie cannot
\r
51 * The trie always returns the initialValue,
\r
52 * or the leadUnitValue for lead surrogate code points.
\r
53 * The Latin-1 part is always set up to be linear.
\r
55 * @param initialValue the initial value that is set for all code points
\r
56 * @param leadUnitValue the value for lead surrogate code _units_ that do not
\r
57 * have associated supplementary data
\r
58 * @param dataManipulate object which provides methods to parse the char data
\r
60 public IntTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
\r
61 super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
\r
63 int dataLength, latin1Length, i, limit;
\r
66 /* calculate the actual size of the dummy trie data */
\r
68 /* max(Latin-1, block 0) */
\r
69 dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
\r
70 if(leadUnitValue!=initialValue) {
\r
71 dataLength+=DATA_BLOCK_LENGTH;
\r
73 m_data_=new int[dataLength];
\r
74 m_dataLength_=dataLength;
\r
76 m_initialValue_=initialValue;
\r
78 /* fill the index and data arrays */
\r
80 /* indexes are preset to 0 (block 0) */
\r
83 for(i=0; i<latin1Length; ++i) {
\r
84 m_data_[i]=initialValue;
\r
87 if(leadUnitValue!=initialValue) {
\r
88 /* indexes for lead surrogate code units to the block after Latin-1 */
\r
89 block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
\r
90 i=0xd800>>INDEX_STAGE_1_SHIFT_;
\r
91 limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
\r
92 for(; i<limit; ++i) {
\r
96 /* data for lead surrogate code units */
\r
97 limit=latin1Length+DATA_BLOCK_LENGTH;
\r
98 for(i=latin1Length; i<limit; ++i) {
\r
99 m_data_[i]=leadUnitValue;
\r
104 // public methods --------------------------------------------------
\r
107 * Gets the value associated with the codepoint.
\r
108 * If no value is associated with the codepoint, a default value will be
\r
110 * @param ch codepoint
\r
111 * @return offset to data
\r
113 public final int getCodePointValue(int ch)
\r
117 // fastpath for U+0000..U+D7FF
\r
118 if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
119 // copy of getRawOffset()
\r
120 offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
\r
121 + (ch & INDEX_STAGE_3_MASK_);
\r
122 return m_data_[offset];
\r
125 // handle U+D800..U+10FFFF
\r
126 offset = getCodePointOffset(ch);
\r
127 return (offset >= 0) ? m_data_[offset] : m_initialValue_;
\r
131 * Gets the value to the data which this lead surrogate character points
\r
133 * Returned data may contain folding offset information for the next
\r
134 * trailing surrogate character.
\r
135 * This method does not guarantee correct results for trail surrogates.
\r
136 * @param ch lead surrogate character
\r
137 * @return data value
\r
139 public final int getLeadValue(char ch)
\r
141 return m_data_[getLeadOffset(ch)];
\r
145 * Get the value associated with the BMP code point.
\r
146 * Lead surrogate code points are treated as normal code points, with
\r
147 * unfolded values that may differ from getLeadValue() results.
\r
148 * @param ch the input BMP code point
\r
149 * @return trie data value associated with the BMP codepoint
\r
151 public final int getBMPValue(char ch)
\r
153 return m_data_[getBMPOffset(ch)];
\r
157 * Get the value associated with a pair of surrogates.
\r
158 * @param lead a lead surrogate
\r
159 * @param trail a trail surrogate
\r
161 public final int getSurrogateValue(char lead, char trail)
\r
163 if (!UTF16.isLeadSurrogate(lead) || !UTF16.isTrailSurrogate(trail)) {
\r
164 throw new IllegalArgumentException(
\r
165 "Argument characters do not form a supplementary character");
\r
167 // get fold position for the next trail surrogate
\r
168 int offset = getSurrogateOffset(lead, trail);
\r
170 // get the real data from the folded lead/trail units
\r
172 return m_data_[offset];
\r
175 // return m_initialValue_ if there is an error
\r
176 return m_initialValue_;
\r
180 * Get a value from a folding offset (from the value of a lead surrogate)
\r
181 * and a trail surrogate.
\r
182 * @param leadvalue the value of a lead surrogate that contains the
\r
184 * @param trail surrogate
\r
185 * @return trie data value associated with the trail character
\r
187 public final int getTrailValue(int leadvalue, char trail)
\r
189 if (m_dataManipulate_ == null) {
\r
190 throw new NullPointerException(
\r
191 "The field DataManipulate in this Trie is null");
\r
193 int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
\r
195 return m_data_[getRawOffset(offset,
\r
196 (char)(trail & SURROGATE_MASK_))];
\r
198 return m_initialValue_;
\r
202 * <p>Gets the latin 1 fast path value.</p>
\r
203 * <p>Note this only works if latin 1 characters have their own linear
\r
205 * @param ch latin 1 characters
\r
206 * @return value associated with latin character
\r
208 public final int getLatin1LinearValue(char ch)
\r
210 return m_data_[INDEX_STAGE_3_MASK_ + 1 + ch];
\r
214 * Checks if the argument Trie has the same data as this Trie
\r
215 * @param other Trie to check
\r
216 * @return true if the argument Trie has the same data as this Trie, false
\r
220 public boolean equals(Object other)
\r
222 boolean result = super.equals(other);
\r
223 if (result && other instanceof IntTrie) {
\r
224 IntTrie othertrie = (IntTrie)other;
\r
225 if (m_initialValue_ != othertrie.m_initialValue_
\r
226 || !Arrays.equals(m_data_, othertrie.m_data_)) {
\r
235 // protected methods -----------------------------------------------
\r
238 * <p>Parses the input stream and stores its trie content into a index and
\r
240 * @param inputStream data input stream containing trie data
\r
241 * @exception IOException thrown when data reading fails
\r
243 protected final void unserialize(InputStream inputStream)
\r
246 super.unserialize(inputStream);
\r
247 // one used for initial value
\r
248 m_data_ = new int[m_dataLength_];
\r
249 DataInputStream input = new DataInputStream(inputStream);
\r
250 for (int i = 0; i < m_dataLength_; i ++) {
\r
251 m_data_[i] = input.readInt();
\r
253 m_initialValue_ = m_data_[0];
\r
257 * Gets the offset to the data which the surrogate pair points to.
\r
258 * @param lead lead surrogate
\r
259 * @param trail trailing surrogate
\r
260 * @return offset to data
\r
262 protected final int getSurrogateOffset(char lead, char trail)
\r
264 if (m_dataManipulate_ == null) {
\r
265 throw new NullPointerException(
\r
266 "The field DataManipulate in this Trie is null");
\r
268 // get fold position for the next trail surrogate
\r
269 int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
\r
271 // get the real data from the folded lead/trail units
\r
273 return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
\r
276 // return -1 if there is an error, in this case we return the default
\r
277 // value: m_initialValue_
\r
282 * Gets the value at the argument index.
\r
283 * For use internally in TrieIterator
\r
284 * @param index value at index will be retrieved
\r
285 * @return 32 bit value
\r
286 * @see com.ibm.icu.impl.TrieIterator
\r
288 protected final int getValue(int index)
\r
290 return m_data_[index];
\r
294 * Gets the default initial value
\r
295 * @return 32 bit value
\r
297 protected final int getInitialValue()
\r
299 return m_initialValue_;
\r
302 // package private methods -----------------------------------------
\r
305 * Internal constructor for builder use
\r
306 * @param index the index array to be slotted into this trie
\r
307 * @param data the data array to be slotted into this trie
\r
308 * @param initialvalue the initial value for this trie
\r
309 * @param options trie options to use
\r
310 * @param datamanipulate folding implementation
\r
312 IntTrie(char index[], int data[], int initialvalue, int options,
\r
313 DataManipulate datamanipulate)
\r
315 super(index, options, datamanipulate);
\r
317 m_dataLength_ = m_data_.length;
\r
318 m_initialValue_ = initialvalue;
\r
321 // private data members --------------------------------------------
\r
326 private int m_initialValue_;
\r
328 * Array of char data
\r
330 private int m_data_[];
\r