2 ******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
8 package com.ibm.icu.impl;
10 import java.io.DataInputStream;
11 import java.io.IOException;
12 import java.io.InputStream;
14 import com.ibm.icu.text.UTF16;
17 * Trie implementation which stores data in char, 16 bits.
19 * @see com.ibm.icu.impl.Trie
20 * @since release 2.1, Jan 01 2002
// Note: the block calculations still need to be handled later, since the
// CharTrie in ICU4C uses the same index array.
25 public class CharTrie extends Trie
27 // public constructors ---------------------------------------------
/**
 * <p>Creates a new CharTrie from serialized trie data.</p>
 * <p>The 32-bit-aligned input stream is handed to the superclass
 * constructor for unserializing; afterwards the loaded data is checked to
 * really describe a char (16-bit) trie.</p>
 *
 * @param inputStream file input stream to an ICU data file, containing
 *                    the trie
 * @param dataManipulate object which provides methods to parse the char
 *                       data
 * @throws IOException thrown when data reading fails
 * @throws IllegalArgumentException if the data does not belong to a char
 *                                  trie
 */
public CharTrie(InputStream inputStream,
                DataManipulate dataManipulate) throws IOException
{
    super(inputStream, dataManipulate);

    // NOTE(review): the condition guarding this throw was missing from the
    // reviewed text (the throw was unconditional, which would make the
    // constructor unusable). It is restored here as a check against the
    // superclass's isCharTrie() -- confirm against Trie.
    if (!isCharTrie()) {
        throw new IllegalArgumentException(
                           "Data given does not belong to a char trie.");
    }
}
51 * Make a dummy CharTrie.
52 * A dummy trie is an empty runtime trie, used when a real data trie cannot
55 * The trie always returns the initialValue,
56 * or the leadUnitValue for lead surrogate code points.
57 * The Latin-1 part is always set up to be linear.
59 * @param initialValue the initial value that is set for all code points
60 * @param leadUnitValue the value for lead surrogate code _units_ that do not
61 * have associated supplementary data
62 * @param dataManipulate object which provides methods to parse the char data
64 @SuppressWarnings("all") // No way to ignore dead code warning specifically - see eclipse bug#282770
65 public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
66 super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
68 int dataLength, latin1Length, i, limit;
71 /* calculate the actual size of the dummy trie data */
73 /* max(Latin-1, block 0) */
74 dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
75 if(leadUnitValue!=initialValue) {
76 dataLength+=DATA_BLOCK_LENGTH;
78 m_data_=new char[dataLength];
79 m_dataLength_=dataLength;
81 m_initialValue_=(char)initialValue;
83 /* fill the index and data arrays */
85 /* indexes are preset to 0 (block 0) */
88 for(i=0; i<latin1Length; ++i) {
89 m_data_[i]=(char)initialValue;
92 if(leadUnitValue!=initialValue) {
93 /* indexes for lead surrogate code units to the block after Latin-1 */
94 block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
95 i=0xd800>>INDEX_STAGE_1_SHIFT_;
96 limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
101 /* data for lead surrogate code units */
102 limit=latin1Length+DATA_BLOCK_LENGTH;
103 for(i=latin1Length; i<limit; ++i) {
104 m_data_[i]=(char)leadUnitValue;
109 // public methods --------------------------------------------------
112 * Gets the value associated with the codepoint.
113 * If no value is associated with the codepoint, a default value will be
115 * @param ch codepoint
116 * @return offset to data
118 public final char getCodePointValue(int ch)
122 // fastpath for U+0000..U+D7FF
123 if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
124 // copy of getRawOffset()
125 offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
126 + (ch & INDEX_STAGE_3_MASK_);
127 return m_data_[offset];
130 // handle U+D800..U+10FFFF
131 offset = getCodePointOffset(ch);
133 // return -1 if there is an error, in this case we return the default
134 // value: m_initialValue_
135 return (offset >= 0) ? m_data_[offset] : m_initialValue_;
139 * Gets the value to the data which this lead surrogate character points
141 * Returned data may contain folding offset information for the next
142 * trailing surrogate character.
143 * This method does not guarantee correct results for trail surrogates.
144 * @param ch lead surrogate character
147 public final char getLeadValue(char ch)
149 return m_data_[getLeadOffset(ch)];
153 * Get the value associated with the BMP code point.
154 * Lead surrogate code points are treated as normal code points, with
155 * unfolded values that may differ from getLeadValue() results.
156 * @param ch the input BMP code point
157 * @return trie data value associated with the BMP codepoint
159 public final char getBMPValue(char ch)
161 return m_data_[getBMPOffset(ch)];
165 * Get the value associated with a pair of surrogates.
166 * @param lead a lead surrogate
167 * @param trail a trail surrogate
169 public final char getSurrogateValue(char lead, char trail)
171 int offset = getSurrogateOffset(lead, trail);
173 return m_data_[offset];
175 return m_initialValue_;
179 * <p>Get a value from a folding offset (from the value of a lead surrogate)
180 * and a trail surrogate.</p>
182 * @param leadvalue value associated with the lead surrogate which contains
184 * @param trail surrogate
185 * @return trie data value associated with the trail character
187 public final char getTrailValue(int leadvalue, char trail)
189 if (m_dataManipulate_ == null) {
190 throw new NullPointerException(
191 "The field DataManipulate in this Trie is null");
193 int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
195 return m_data_[getRawOffset(offset,
196 (char)(trail & SURROGATE_MASK_))];
198 return m_initialValue_;
202 * <p>Gets the latin 1 fast path value.</p>
203 * <p>Note this only works if latin 1 characters have their own linear
205 * @param ch latin 1 characters
206 * @return value associated with latin character
208 public final char getLatin1LinearValue(char ch)
210 return m_data_[INDEX_STAGE_3_MASK_ + 1 + m_dataOffset_ + ch];
214 * Checks if the argument Trie has the same data as this Trie
215 * @param other Trie to check
216 * @return true if the argument Trie has the same data as this Trie, false
220 public boolean equals(Object other)
222 boolean result = super.equals(other);
223 if (result && other instanceof CharTrie) {
224 CharTrie othertrie = (CharTrie)other;
225 return m_initialValue_ == othertrie.m_initialValue_;
230 public int hashCode() {
231 assert false : "hashCode not designed";
236 // protected methods -----------------------------------------------
239 * <p>Parses the input stream and stores its trie content into a index and
241 * @param inputStream data input stream containing trie data
242 * @exception IOException thrown when data reading fails
244 protected final void unserialize(InputStream inputStream)
247 DataInputStream input = new DataInputStream(inputStream);
248 int indexDataLength = m_dataOffset_ + m_dataLength_;
249 m_index_ = new char[indexDataLength];
250 for (int i = 0; i < indexDataLength; i ++) {
251 m_index_[i] = input.readChar();
254 m_initialValue_ = m_data_[m_dataOffset_];
258 * Gets the offset to the data which the surrogate pair points to.
259 * @param lead lead surrogate
260 * @param trail trailing surrogate
261 * @return offset to data
263 protected final int getSurrogateOffset(char lead, char trail)
265 if (m_dataManipulate_ == null) {
266 throw new NullPointerException(
267 "The field DataManipulate in this Trie is null");
270 // get fold position for the next trail surrogate
271 int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
273 // get the real data from the folded lead/trail units
275 return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
278 // return -1 if there is an error, in this case we return the default
279 // value: m_initialValue_
284 * Gets the value at the argument index.
285 * For use internally in TrieIterator.
286 * @param index value at index will be retrieved
287 * @return 32 bit value
288 * @see com.ibm.icu.impl.TrieIterator
290 protected final int getValue(int index)
292 return m_data_[index];
296 * Gets the default initial value
297 * @return 32 bit value
299 protected final int getInitialValue()
301 return m_initialValue_;
304 // private data members --------------------------------------------
309 private char m_initialValue_;
313 private char m_data_[];