/*
 ******************************************************************************
 * Copyright (C) 1996-2008, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */
\r
8 package com.ibm.icu.impl;
\r
10 import java.io.InputStream;
\r
11 import java.io.DataInputStream;
\r
12 import java.io.IOException;
\r
14 import com.ibm.icu.text.UTF16;
\r
17 * Trie implementation which stores data in char, 16 bits.
\r
19 * @see com.ibm.icu.impl.Trie
\r
20 * @since release 2.1, Jan 01 2002
\r
23 // note that i need to handle the block calculations later, since chartrie
\r
24 // in icu4c uses the same index array.
\r
25 public class CharTrie extends Trie
\r
27 // public constructors ---------------------------------------------
\r
30 * <p>Creates a new Trie with the settings for the trie data.</p>
\r
31 * <p>Unserialize the 32-bit-aligned input stream and use the data for the
\r
33 * @param inputStream file input stream to a ICU data file, containing
\r
35 * @param dataManipulate object which provides methods to parse the char
\r
37 * @throws IOException thrown when data reading fails
\r
39 public CharTrie(InputStream inputStream,
\r
40 DataManipulate dataManipulate) throws IOException
\r
42 super(inputStream, dataManipulate);
\r
44 if (!isCharTrie()) {
\r
45 throw new IllegalArgumentException(
\r
46 "Data given does not belong to a char trie.");
\r
48 m_friendAgent_ = new FriendAgent();
\r
52 * Make a dummy CharTrie.
\r
53 * A dummy trie is an empty runtime trie, used when a real data trie cannot
\r
56 * The trie always returns the initialValue,
\r
57 * or the leadUnitValue for lead surrogate code points.
\r
58 * The Latin-1 part is always set up to be linear.
\r
60 * @param initialValue the initial value that is set for all code points
\r
61 * @param leadUnitValue the value for lead surrogate code _units_ that do not
\r
62 * have associated supplementary data
\r
63 * @param dataManipulate object which provides methods to parse the char data
\r
65 public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
\r
66 super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
\r
68 int dataLength, latin1Length, i, limit;
\r
71 /* calculate the actual size of the dummy trie data */
\r
73 /* max(Latin-1, block 0) */
\r
74 dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
\r
75 if(leadUnitValue!=initialValue) {
\r
76 dataLength+=DATA_BLOCK_LENGTH;
\r
78 m_data_=new char[dataLength];
\r
79 m_dataLength_=dataLength;
\r
81 m_initialValue_=(char)initialValue;
\r
83 /* fill the index and data arrays */
\r
85 /* indexes are preset to 0 (block 0) */
\r
88 for(i=0; i<latin1Length; ++i) {
\r
89 m_data_[i]=(char)initialValue;
\r
92 if(leadUnitValue!=initialValue) {
\r
93 /* indexes for lead surrogate code units to the block after Latin-1 */
\r
94 block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
\r
95 i=0xd800>>INDEX_STAGE_1_SHIFT_;
\r
96 limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
\r
97 for(; i<limit; ++i) {
\r
101 /* data for lead surrogate code units */
\r
102 limit=latin1Length+DATA_BLOCK_LENGTH;
\r
103 for(i=latin1Length; i<limit; ++i) {
\r
104 m_data_[i]=(char)leadUnitValue;
\r
108 m_friendAgent_ = new FriendAgent();
\r
112 * Java friend implementation
\r
114 public class FriendAgent
\r
117 * Gives out the index array of the trie
\r
118 * @return index array of trie
\r
120 public char[] getPrivateIndex()
\r
125 * Gives out the data array of the trie
\r
126 * @return data array of trie
\r
128 public char[] getPrivateData()
\r
133 * Gives out the data offset in the trie
\r
134 * @return data offset in the trie
\r
136 public int getPrivateInitialValue()
\r
138 return m_initialValue_;
\r
142 // public methods --------------------------------------------------
\r
145 * Java friend implementation
\r
146 * To store the index and data array into the argument.
\r
147 * @param friend java friend UCharacterProperty object to store the array
\r
149 public void putIndexData(UCharacterProperty friend)
\r
151 friend.setIndexData(m_friendAgent_);
\r
155 * Gets the value associated with the codepoint.
\r
156 * If no value is associated with the codepoint, a default value will be
\r
158 * @param ch codepoint
\r
159 * @return offset to data
\r
161 public final char getCodePointValue(int ch)
\r
165 // fastpath for U+0000..U+D7FF
\r
166 if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
167 // copy of getRawOffset()
\r
168 offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
\r
169 + (ch & INDEX_STAGE_3_MASK_);
\r
170 return m_data_[offset];
\r
173 // handle U+D800..U+10FFFF
\r
174 offset = getCodePointOffset(ch);
\r
176 // return -1 if there is an error, in this case we return the default
\r
177 // value: m_initialValue_
\r
178 return (offset >= 0) ? m_data_[offset] : m_initialValue_;
\r
182 * Gets the value to the data which this lead surrogate character points
\r
184 * Returned data may contain folding offset information for the next
\r
185 * trailing surrogate character.
\r
186 * This method does not guarantee correct results for trail surrogates.
\r
187 * @param ch lead surrogate character
\r
188 * @return data value
\r
190 public final char getLeadValue(char ch)
\r
192 return m_data_[getLeadOffset(ch)];
\r
196 * Get the value associated with the BMP code point.
\r
197 * Lead surrogate code points are treated as normal code points, with
\r
198 * unfolded values that may differ from getLeadValue() results.
\r
199 * @param ch the input BMP code point
\r
200 * @return trie data value associated with the BMP codepoint
\r
202 public final char getBMPValue(char ch)
\r
204 return m_data_[getBMPOffset(ch)];
\r
208 * Get the value associated with a pair of surrogates.
\r
209 * @param lead a lead surrogate
\r
210 * @param trail a trail surrogate
\r
212 public final char getSurrogateValue(char lead, char trail)
\r
214 int offset = getSurrogateOffset(lead, trail);
\r
216 return m_data_[offset];
\r
218 return m_initialValue_;
\r
222 * <p>Get a value from a folding offset (from the value of a lead surrogate)
\r
223 * and a trail surrogate.</p>
\r
225 * @param leadvalue value associated with the lead surrogate which contains
\r
226 * the folding offset
\r
227 * @param trail surrogate
\r
228 * @return trie data value associated with the trail character
\r
230 public final char getTrailValue(int leadvalue, char trail)
\r
232 if (m_dataManipulate_ == null) {
\r
233 throw new NullPointerException(
\r
234 "The field DataManipulate in this Trie is null");
\r
236 int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
\r
238 return m_data_[getRawOffset(offset,
\r
239 (char)(trail & SURROGATE_MASK_))];
\r
241 return m_initialValue_;
\r
245 * <p>Gets the latin 1 fast path value.</p>
\r
246 * <p>Note this only works if latin 1 characters have their own linear
\r
248 * @param ch latin 1 characters
\r
249 * @return value associated with latin character
\r
251 public final char getLatin1LinearValue(char ch)
\r
253 return m_data_[INDEX_STAGE_3_MASK_ + 1 + m_dataOffset_ + ch];
\r
257 * Checks if the argument Trie has the same data as this Trie
\r
258 * @param other Trie to check
\r
259 * @return true if the argument Trie has the same data as this Trie, false
\r
263 public boolean equals(Object other)
\r
265 boolean result = super.equals(other);
\r
266 if (result && other instanceof CharTrie) {
\r
267 CharTrie othertrie = (CharTrie)other;
\r
268 return m_initialValue_ == othertrie.m_initialValue_;
\r
274 // protected methods -----------------------------------------------
\r
277 * <p>Parses the input stream and stores its trie content into a index and
\r
279 * @param inputStream data input stream containing trie data
\r
280 * @exception IOException thrown when data reading fails
\r
282 protected final void unserialize(InputStream inputStream)
\r
285 DataInputStream input = new DataInputStream(inputStream);
\r
286 int indexDataLength = m_dataOffset_ + m_dataLength_;
\r
287 m_index_ = new char[indexDataLength];
\r
288 for (int i = 0; i < indexDataLength; i ++) {
\r
289 m_index_[i] = input.readChar();
\r
291 m_data_ = m_index_;
\r
292 m_initialValue_ = m_data_[m_dataOffset_];
\r
296 * Gets the offset to the data which the surrogate pair points to.
\r
297 * @param lead lead surrogate
\r
298 * @param trail trailing surrogate
\r
299 * @return offset to data
\r
301 protected final int getSurrogateOffset(char lead, char trail)
\r
303 if (m_dataManipulate_ == null) {
\r
304 throw new NullPointerException(
\r
305 "The field DataManipulate in this Trie is null");
\r
308 // get fold position for the next trail surrogate
\r
309 int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
\r
311 // get the real data from the folded lead/trail units
\r
313 return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
\r
316 // return -1 if there is an error, in this case we return the default
\r
317 // value: m_initialValue_
\r
322 * Gets the value at the argument index.
\r
323 * For use internally in TrieIterator.
\r
324 * @param index value at index will be retrieved
\r
325 * @return 32 bit value
\r
326 * @see com.ibm.icu.impl.TrieIterator
\r
328 protected final int getValue(int index)
\r
330 return m_data_[index];
\r
334 * Gets the default initial value
\r
335 * @return 32 bit value
\r
337 protected final int getInitialValue()
\r
339 return m_initialValue_;
\r
342 // private data members --------------------------------------------
\r
347 private char m_initialValue_;
\r
349 * Array of char data
\r
351 private char m_data_[];
\r
353 * Agent for friends
\r
355 private FriendAgent m_friendAgent_;
\r