2 ******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
10 import java.io.DataInputStream;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.util.Arrays;
\r
15 import com.ibm.icu.lang.UCharacter;
\r
16 import com.ibm.icu.text.UTF16;
\r
19 * <p>A trie is a kind of compressed, serializable table of values
\r
20 * associated with Unicode code points (0..0x10ffff).</p>
\r
21 * <p>This class defines the basic structure of a trie and provides methods
\r
22 * to <b>retrieve the offsets to the actual data</b>.</p>
\r
23 * <p>Data will be the form of an array of basic types, char or int.</p>
\r
24 * <p>The actual data format will have to be specified by the user in the
\r
25 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
\r
26 * <p>This trie implementation is optimized for getting offset while walking
\r
27 * forward through a UTF-16 string.
\r
28 * Therefore, the simplest and fastest access macros are the
\r
29 * fromLead() and fromOffsetTrail() methods.
\r
30 * The fromBMP() method are a little more complicated; they get offsets even
\r
31 * for lead surrogate codepoints, while the fromLead() method get special
\r
32 * "folded" offsets for lead surrogate code units if there is relevant data
\r
33 * associated with them.
\r
34 * From such a folded offsets, an offset needs to be extracted to supply
\r
35 * to the fromOffsetTrail() methods.
\r
36 * To handle such supplementary codepoints, some offset information are kept
\r
38 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
\r
39 * that offset from the folded value for the lead surrogate unit.</p>
\r
40 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
\r
41 * com.ibm.icu.impl.IntTrie.</p>
\r
43 * @see com.ibm.icu.impl.CharTrie
\r
44 * @see com.ibm.icu.impl.IntTrie
\r
45 * @since release 2.1, Jan 01 2002
\r
47 public abstract class Trie
\r
49 // public class declaration ----------------------------------------
\r
52 * Character data in com.ibm.impl.Trie have different user-specified format
\r
53 * for different purposes.
\r
54 * This interface specifies methods to be implemented in order for
\r
55 * com.ibm.impl.Trie, to surrogate offset information encapsulated within
\r
58 public static interface DataManipulate
\r
61 * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
\r
63 * the index array offset of the indexes for that lead surrogate.
\r
64 * @param value data value for a surrogate from the trie, including the
\r
66 * @return data offset or 0 if there is no data for the lead surrogate
\r
68 public int getFoldingOffset(int value);
\r
71 // default implementation
\r
72 private static class DefaultGetFoldingOffset implements DataManipulate {
\r
73 public int getFoldingOffset(int value) {
\r
78 // public methods --------------------------------------------------
\r
81 * Determines if this trie has a linear latin 1 array
\r
82 * @return true if this trie has a linear latin 1 array, false otherwise
\r
84 public final boolean isLatin1Linear()
\r
86 return m_isLatin1Linear_;
\r
90 * Checks if the argument Trie has the same data as this Trie.
\r
91 * Attributes are checked but not the index data.
\r
92 * @param other Trie to check
\r
93 * @return true if the argument Trie has the same data as this Trie, false
\r
97 public boolean equals(Object other)
\r
99 if (other == this) {
\r
102 if (!(other instanceof Trie)) {
\r
105 Trie othertrie = (Trie)other;
\r
106 return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
\r
107 && m_options_ == othertrie.m_options_
\r
108 && m_dataLength_ == othertrie.m_dataLength_
\r
109 && Arrays.equals(m_index_, othertrie.m_index_);
\r
114 * Gets the serialized data file size of the Trie. This is used during
\r
115 * trie data reading for size checking purposes.
\r
116 * @return size size of serialized trie data file in terms of the number
\r
119 public int getSerializedDataSize()
\r
121 // includes signature, option, dataoffset and datalength output
\r
122 int result = (4 << 2);
\r
123 result += (m_dataOffset_ << 1);
\r
124 if (isCharTrie()) {
\r
125 result += (m_dataLength_ << 1);
\r
127 else if (isIntTrie()) {
\r
128 result += (m_dataLength_ << 2);
\r
133 // protected constructor -------------------------------------------
\r
136 * Trie constructor for CharTrie use.
\r
137 * @param inputStream ICU data file input stream which contains the
\r
139 * @param dataManipulate object containing the information to parse the
\r
141 * @throws IOException thrown when input stream does not have the
\r
144 protected Trie(InputStream inputStream,
\r
145 DataManipulate dataManipulate) throws IOException
\r
147 DataInputStream input = new DataInputStream(inputStream);
\r
148 // Magic number to authenticate the data.
\r
149 int signature = input.readInt();
\r
150 m_options_ = input.readInt();
\r
152 if (!checkHeader(signature)) {
\r
153 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
\r
156 if(dataManipulate != null) {
\r
157 m_dataManipulate_ = dataManipulate;
\r
159 m_dataManipulate_ = new DefaultGetFoldingOffset();
\r
161 m_isLatin1Linear_ = (m_options_ &
\r
162 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
\r
163 m_dataOffset_ = input.readInt();
\r
164 m_dataLength_ = input.readInt();
\r
165 unserialize(inputStream);
\r
170 * @param index array to be used for index
\r
171 * @param options used by the trie
\r
172 * @param dataManipulate object containing the information to parse the
\r
175 protected Trie(char index[], int options, DataManipulate dataManipulate)
\r
177 m_options_ = options;
\r
178 if(dataManipulate != null) {
\r
179 m_dataManipulate_ = dataManipulate;
\r
181 m_dataManipulate_ = new DefaultGetFoldingOffset();
\r
183 m_isLatin1Linear_ = (m_options_ &
\r
184 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
\r
186 m_dataOffset_ = m_index_.length;
\r
190 // protected data members ------------------------------------------
\r
193 * Lead surrogate code points' index displacement in the index array.
\r
194 * 0x10000-0xd800=0x2800
\r
195 * 0x2800 >> INDEX_STAGE_1_SHIFT_
\r
197 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
\r
199 * Shift size for shifting right the input index. 1..9
\r
201 protected static final int INDEX_STAGE_1_SHIFT_ = 5;
\r
203 * Shift size for shifting left the index array values.
\r
204 * Increases possible data size with 16-bit index values at the cost
\r
205 * of compactability.
\r
206 * This requires blocks of stage 2 data to be aligned by
\r
207 * DATA_GRANULARITY.
\r
208 * 0..INDEX_STAGE_1_SHIFT
\r
210 protected static final int INDEX_STAGE_2_SHIFT_ = 2;
\r
212 * Number of data values in a stage 2 (data array) block.
\r
214 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
\r
216 * Mask for getting the lower bits from the input index.
\r
217 * DATA_BLOCK_LENGTH - 1.
\r
219 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
\r
220 /** Number of bits of a trail surrogate that are used in index table lookups. */
\r
221 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
\r
223 * Number of index (stage 1) entries per lead surrogate.
\r
224 * Same as number of index entries for 1024 trail surrogates,
\r
225 * ==0x400>>INDEX_STAGE_1_SHIFT_
\r
227 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
\r
228 /** Length of the BMP portion of the index (stage 1) array. */
\r
229 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
\r
231 * Surrogate mask to use when shifting offset to retrieve supplementary
\r
234 protected static final int SURROGATE_MASK_ = 0x3FF;
\r
236 * Index or UTF16 characters
\r
238 protected char m_index_[];
\r
240 * Internal TrieValue which handles the parsing of the data value.
\r
241 * This class is to be implemented by the user
\r
243 protected DataManipulate m_dataManipulate_;
\r
245 * Start index of the data portion of the trie. CharTrie combines
\r
246 * index and data into a char array, so this is used to indicate the
\r
247 * initial offset to the data portion.
\r
248 * Note this index always points to the initial value.
\r
250 protected int m_dataOffset_;
\r
252 * Length of the data array
\r
254 protected int m_dataLength_;
\r
256 // protected methods -----------------------------------------------
\r
259 * Gets the offset to the data which the surrogate pair points to.
\r
260 * @param lead lead surrogate
\r
261 * @param trail trailing surrogate
\r
262 * @return offset to data
\r
264 protected abstract int getSurrogateOffset(char lead, char trail);
\r
267 * Gets the value at the argument index
\r
268 * @param index value at index will be retrieved
\r
269 * @return 32 bit value
\r
271 protected abstract int getValue(int index);
\r
274 * Gets the default initial value
\r
275 * @return 32 bit value
\r
277 protected abstract int getInitialValue();
\r
280 * Gets the offset to the data which the index ch after variable offset
\r
282 * Note for locating a non-supplementary character data offset, calling
\r
284 * getRawOffset(0, ch);
\r
286 * will do. Otherwise if it is a supplementary character formed by
\r
287 * surrogates lead and trail. Then we would have to call getRawOffset()
\r
288 * with getFoldingIndexOffset(). See getSurrogateOffset().
\r
289 * @param offset index offset which ch is to start from
\r
290 * @param ch index to be used after offset
\r
291 * @return offset to the data
\r
293 protected final int getRawOffset(int offset, char ch)
\r
295 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
\r
296 << INDEX_STAGE_2_SHIFT_)
\r
297 + (ch & INDEX_STAGE_3_MASK_);
\r
301 * Gets the offset to data which the BMP character points to
\r
302 * Treats a lead surrogate as a normal code point.
\r
303 * @param ch BMP character
\r
304 * @return offset to data
\r
306 protected final int getBMPOffset(char ch)
\r
308 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
\r
309 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
\r
310 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
\r
311 : getRawOffset(0, ch);
\r
312 // using a getRawOffset(ch) makes no diff
\r
316 * Gets the offset to the data which this lead surrogate character points
\r
318 * Data at the returned offset may contain folding offset information for
\r
319 * the next trailing surrogate character.
\r
320 * @param ch lead surrogate character
\r
321 * @return offset to data
\r
323 protected final int getLeadOffset(char ch)
\r
325 return getRawOffset(0, ch);
\r
329 * Internal trie getter from a code point.
\r
330 * Could be faster(?) but longer with
\r
331 * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
\r
332 * Gets the offset to data which the codepoint points to
\r
333 * @param ch codepoint
\r
334 * @return offset to data
\r
336 protected final int getCodePointOffset(int ch)
\r
338 // if ((ch >> 16) == 0) slower
\r
341 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
342 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
\r
343 return getRawOffset(0, (char)ch);
\r
344 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
\r
346 return getBMPOffset((char)ch);
\r
347 } else if (ch <= UCharacter.MAX_VALUE) {
\r
348 // look at the construction of supplementary characters
\r
349 // trail forms the ends of it.
\r
350 return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
\r
351 (char)(ch & SURROGATE_MASK_));
\r
353 // return -1 if there is an error, in this case we return
\r
359 * <p>Parses the inputstream and creates the trie index with it.</p>
\r
360 * <p>This is overwritten by the child classes.
\r
361 * @param inputStream input stream containing the trie information
\r
362 * @exception IOException thrown when data reading fails.
\r
364 protected void unserialize(InputStream inputStream) throws IOException
\r
366 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
\r
367 m_index_ = new char[m_dataOffset_];
\r
368 DataInputStream input = new DataInputStream(inputStream);
\r
369 for (int i = 0; i < m_dataOffset_; i ++) {
\r
370 m_index_[i] = input.readChar();
\r
375 * Determines if this is a 32 bit trie
\r
376 * @return true if options specifies this is a 32 bit trie
\r
378 protected final boolean isIntTrie()
\r
380 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
\r
384 * Determines if this is a 16 bit trie
\r
385 * @return true if this is a 16 bit trie
\r
387 protected final boolean isCharTrie()
\r
389 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
\r
392 // private data members --------------------------------------------
\r
394 // struct UTrieHeader {
\r
395 // int32_t signature;
\r
396 // int32_t options (a bit field)
\r
397 // int32_t indexLength
\r
398 // int32_t dataLength
\r
401 * Size of Trie header in bytes
\r
403 protected static final int HEADER_LENGTH_ = 4 * 4;
\r
405 * Latin 1 option mask
\r
407 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
\r
409 * Constant number to authenticate the byte block
\r
411 protected static final int HEADER_SIGNATURE_ = 0x54726965;
\r
413 * Header option formatting
\r
415 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
\r
416 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
\r
417 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
\r
420 * Flag indicator for Latin quick access data block
\r
422 private boolean m_isLatin1Linear_;
\r
425 * <p>Trie options field.</p>
\r
426 * <p>options bit field:<br>
\r
427 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
\r
428 * 8 0 = 16-bit data, 1=32-bit data<br>
\r
429 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
\r
430 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
\r
432 private int m_options_;
\r
434 // private methods ---------------------------------------------------
\r
437 * Authenticates raw data header.
\r
438 * Checking the header information, signature and options.
\r
439 * @param signature This contains the options and type of a Trie
\r
440 * @return true if the header is authenticated valid
\r
442 private final boolean checkHeader(int signature)
\r
444 // check the signature
\r
445 // Trie in big-endian US-ASCII (0x54726965).
\r
446 // Magic number to authenticate the data.
\r
447 if (signature != HEADER_SIGNATURE_) {
\r
451 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
\r
452 INDEX_STAGE_1_SHIFT_ ||
\r
453 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
\r
454 HEADER_OPTIONS_SHIFT_MASK_)
\r
455 != INDEX_STAGE_2_SHIFT_) {
\r