2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
8 package com.ibm.icu.impl;
10 import java.io.DataInputStream;
11 import java.io.IOException;
12 import java.io.InputStream;
13 import java.util.Arrays;
15 import com.ibm.icu.lang.UCharacter;
16 import com.ibm.icu.text.UTF16;
19 * <p>A trie is a kind of compressed, serializable table of values
20 * associated with Unicode code points (0..0x10ffff).</p>
21 * <p>This class defines the basic structure of a trie and provides methods
22 * to <b>retrieve the offsets to the actual data</b>.</p>
23 * <p>Data will be the form of an array of basic types, char or int.</p>
24 * <p>The actual data format will have to be specified by the user in the
25 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
26 * <p>This trie implementation is optimized for getting offset while walking
27 * forward through a UTF-16 string.
28 * Therefore, the simplest and fastest access macros are the
29 * fromLead() and fromOffsetTrail() methods.
30 * The fromBMP() method are a little more complicated; they get offsets even
31 * for lead surrogate codepoints, while the fromLead() method get special
32 * "folded" offsets for lead surrogate code units if there is relevant data
33 * associated with them.
34 * From such a folded offsets, an offset needs to be extracted to supply
35 * to the fromOffsetTrail() methods.
36 * To handle such supplementary codepoints, some offset information are kept
38 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
39 * that offset from the folded value for the lead surrogate unit.</p>
40 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
41 * com.ibm.icu.impl.IntTrie.</p>
43 * @see com.ibm.icu.impl.CharTrie
44 * @see com.ibm.icu.impl.IntTrie
45 * @since release 2.1, Jan 01 2002
47 public abstract class Trie
49 // public class declaration ----------------------------------------
52 * Character data in com.ibm.impl.Trie have different user-specified format
53 * for different purposes.
54 * This interface specifies methods to be implemented in order for
55 * com.ibm.impl.Trie, to surrogate offset information encapsulated within
58 public static interface DataManipulate
61 * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
63 * the index array offset of the indexes for that lead surrogate.
64 * @param value data value for a surrogate from the trie, including the
66 * @return data offset or 0 if there is no data for the lead surrogate
68 public int getFoldingOffset(int value);
71 // default implementation
72 private static class DefaultGetFoldingOffset implements DataManipulate {
73 public int getFoldingOffset(int value) {
78 // public methods --------------------------------------------------
81 * Determines if this trie has a linear latin 1 array
82 * @return true if this trie has a linear latin 1 array, false otherwise
84 public final boolean isLatin1Linear()
86 return m_isLatin1Linear_;
90 * Checks if the argument Trie has the same data as this Trie.
91 * Attributes are checked but not the index data.
92 * @param other Trie to check
93 * @return true if the argument Trie has the same data as this Trie, false
97 public boolean equals(Object other)
102 if (!(other instanceof Trie)) {
105 Trie othertrie = (Trie)other;
106 return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
107 && m_options_ == othertrie.m_options_
108 && m_dataLength_ == othertrie.m_dataLength_
109 && Arrays.equals(m_index_, othertrie.m_index_);
114 * Gets the serialized data file size of the Trie. This is used during
115 * trie data reading for size checking purposes.
116 * @return size size of serialized trie data file in terms of the number
119 public int getSerializedDataSize()
121 // includes signature, option, dataoffset and datalength output
122 int result = (4 << 2);
123 result += (m_dataOffset_ << 1);
125 result += (m_dataLength_ << 1);
127 else if (isIntTrie()) {
128 result += (m_dataLength_ << 2);
133 // protected constructor -------------------------------------------
136 * Trie constructor for CharTrie use.
137 * @param inputStream ICU data file input stream which contains the
139 * @param dataManipulate object containing the information to parse the
141 * @throws IOException thrown when input stream does not have the
144 protected Trie(InputStream inputStream,
145 DataManipulate dataManipulate) throws IOException
147 DataInputStream input = new DataInputStream(inputStream);
148 // Magic number to authenticate the data.
149 int signature = input.readInt();
150 m_options_ = input.readInt();
152 if (!checkHeader(signature)) {
153 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
156 if(dataManipulate != null) {
157 m_dataManipulate_ = dataManipulate;
159 m_dataManipulate_ = new DefaultGetFoldingOffset();
161 m_isLatin1Linear_ = (m_options_ &
162 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
163 m_dataOffset_ = input.readInt();
164 m_dataLength_ = input.readInt();
165 unserialize(inputStream);
170 * @param index array to be used for index
171 * @param options used by the trie
172 * @param dataManipulate object containing the information to parse the
175 protected Trie(char index[], int options, DataManipulate dataManipulate)
177 m_options_ = options;
178 if(dataManipulate != null) {
179 m_dataManipulate_ = dataManipulate;
181 m_dataManipulate_ = new DefaultGetFoldingOffset();
183 m_isLatin1Linear_ = (m_options_ &
184 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
186 m_dataOffset_ = m_index_.length;
190 // protected data members ------------------------------------------
193 * Lead surrogate code points' index displacement in the index array.
194 * 0x10000-0xd800=0x2800
195 * 0x2800 >> INDEX_STAGE_1_SHIFT_
197 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
199 * Shift size for shifting right the input index. 1..9
201 protected static final int INDEX_STAGE_1_SHIFT_ = 5;
203 * Shift size for shifting left the index array values.
204 * Increases possible data size with 16-bit index values at the cost
206 * This requires blocks of stage 2 data to be aligned by
208 * 0..INDEX_STAGE_1_SHIFT
210 protected static final int INDEX_STAGE_2_SHIFT_ = 2;
212 * Number of data values in a stage 2 (data array) block.
214 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
216 * Mask for getting the lower bits from the input index.
217 * DATA_BLOCK_LENGTH - 1.
219 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
220 /** Number of bits of a trail surrogate that are used in index table lookups. */
221 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
223 * Number of index (stage 1) entries per lead surrogate.
224 * Same as number of index entries for 1024 trail surrogates,
225 * ==0x400>>INDEX_STAGE_1_SHIFT_
227 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
228 /** Length of the BMP portion of the index (stage 1) array. */
229 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
231 * Surrogate mask to use when shifting offset to retrieve supplementary
234 protected static final int SURROGATE_MASK_ = 0x3FF;
236 * Index or UTF16 characters
238 protected char m_index_[];
240 * Internal TrieValue which handles the parsing of the data value.
241 * This class is to be implemented by the user
243 protected DataManipulate m_dataManipulate_;
245 * Start index of the data portion of the trie. CharTrie combines
246 * index and data into a char array, so this is used to indicate the
247 * initial offset to the data portion.
248 * Note this index always points to the initial value.
250 protected int m_dataOffset_;
252 * Length of the data array
254 protected int m_dataLength_;
256 // protected methods -----------------------------------------------
259 * Gets the offset to the data which the surrogate pair points to.
260 * @param lead lead surrogate
261 * @param trail trailing surrogate
262 * @return offset to data
264 protected abstract int getSurrogateOffset(char lead, char trail);
267 * Gets the value at the argument index
268 * @param index value at index will be retrieved
269 * @return 32 bit value
271 protected abstract int getValue(int index);
274 * Gets the default initial value
275 * @return 32 bit value
277 protected abstract int getInitialValue();
280 * Gets the offset to the data which the index ch after variable offset
282 * Note for locating a non-supplementary character data offset, calling
284 * getRawOffset(0, ch);
286 * will do. Otherwise if it is a supplementary character formed by
287 * surrogates lead and trail. Then we would have to call getRawOffset()
288 * with getFoldingIndexOffset(). See getSurrogateOffset().
289 * @param offset index offset which ch is to start from
290 * @param ch index to be used after offset
291 * @return offset to the data
293 protected final int getRawOffset(int offset, char ch)
295 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
296 << INDEX_STAGE_2_SHIFT_)
297 + (ch & INDEX_STAGE_3_MASK_);
301 * Gets the offset to data which the BMP character points to
302 * Treats a lead surrogate as a normal code point.
303 * @param ch BMP character
304 * @return offset to data
306 protected final int getBMPOffset(char ch)
308 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
309 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
310 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
311 : getRawOffset(0, ch);
312 // using a getRawOffset(ch) makes no diff
316 * Gets the offset to the data which this lead surrogate character points
318 * Data at the returned offset may contain folding offset information for
319 * the next trailing surrogate character.
320 * @param ch lead surrogate character
321 * @return offset to data
323 protected final int getLeadOffset(char ch)
325 return getRawOffset(0, ch);
329 * Internal trie getter from a code point.
330 * Could be faster(?) but longer with
331 * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
332 * Gets the offset to data which the codepoint points to
333 * @param ch codepoint
334 * @return offset to data
336 protected final int getCodePointOffset(int ch)
338 // if ((ch >> 16) == 0) slower
341 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
342 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
343 return getRawOffset(0, (char)ch);
344 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
346 return getBMPOffset((char)ch);
347 } else if (ch <= UCharacter.MAX_VALUE) {
348 // look at the construction of supplementary characters
349 // trail forms the ends of it.
350 return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
351 (char)(ch & SURROGATE_MASK_));
353 // return -1 if there is an error, in this case we return
359 * <p>Parses the inputstream and creates the trie index with it.</p>
360 * <p>This is overwritten by the child classes.
361 * @param inputStream input stream containing the trie information
362 * @exception IOException thrown when data reading fails.
364 protected void unserialize(InputStream inputStream) throws IOException
366 //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
367 m_index_ = new char[m_dataOffset_];
368 DataInputStream input = new DataInputStream(inputStream);
369 for (int i = 0; i < m_dataOffset_; i ++) {
370 m_index_[i] = input.readChar();
375 * Determines if this is a 32 bit trie
376 * @return true if options specifies this is a 32 bit trie
378 protected final boolean isIntTrie()
380 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
384 * Determines if this is a 16 bit trie
385 * @return true if this is a 16 bit trie
387 protected final boolean isCharTrie()
389 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
392 // private data members --------------------------------------------
394 // struct UTrieHeader {
395 // int32_t signature;
396 // int32_t options (a bit field)
397 // int32_t indexLength
398 // int32_t dataLength
401 * Size of Trie header in bytes
403 protected static final int HEADER_LENGTH_ = 4 * 4;
405 * Latin 1 option mask
407 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
409 * Constant number to authenticate the byte block
411 protected static final int HEADER_SIGNATURE_ = 0x54726965;
413 * Header option formatting
415 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
416 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
417 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
420 * Flag indicator for Latin quick access data block
422 private boolean m_isLatin1Linear_;
425 * <p>Trie options field.</p>
426 * <p>options bit field:<br>
427 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
428 * 8 0 = 16-bit data, 1=32-bit data<br>
429 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
430 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
432 private int m_options_;
434 // private methods ---------------------------------------------------
437 * Authenticates raw data header.
438 * Checking the header information, signature and options.
439 * @param signature This contains the options and type of a Trie
440 * @return true if the header is authenticated valid
442 private final boolean checkHeader(int signature)
444 // check the signature
445 // Trie in big-endian US-ASCII (0x54726965).
446 // Magic number to authenticate the data.
447 if (signature != HEADER_SIGNATURE_) {
451 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
452 INDEX_STAGE_1_SHIFT_ ||
453 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
454 HEADER_OPTIONS_SHIFT_MASK_)
455 != INDEX_STAGE_2_SHIFT_) {