2 *******************************************************************************
\r
3 * Copyright (C) 2009-2010, International Business Machines Corporation and
\r
4 * others. All Rights Reserved.
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.DataOutputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.io.OutputStream;
\r
18 * A read-only Trie2, holding 16 bit data values.
\r
20 * A Trie2 is a highly optimized data structure for mapping from Unicode
\r
21 * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
\r
23 * See class Trie2 for descriptions of the API for accessing the contents of a trie.
\r
25 * The fundamental data access methods are declared final in this class, with
\r
26 * the intent that applications might gain a little extra performance, when compared
\r
27 * with calling the same methods via the abstract UTrie2 base class.
\r
29 public final class Trie2_16 extends Trie2 {
\r
33 * Internal constructor, not for general use.
\r
40 * Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
\r
41 * The serialized format is identical between ICU4C and ICU4J, so this function
\r
42 * will work with serialized Trie2s from either.
\r
44 * The serialized Trie2 on the stream may be in either little or big endian byte order.
\r
45 * This allows using serialized Tries from ICU4C without needing to consider the
\r
46 * byte order of the system that created them.
\r
48 * @param is an input stream to the serialized form of a UTrie2.
\r
49 * @return An unserialized Trie_16, ready for use.
\r
50 * @throws IllegalArgumentException if the stream does not contain a serialized Trie2.
\r
51 * @throws IOException if a read error occurs on the InputStream.
\r
52 * @throws ClassCastException if the stream contains a serialized Trie2_32
\r
54 public static Trie2_16 createFromSerialized(InputStream is) throws IOException {
\r
55 return (Trie2_16) Trie2.createFromSerialized(is);
\r
59 * Get the value for a code point as stored in the Trie2.
\r
61 * @param codePoint the code point
\r
65 public final int get(int codePoint) {
\r
69 if (codePoint >= 0) {
\r
70 if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
\r
71 // Ordinary BMP code point, excluding leading surrogates.
\r
72 // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
\r
73 // 16 bit data is stored in the index array itself.
\r
74 ix = index[codePoint >> UTRIE2_SHIFT_2];
\r
75 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
\r
79 if (codePoint <= 0xffff) {
\r
80 // Lead Surrogate Code Point. A Separate index section is stored for
\r
81 // lead surrogate code units and code points.
\r
82 // The main index has the code unit data.
\r
83 // For this function, we need the code point data.
\r
84 // Note: this expression could be refactored for slightly improved efficiency, but
\r
85 // surrogate code points will be so rare in practice that it's not worth it.
\r
86 ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
\r
87 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
\r
91 if (codePoint < highStart) {
\r
92 // Supplemental code point, use two-level lookup.
\r
93 ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
\r
95 ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
\r
97 ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
\r
101 if (codePoint <= 0x10ffff) {
\r
102 value = index[highValueIndex];
\r
107 // Fall through. The code point is outside of the legal range of 0..0x10ffff.
\r
113 * Get a Trie2 value for a UTF-16 code unit.
\r
115 * This function returns the same value as get() if the input
\r
116 * character is outside of the lead surrogate range
\r
118 * There are two values stored in a Trie2 for inputs in the lead
\r
119 * surrogate range. This function returns the alternate value,
\r
120 * while Trie2.get() returns the main value.
\r
122 * @param codeUnit a 16 bit code unit or lead surrogate value.
\r
123 * @return the value
\r
126 public int getFromU16SingleLead(char codeUnit) {
\r
130 // Because the input is a 16 bit char, we can skip the tests for it being in
\r
131 // the BMP range. It is.
\r
132 ix = index[codeUnit >> UTRIE2_SHIFT_2];
\r
133 ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
\r
140 * Serialize a Trie2_16 onto an OutputStream.
\r
142 * A Trie2 can be serialized multiple times.
\r
143 * The serialized data is compatible with ICU4C UTrie2 serialization.
\r
144 * Trie2 serialization is unrelated to Java object serialization.
\r
146 * @param os the stream to which the serialized Trie2 data will be written.
\r
147 * @return the number of bytes written.
\r
148 * @throw IOException on an error writing to the OutputStream.
\r
150 public int serialize(OutputStream os) throws IOException {
\r
151 DataOutputStream dos = new DataOutputStream(os);
\r
152 int bytesWritten = 0;
\r
154 bytesWritten += serializeHeader(dos);
\r
155 for (int i=0; i<dataLength; i++) {
\r
156 dos.writeChar(index[data16+i]);
\r
158 bytesWritten += dataLength*2;
\r
159 return bytesWritten;
\r
163 * @return the number of bytes of the serialized trie
\r
165 public int getSerializedLength() {
\r
166 return 16+(header.indexLength+dataLength)*2;
\r
170 * Given a starting code point, find the last in a range of code points,
\r
171 * all with the same value.
\r
173 * This function is part of the implementation of iterating over the
\r
174 * Trie2's contents.
\r
175 * @param startingCP The code point at which to begin looking.
\r
176 * @return The last code point with the same value as the starting code point.
\r
179 int rangeEnd(int startingCP, int limit, int value) {
\r
180 int cp = startingCP;
\r
182 int index2Block = 0;
\r
184 // Loop runs once for each of
\r
185 // - a partial data block
\r
186 // - a reference to the null (default) data block.
\r
187 // - a reference to the index2 null block
\r
194 if (cp < 0x0d800 || (cp > 0x0dbff && cp <= 0x0ffff)) {
\r
195 // Ordinary BMP code point, excluding leading surrogates.
\r
196 // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
\r
197 // 16 bit data is stored in the index array itself.
\r
199 block = index[cp >> UTRIE2_SHIFT_2] << UTRIE2_INDEX_SHIFT;
\r
200 } else if (cp < 0xffff) {
\r
201 // Lead Surrogate Code Point, 0xd800 <= cp < 0xdc00
\r
202 index2Block = UTRIE2_LSCP_INDEX_2_OFFSET;
\r
203 block = index[index2Block + ((cp - 0xd800) >> UTRIE2_SHIFT_2)] << UTRIE2_INDEX_SHIFT;
\r
204 } else if (cp < highStart) {
\r
205 // Supplemental code point, use two-level lookup.
\r
206 int ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (cp >> UTRIE2_SHIFT_1);
\r
207 index2Block = index[ix];
\r
208 block = index[index2Block + ((cp >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK)] << UTRIE2_INDEX_SHIFT;
\r
210 // Code point above highStart.
\r
211 if (value == index[highValueIndex]) {
\r
217 if (index2Block == index2NullOffset) {
\r
218 if (value != initialValue) {
\r
221 cp += UTRIE2_CP_PER_INDEX_1_ENTRY;
\r
222 } else if (block == dataNullOffset) {
\r
223 // The block at dataNullOffset has all values == initialValue.
\r
224 // Because Trie2 iteration always proceeds in ascending order, we will always
\r
225 // encounter a null block at its beginning, and can skip over
\r
226 // a number of code points equal to the length of the block.
\r
227 if (value != initialValue) {
\r
230 cp += UTRIE2_DATA_BLOCK_LENGTH;
\r
232 // Current position refers to an ordinary data block.
\r
233 // Walk over the data entries, checking the values.
\r
234 int startIx = block + (cp & UTRIE2_DATA_MASK);
\r
235 int limitIx = block + UTRIE2_DATA_BLOCK_LENGTH;
\r
236 for (int ix = startIx; ix<limitIx; ix++) {
\r
237 if (index[ix] != value) {
\r
238 // We came to an entry with a different value.
\r
240 cp += (ix - startIx);
\r
244 // The ordinary data block contained our value until its end.
\r
245 // Advance the current code point, and continue the outerloop.
\r
246 cp += limitIx - startIx;
\r