2 *******************************************************************************
\r
3 * Copyright (C) 2009-2010, International Business Machines Corporation and
\r
4 * others. All Rights Reserved.
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.DataInputStream;
\r
10 import java.io.DataOutputStream;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.util.Iterator;
\r
14 import java.util.NoSuchElementException;
\r
18 * This is the interface and common implementation of a Unicode Trie2.
\r
19 * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff)
\r
20 * to 16- or 32-bit integer values. It works best when there are ranges of
\r
21 * characters with the same value, which is generally the case with Unicode
\r
22 * character properties.
\r
24 * This is the second common version of a Unicode trie (hence the name Trie2).
\r
27 public abstract class Trie2 implements Iterable<Trie2.Range> {
\r
31 * Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
\r
32 * The serialized format is identical between ICU4C and ICU4J, so this function
\r
33 * will work with serialized Trie2s from either.
\r
35 * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending
\r
36 * on the width of the data.
\r
38 * To obtain the width of the Trie2, check the actual class type of the returned Trie2.
\r
39 * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will
\r
40 * return only Tries of their specific type/size.
\r
42 * The serialized Trie2 on the stream may be in either little or big endian byte order.
\r
43 * This allows using serialized Tries from ICU4C without needing to consider the
\r
44 * byte order of the system that created them.
\r
46 * @param is an input stream to the serialized form of a UTrie2.
\r
47 * @return An unserialized Trie2, ready for use.
\r
48 * @throws IllegalArgumentException if the stream does not contain a serialized Trie2.
\r
49 * @throws IOException if a read error occurs on the InputStream.
\r
52 public static Trie2 createFromSerialized(InputStream is) throws IOException {
\r
53 // From ICU4C utrie2_impl.h
\r
54 // * Trie2 data structure in serialized form:
\r
56 // * UTrie2Header header;
\r
57 // * uint16_t index[header.index2Length];
\r
58 // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
\r
61 // typedef struct UTrie2Header {
\r
62 // /** "Tri2" in big-endian US-ASCII (0x54726932) */
\r
63 // uint32_t signature;
\r
66 // * options bit field:
\r
67 // * 15.. 4 reserved (0)
\r
68 // * 3.. 0 UTrie2ValueBits valueBits
\r
70 // uint16_t options;
\r
72 // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */
\r
73 // uint16_t indexLength;
\r
75 // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */
\r
76 // uint16_t shiftedDataLength;
\r
78 // /** Null index and data blocks, not shifted. */
\r
79 // uint16_t index2NullOffset, dataNullOffset;
\r
82 // * First code point of the single-value range ending with U+10ffff,
\r
83 // * rounded up and then shifted right by UTRIE2_SHIFT_1.
\r
85 // uint16_t shiftedHighStart;
\r
88 DataInputStream dis = new DataInputStream(is);
\r
89 boolean needByteSwap = false;
\r
91 UTrie2Header header = new UTrie2Header();
\r
93 /* check the signature */
\r
94 header.signature = dis.readInt();
\r
95 switch (header.signature) {
\r
97 needByteSwap = false;
\r
100 needByteSwap = true;
\r
101 header.signature = Integer.reverseBytes(header.signature);
\r
104 throw new IllegalArgumentException("Stream does not contain a serialized UTrie2");
\r
107 header.options = swapShort(needByteSwap, dis.readUnsignedShort());
\r
108 header.indexLength = swapShort(needByteSwap, dis.readUnsignedShort());
\r
109 header.shiftedDataLength = swapShort(needByteSwap, dis.readUnsignedShort());
\r
110 header.index2NullOffset = swapShort(needByteSwap, dis.readUnsignedShort());
\r
111 header.dataNullOffset = swapShort(needByteSwap, dis.readUnsignedShort());
\r
112 header.shiftedHighStart = swapShort(needByteSwap, dis.readUnsignedShort());
\r
114 // Trie2 data width - 0: 16 bits
\r
116 if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) > 1) {
\r
117 throw new IllegalArgumentException("UTrie2 serialized format error.");
\r
121 if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) == 0) {
\r
122 width = ValueWidth.BITS_16;
\r
123 This = new Trie2_16();
\r
125 width = ValueWidth.BITS_32;
\r
126 This = new Trie2_32();
\r
128 This.header = header;
\r
130 /* get the length values and offsets */
\r
131 This.indexLength = header.indexLength;
\r
132 This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT;
\r
133 This.index2NullOffset = header.index2NullOffset;
\r
134 This.dataNullOffset = header.dataNullOffset;
\r
135 This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1;
\r
136 This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY;
\r
137 if (width == ValueWidth.BITS_16) {
\r
138 This.highValueIndex += This.indexLength;
\r
141 // Allocate the Trie2 index array. If the data width is 16 bits, the array also
\r
142 // includes the space for the data.
\r
144 int indexArraySize = This.indexLength;
\r
145 if (width == ValueWidth.BITS_16) {
\r
146 indexArraySize += This.dataLength;
\r
148 This.index = new char[indexArraySize];
\r
150 /* Read in the index */
\r
152 for (i=0; i<This.indexLength; i++) {
\r
153 This.index[i] = swapChar(needByteSwap, dis.readChar());
\r
156 /* Read in the data. 16 bit data goes in the same array as the index.
\r
157 * 32 bit data goes in its own separate data array.
\r
159 if (width == ValueWidth.BITS_16) {
\r
160 This.data16 = This.indexLength;
\r
161 for (i=0; i<This.dataLength; i++) {
\r
162 This.index[This.data16 + i] = swapChar(needByteSwap, dis.readChar());
\r
165 This.data32 = new int[This.dataLength];
\r
166 for (i=0; i<This.dataLength; i++) {
\r
167 This.data32[i] = swapInt(needByteSwap, dis.readInt());
\r
173 This.data32 = null;
\r
174 This.initialValue = This.index[This.dataNullOffset];
\r
175 This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET];
\r
179 This.initialValue = This.data32[This.dataNullOffset];
\r
180 This.errorValue = This.data32[UTRIE2_BAD_UTF8_DATA_OFFSET];
\r
183 throw new IllegalArgumentException("UTrie2 serialized format error.");
\r
190 private static int swapShort(boolean needSwap, int value) {
\r
191 return needSwap? ((int)Short.reverseBytes((short)value)) & 0x0000ffff : value;
\r
194 private static char swapChar(boolean needSwap, char value) {
\r
195 return needSwap? (char)Short.reverseBytes((short)value) : value;
\r
198 private static int swapInt(boolean needSwap, int value) {
\r
199 return needSwap? Integer.reverseBytes(value) : value;
\r
204 * Get the UTrie version from an InputStream containing the serialized form
\r
205 * of either a Trie (version 1) or a Trie2 (version 2).
\r
207 * @param is an InputStream containing the serialized form
\r
208 * of a UTrie, version 1 or 2. The stream must support mark() and reset().
\r
209 * The position of the input stream will be left unchanged.
\r
210 * @param littleEndianOk If FALSE, only big-endian (Java native) serialized forms are recognized.
\r
211 * If TRUE, little-endian serialized forms are recognized as well.
\r
212 * @return the Trie version of the serialized form, or 0 if it is not
\r
213 * recognized as a serialized UTrie
\r
214 * @throws IOException on errors in reading from the input stream.
\r
216 public static int getVersion(InputStream is, boolean littleEndianOk) throws IOException {
\r
217 if (! is.markSupported()) {
\r
218 throw new IllegalArgumentException("Input stream must support mark().");
\r
221 byte sig[] = new byte[4];
\r
225 if (sig[0]=='T' && sig[1]=='r' && sig[2]=='i' && sig[3]=='e') {
\r
228 if (sig[0]=='T' && sig[1]=='r' && sig[2]=='i' && sig[3]=='2') {
\r
231 if (littleEndianOk) {
\r
232 if (sig[0]=='e' && sig[1]=='i' && sig[2]=='r' && sig[3]=='T') {
\r
235 if (sig[0]=='2' && sig[1]=='i' && sig[2]=='r' && sig[3]=='T') {
\r
244 * Get the value for a code point as stored in the Trie2.
\r
246 * @param codePoint the code point
\r
247 * @return the value
\r
249 abstract public int get(int codePoint);
\r
253 * Get the trie value for a UTF-16 code unit.
\r
255 * A Trie2 stores two distinct values for input in the lead surrogate
\r
256 * range, one for lead surrogates, which is the value that will be
\r
257 * returned by this function, and a second value that is returned
\r
260 * For code units outside of the lead surrogate range, this function
\r
261 * returns the same result as Trie2.get().
\r
263 * This function, together with the alternate value for lead surrogates,
\r
264 * makes possible very efficient processing of UTF-16 strings without
\r
265 * first converting surrogate pairs to their corresponding 32 bit code point
\r
268 * At build-time, enumerate the contents of the Trie2 to see if there
\r
269 * is non-trivial (non-initialValue) data for any of the supplementary
\r
270 * code points associated with a lead surrogate.
\r
271 * If so, then set a special (application-specific) value for the
\r
272 * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit().
\r
274 * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial
\r
275 * data and the code unit is a lead surrogate, then check if a trail surrogate
\r
276 * follows. If so, assemble the supplementary code point and look up its value
\r
277 * with Trie2.get(); otherwise reset the lead
\r
278 * surrogate's value or do a code point lookup for it.
\r
280 * If there is only trivial data for lead and trail surrogates, then processing
\r
281 * can often skip them. For example, in normalization or case mapping
\r
282 * all characters that do not have any mappings are simply copied as is.
\r
284 * @param c the code point or lead surrogate value.
\r
285 * @return the value
\r
287 abstract public int getFromU16SingleLead(char c);
\r
291 * Equals function. Two Tries are equal if their contents are equal.
\r
292 * The type need not be the same, so a Trie2Writable will be equal to
\r
293 * (read-only) Trie2_16 or Trie2_32 so long as they are storing the same values.
\r
296 public final boolean equals(Object other) {
\r
297 if(!(other instanceof Trie2)) {
\r
300 Trie2 OtherTrie = (Trie2)other;
\r
301 Range rangeFromOther;
\r
303 Iterator<Trie2.Range> otherIter = OtherTrie.iterator();
\r
304 for (Trie2.Range rangeFromThis: this) {
\r
305 if (otherIter.hasNext() == false) {
\r
308 rangeFromOther = otherIter.next();
\r
309 if (!rangeFromThis.equals(rangeFromOther)) {
\r
313 if (otherIter.hasNext()) {
\r
317 if (errorValue != OtherTrie.errorValue ||
\r
318 initialValue != OtherTrie.initialValue) {
\r
326 public int hashCode() {
\r
328 int hash = initHash();
\r
329 for (Range r: this) {
\r
330 hash = hashInt(hash, r.hashCode());
\r
341 * When iterating over the contents of a Trie2, Elements of this type are produced.
\r
342 * The iterator will return one item for each contiguous range of codepoints having the same value.
\r
344 * When iterating, the same Trie2EnumRange object will be reused and returned for each range.
\r
345 * If you need to retain complete iteration results, clone each returned Trie2EnumRange,
\r
346 * or save the range in some other way, before advancing to the next iteration step.
\r
348 public static class Range {
\r
349 public int startCodePoint;
\r
350 public int endCodePoint; // Inclusive.
\r
352 public boolean leadSurrogate;
\r
354 public boolean equals(Object other) {
\r
355 if (other == null || !(other.getClass().equals(getClass()))) {
\r
358 Range tother = (Range)other;
\r
359 return this.startCodePoint == tother.startCodePoint &&
\r
360 this.endCodePoint == tother.endCodePoint &&
\r
361 this.value == tother.value &&
\r
362 this.leadSurrogate == tother.leadSurrogate;
\r
366 public int hashCode() {
\r
367 int h = initHash();
\r
368 h = hashUChar32(h, startCodePoint);
\r
369 h = hashUChar32(h, endCodePoint);
\r
370 h = hashInt(h, value);
\r
371 h = hashByte(h, leadSurrogate? 1: 0);
\r
378 * Create an iterator over the value ranges in this Trie2.
\r
379 * Values from the Trie2 are not remapped or filtered, but are returned as they
\r
380 * are stored in the Trie2.
\r
382 * @return an Iterator
\r
384 public Iterator<Range> iterator() {
\r
385 return iterator(defaultValueMapper);
\r
388 private static ValueMapper defaultValueMapper = new ValueMapper() {
\r
389 public int map(int in) {
\r
395 * Create an iterator over the value ranges from this Trie2.
\r
396 * Values from the Trie2 are passed through a caller-supplied remapping function,
\r
397 * and it is the remapped values that determine the ranges that
\r
398 * will be produced by the iterator.
\r
401 * @param mapper provides a function to remap values obtained from the Trie2.
\r
402 * @return an Iterator
\r
404 public Iterator<Range> iterator(ValueMapper mapper) {
\r
405 return new Trie2Iterator(mapper);
\r
410 * Create an iterator over the Trie2 values for the 1024=0x400 code points
\r
411 * corresponding to a given lead surrogate.
\r
412 * For example, for the lead surrogate U+D87E it will enumerate the values
\r
413 * for [U+2F800..U+2FC00[.
\r
414 * Used by data builder code that sets special lead surrogate code unit values
\r
415 * for optimized UTF-16 string processing.
\r
417 * Do not modify the Trie2 during the iteration.
\r
419 * Except for the limited code point range, this functions just like Trie2.iterator().
\r
422 public Iterator<Range> iteratorForLeadSurrogate(char lead, ValueMapper mapper) {
\r
423 return new Trie2Iterator(lead, mapper);
\r
427 * Create an iterator over the Trie2 values for the 1024=0x400 code points
\r
428 * corresponding to a given lead surrogate.
\r
429 * For example, for the lead surrogate U+D87E it will enumerate the values
\r
430 * for [U+2F800..U+2FC00[.
\r
431 * Used by data builder code that sets special lead surrogate code unit values
\r
432 * for optimized UTF-16 string processing.
\r
434 * Do not modify the Trie2 during the iteration.
\r
436 * Except for the limited code point range, this functions just like Trie2.iterator().
\r
439 public Iterator<Range> iteratorForLeadSurrogate(char lead) {
\r
440 return new Trie2Iterator(lead, defaultValueMapper);
\r
444 * When iterating over the contents of a Trie2, an instance of TrieValueMapper may
\r
445 * be used to remap the values from the Trie2. The remapped values will be used
\r
446 * both in determining the ranges of codepoints and as the value to be returned
\r
449 * Example of use, with an anonymous subclass of TrieValueMapper:
\r
452 * ValueMapper m = new ValueMapper() {
\r
453 * int map(int in) {return in & 0x1f;};
\r
455 * for (Iterator<Trie2EnumRange> iter = trie.iterator(m); i.hasNext(); ) {
\r
456 * Trie2EnumRange r = i.next();
\r
457 * ... // Do something with the range r.
\r
461 public interface ValueMapper {
\r
462 public int map(int originalVal);
\r
467 * Serialize a trie2 Header and Index onto an OutputStream. This is
\r
468 * common code used for both the Trie2_16 and Trie2_32 serialize functions.
\r
469 * @param dos the stream to which the serialized Trie2 data will be written.
\r
470 * @return the number of bytes written.
\r
472 protected int serializeHeader(DataOutputStream dos) throws IOException {
\r
473 // Write the header. It is already set and ready to use, having been
\r
474 // created when the Trie2 was unserialized or when it was frozen.
\r
475 int bytesWritten = 0;
\r
477 dos.writeInt(header.signature);
\r
478 dos.writeShort(header.options);
\r
479 dos.writeShort(header.indexLength);
\r
480 dos.writeShort(header.shiftedDataLength);
\r
481 dos.writeShort(header.index2NullOffset);
\r
482 dos.writeShort(header.dataNullOffset);
\r
483 dos.writeShort(header.shiftedHighStart);
\r
484 bytesWritten += 16;
\r
488 for (i=0; i< header.indexLength; i++) {
\r
489 dos.writeChar(index[i]);
\r
491 bytesWritten += header.indexLength;
\r
492 return bytesWritten;
\r
497 * Struct-like class for holding the results returned by a UTrie2 CharSequence iterator.
\r
498 * The iteration walks over a CharSequence, and for each Unicode code point therein
\r
499 * returns the character and its associated Trie2 value.
\r
501 public static class CharSequenceValues {
\r
502 /** string index of the current code point. */
\r
504 /** The code point at index. */
\r
505 public int codePoint;
\r
506 /** The Trie2 value for the current code point */
\r
512 * Create an iterator that will produce the values from the Trie2 for
\r
513 * the sequence of code points in an input text.
\r
515 * @param text A text string to be iterated over.
\r
516 * @param index The starting iteration position within the input text.
\r
517 * @return the CharSequenceIterator
\r
519 public CharSequenceIterator charSequenceIterator(CharSequence text, int index) {
\r
520 return new CharSequenceIterator(text, index);
\r
523 // TODO: Survey usage of the equivalent of CharSequenceIterator in ICU4C
\r
524 // and if there is none, remove it from here.
\r
525 // Don't waste time testing and maintaining unused code.
\r
528 * An iterator that operates over an input CharSequence, and for each Unicode code point
\r
529 * in the input returns the associated value from the Trie2.
\r
531 * The iterator can move forwards or backwards, and can be reset to an arbitrary index.
\r
533 * Note that Trie2_16 and Trie2_32 subclass Trie2.CharSequenceIterator. This is done
\r
534 * only for performance reasons. It does require that any changes made here be propagated
\r
535 * into the corresponding code in the subclasses.
\r
537 public class CharSequenceIterator implements Iterator<CharSequenceValues> {
\r
539 * Internal constructor.
\r
541 CharSequenceIterator(CharSequence t, int index) {
\r
543 textLength = text.length();
\r
547 private CharSequence text;
\r
548 private int textLength;
\r
550 private Trie2.CharSequenceValues fResults = new Trie2.CharSequenceValues();
\r
553 public void set(int i) {
\r
554 if (i < 0 || i > textLength) {
\r
555 throw new IndexOutOfBoundsException();
\r
561 public final boolean hasNext() {
\r
562 return index<textLength;
\r
566 public final boolean hasPrevious() {
\r
571 public Trie2.CharSequenceValues next() {
\r
572 int c = Character.codePointAt(text, index);
\r
575 fResults.index = index;
\r
576 fResults.codePoint = c;
\r
577 fResults.value = val;
\r
579 if (c >= 0x10000) {
\r
586 public Trie2.CharSequenceValues previous() {
\r
587 int c = Character.codePointBefore(text, index);
\r
590 if (c >= 0x10000) {
\r
593 fResults.index = index;
\r
594 fResults.codePoint = c;
\r
595 fResults.value = val;
\r
600 * Iterator.remove() is not supported by Trie2.CharSequenceIterator.
\r
601 * @throws UnsupportedOperationException Always thrown because this operation is not supported
\r
602 * @see java.util.Iterator#remove()
\r
604 public void remove() {
\r
605 throw new UnsupportedOperationException("Trie2.CharSequenceIterator does not support remove().");
\r
610 //--------------------------------------------------------------------------------
\r
612 // Below this point are internal implementation items. No further public API.
\r
614 //--------------------------------------------------------------------------------
\r
618 * Selectors for the width of a UTrie2 data value.
\r
626 * Trie2 data structure in serialized form:
\r
628 * UTrie2Header header;
\r
629 * uint16_t index[header.index2Length];
\r
630 * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
\r
632 * For Java, this is read from the stream into an instance of UTrie2Header.
\r
633 * (The C version just places a struct over the raw serialized data.)
\r
637 static class UTrie2Header {
\r
638 /** "Tri2" in big-endian US-ASCII (0x54726932) */
\r
642 * options bit field (uint16_t):
\r
643 * 15.. 4 reserved (0)
\r
644 * 3.. 0 UTrie2ValueBits valueBits
\r
648 /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */
\r
651 /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */
\r
652 int shiftedDataLength;
\r
654 /** Null index and data blocks, not shifted. (uint16_t) */
\r
655 int index2NullOffset, dataNullOffset;
\r
658 * First code point of the single-value range ending with U+10ffff,
\r
659 * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t)
\r
661 int shiftedHighStart;
\r
665 // Data members of UTrie2.
\r
667 UTrie2Header header;
\r
668 char index[]; // Index array. Includes data for 16 bit Tries.
\r
669 int data16; // Offset to data portion of the index array, if 16 bit data.
\r
670 // zero if 32 bit data.
\r
671 int data32[]; // NULL if 16b data is used via index
\r
675 int index2NullOffset; // 0xffff if there is no dedicated index-2 null block
\r
678 /** Value returned for out-of-range code points and illegal UTF-8. */
\r
681 /* Start of the last range which ends at U+10ffff, and its value. */
\r
683 int highValueIndex;
\r
685 int dataNullOffset;
\r
687 int fHash; // Zero if not yet computed.
\r
688 // Shared by Trie2Writable, Trie2_16, Trie2_32.
\r
689 // Thread safety: if two racing threads compute
\r
690 // the same hash on a frozen Trie2, no damage is done.
\r
694 * Trie2 constants, defining shift widths, index array lengths, etc.
\r
696 * These are needed for the runtime macros but users can treat these as
\r
697 * implementation details and skip to the actual public API further below.
\r
700 static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f;
\r
703 /** Shift size for getting the index-1 table offset. */
\r
704 static final int UTRIE2_SHIFT_1=6+5;
\r
706 /** Shift size for getting the index-2 table offset. */
\r
707 static final int UTRIE2_SHIFT_2=5;
\r
710 * Difference between the two shift sizes,
\r
711 * for getting an index-1 offset from an index-2 offset. 6=11-5
\r
713 static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2;
\r
716 * Number of index-1 entries for the BMP. 32=0x20
\r
717 * This part of the index-1 table is omitted from the serialized form.
\r
719 static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1;
\r
721 /** Number of code points per index-1 table entry. 2048=0x800 */
\r
722 static final int UTRIE2_CP_PER_INDEX_1_ENTRY=1<<UTRIE2_SHIFT_1;
\r
724 /** Number of entries in an index-2 block. 64=0x40 */
\r
725 static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2;
\r
727 /** Mask for getting the lower bits for the in-index-2-block offset. */
\r
728 static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
\r
730 /** Number of entries in a data block. 32=0x20 */
\r
731 static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2;
\r
733 /** Mask for getting the lower bits for the in-data-block offset. */
\r
734 static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1;
\r
737 * Shift size for shifting left the index array values.
\r
738 * Increases possible data size with 16-bit index values at the cost
\r
739 * of compactability.
\r
740 * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
\r
742 static final int UTRIE2_INDEX_SHIFT=2;
\r
744 /** The alignment size of a data block. Also the granularity for compaction. */
\r
745 static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT;
\r
747 /* Fixed layout of the first part of the index array. ------------------- */
\r
750 * The BMP part of the index-2 table is fixed and linear and starts at offset 0.
\r
751 * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2.
\r
753 static final int UTRIE2_INDEX_2_OFFSET=0;
\r
756 * The part of the index-2 table for U+D800..U+DBFF stores values for
\r
757 * lead surrogate code _units_ not code _points_.
\r
758 * Values for lead surrogate code _points_ are indexed with this portion of the table.
\r
759 * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
\r
761 static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2;
\r
762 static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2;
\r
764 /** Count the lengths of both BMP pieces. 2080=0x820 */
\r
765 static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH;
\r
768 * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
\r
769 * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
\r
771 static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH;
\r
772 static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */
\r
775 * The index-1 table, only used for supplementary code points, at offset 2112=0x840.
\r
776 * Variable length, for code points up to highStart, where the last single-value range starts.
\r
777 * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.
\r
778 * (For 0x100000 supplementary code points U+10000..U+10ffff.)
\r
780 * The part of the index-2 table for supplementary code points starts
\r
781 * after this index-1 table.
\r
783 * Both the index-1 table and the following part of the index-2 table
\r
784 * are omitted completely if there is only BMP data.
\r
786 static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH;
\r
787 static final int UTRIE2_MAX_INDEX_1_LENGTH=0x100000>>UTRIE2_SHIFT_1;
\r
790 * Fixed layout of the first part of the data array. -----------------------
\r
791 * Starts with 4 blocks (128=0x80 entries) for ASCII.
\r
795 * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
\r
796 * Used with linear access for single bytes 0..0xbf for simple error handling.
\r
797 * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.
\r
799 static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80;
\r
801 /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. */
\r
802 static final int UTRIE2_DATA_START_OFFSET=0xc0;
\r
804 /* Building a Trie2 ---------------------------------------------------------- */
\r
807 * These definitions are mostly needed by utrie2_builder.c, but also by
\r
808 * utrie2_get32() and utrie2_enum().
\r
812 * At build time, leave a gap in the index-2 table,
\r
813 * at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table
\r
814 * and the supplementary index-1 table.
\r
815 * Round up to UTRIE2_INDEX_2_BLOCK_LENGTH for proper compacting.
\r
817 static final int UNEWTRIE2_INDEX_GAP_OFFSET = UTRIE2_INDEX_2_BMP_LENGTH;
\r
818 static final int UNEWTRIE2_INDEX_GAP_LENGTH =
\r
819 ((UTRIE2_UTF8_2B_INDEX_2_LENGTH + UTRIE2_MAX_INDEX_1_LENGTH) + UTRIE2_INDEX_2_MASK) &
\r
820 ~UTRIE2_INDEX_2_MASK;
\r
823 * Maximum length of the build-time index-2 array.
\r
824 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE2_SHIFT_2,
\r
825 * plus the part of the index-2 table for lead surrogate code points,
\r
826 * plus the build-time index gap,
\r
827 * plus the null index-2 block.
\r
829 static final int UNEWTRIE2_MAX_INDEX_2_LENGTH=
\r
830 (0x110000>>UTRIE2_SHIFT_2)+
\r
831 UTRIE2_LSCP_INDEX_2_LENGTH+
\r
832 UNEWTRIE2_INDEX_GAP_LENGTH+
\r
833 UTRIE2_INDEX_2_BLOCK_LENGTH;
\r
835 static final int UNEWTRIE2_INDEX_1_LENGTH = 0x110000>>UTRIE2_SHIFT_1;
\r
838 * Maximum length of the build-time data array.
\r
839 * One entry per 0x110000 code points, plus the illegal-UTF-8 block and the null block,
\r
840 * plus values for the 0x400 surrogate code units.
\r
842 static final int UNEWTRIE2_MAX_DATA_LENGTH = (0x110000+0x40+0x40+0x400);
\r
847 * Implementation class for an iterator over a Trie2.
\r
849 * Iteration over a Trie2 first returns all of the ranges that are indexed by code points,
\r
850 * then returns the special alternate values for the lead surrogates
\r
854 class Trie2Iterator implements Iterator<Range> {
\r
855 // The normal constructor that configures the iterator to cover the complete
\r
856 // contents of the Trie2
\r
857 Trie2Iterator(ValueMapper vm) {
\r
860 limitCP = 0x110000;
\r
861 doLeadSurrogates = true;
\r
864 // An alternate constructor that configures the iterator to cover only the
\r
865 // code points corresponding to a particular Lead Surrogate value.
\r
866 Trie2Iterator(char leadSurrogate, ValueMapper vm) {
\r
867 if (leadSurrogate < 0xd800 || leadSurrogate > 0xdbff) {
\r
868 throw new IllegalArgumentException("Bad lead surrogate value.");
\r
871 nextStart = (leadSurrogate - 0xd7c0) << 10;
\r
872 limitCP = nextStart + 0x400;
\r
873 doLeadSurrogates = false; // Do not iterate over lead the special lead surrogate
\r
874 // values after completing iteration over code points.
\r
878 * The main next() function for Trie2 iterators
\r
881 public Range next() {
\r
883 throw new NoSuchElementException();
\r
885 if (nextStart >= limitCP) {
\r
886 // Switch over from iterating normal code point values to
\r
887 // doing the alternate lead-surrogate values.
\r
888 doingCodePoints = false;
\r
889 nextStart = 0xd800;
\r
891 int endOfRange = 0;
\r
895 if (doingCodePoints) {
\r
896 // Iteration over code point values.
\r
897 val = get(nextStart);
\r
898 mappedVal = mapper.map(val);
\r
899 endOfRange = rangeEnd(nextStart, limitCP, val);
\r
900 // Loop once for each range in the Trie2 with the same raw (unmapped) value.
\r
901 // Loop continues so long as the mapped values are the same.
\r
903 if (endOfRange >= limitCP-1) {
\r
906 val = get(endOfRange+1);
\r
907 if (mapper.map(val) != mappedVal) {
\r
910 endOfRange = rangeEnd(endOfRange+1, limitCP, val);
\r
913 // Iteration over the alternate lead surrogate values.
\r
914 val = getFromU16SingleLead((char)nextStart);
\r
915 mappedVal = mapper.map(val);
\r
916 endOfRange = rangeEndLS((char)nextStart);
\r
917 // Loop once for each range in the Trie2 with the same raw (unmapped) value.
\r
918 // Loop continues so long as the mapped values are the same.
\r
920 if (endOfRange >= 0xdbff) {
\r
923 val = getFromU16SingleLead((char)(endOfRange+1));
\r
924 if (mapper.map(val) != mappedVal) {
\r
927 endOfRange = rangeEndLS((char)(endOfRange+1));
\r
930 returnValue.startCodePoint = nextStart;
\r
931 returnValue.endCodePoint = endOfRange;
\r
932 returnValue.value = mappedVal;
\r
933 returnValue.leadSurrogate = !doingCodePoints;
\r
934 nextStart = endOfRange+1;
\r
935 return returnValue;
\r
941 public boolean hasNext() {
\r
942 return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00;
\r
945 public void remove() {
\r
946 throw new UnsupportedOperationException();
\r
951 * Find the last lead surrogate in a contiguous range with the
\r
952 * same Trie2 value as the input character.
\r
954 * Use the alternate Lead Surrogate values from the Trie2,
\r
955 * not the code-point values.
\r
957 * Note: Trie2_16 and Trie2_32 override this implementation with optimized versions,
\r
958 * meaning that the implementation here is only being used with
\r
959 * Trie2Writable. The code here is logically correct with any type
\r
960 * of Trie2, however.
\r
962 * @param c The character to begin with.
\r
963 * @return The last contiguous character with the same value.
\r
965 private int rangeEndLS(char startingLS) {
\r
966 if (startingLS >= 0xdbff) {
\r
971 int val = getFromU16SingleLead(startingLS);
\r
972 for (c = startingLS+1; c <= 0x0dbff; c++) {
\r
973 if (getFromU16SingleLead((char)c) != val) {
\r
981 // Iteration State Variables
\r
983 private ValueMapper mapper;
\r
984 private Range returnValue = new Range();
\r
985 // The starting code point for the next range to be returned.
\r
986 private int nextStart;
\r
987 // The upper limit for the last normal range to be returned. Normally 0x110000, but
\r
988 // may be lower when iterating over the code points for a single lead surrogate.
\r
989 private int limitCP;
\r
991 // True while iterating over the the Trie2 values for code points.
\r
992 // False while iterating over the alternate values for lead surrogates.
\r
993 private boolean doingCodePoints = true;
\r
995 // True if the iterator should iterate the special values for lead surrogates in
\r
996 // addition to the normal values for code points.
\r
997 private boolean doLeadSurrogates = true;
\r
1001 * Find the last character in a contiguous range of characters with the
\r
1002 * same Trie2 value as the input character.
\r
1004 * @param c The character to begin with.
\r
1005 * @return The last contiguous character with the same value.
\r
1007 int rangeEnd(int start, int limitp, int val) {
\r
1009 int limit = Math.min(highStart, limitp);
\r
1011 for (c = start+1; c < limit; c++) {
\r
1012 if (get(c) != val) {
\r
1016 if (c >= highStart) {
\r
1024 // Hashing implementation functions. FNV hash. Respected public domain algorithm.
\r
1026 private static int initHash() {
\r
1027 return 0x811c9DC5; // unsigned 2166136261
\r
1030 private static int hashByte(int h, int b) {
\r
1036 private static int hashUChar32(int h, int c) {
\r
1037 h = Trie2.hashByte(h, c & 255);
\r
1038 h = Trie2.hashByte(h, (c>>8) & 255);
\r
1039 h = Trie2.hashByte(h, c>>16);
\r
1043 private static int hashInt(int h, int i) {
\r
1044 h = Trie2.hashByte(h, i & 255);
\r
1045 h = Trie2.hashByte(h, (i>>8) & 255);
\r
1046 h = Trie2.hashByte(h, (i>>16) & 255);
\r
1047 h = Trie2.hashByte(h, (i>>24) & 255);
\r