2 *******************************************************************************
3 * Copyright (C) 2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * created on: 2011jan06
7 * created by: Markus W. Scherer
8 * ported from ICU4C ucharstrie.h/.cpp
11 package com.ibm.icu.util;
13 import java.io.IOException;
14 import java.util.ArrayList;
15 import java.util.NoSuchElementException;
17 import com.ibm.icu.text.UTF16;
18 import com.ibm.icu.util.BytesTrie.Result;
21 * Light-weight, non-const reader class for a CharsTrie.
22 * Traverses a char-serialized data structure with minimal state,
23 * for mapping strings (16-bit-unit sequences) to non-negative integer values.
25 * <p>This class is not intended for public subclassing.
28 * @provisional This API might change or be removed in a future release.
29 * @author Markus W. Scherer
31 public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> {
33 * Constructs a CharsTrie reader instance.
35 * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder,
36 * with the offset indicating the first char of that sequence.
37 * The CharsTrie object will not read more chars than
38 * the CharsTrieBuilder generated in the corresponding build() call.
40 * <p>The CharSequence is not copied/cloned and must not be modified while
41 * the CharsTrie object is in use.
43 * @param trieChars CharSequence that contains the serialized trie.
44 * @param offset Root offset of the trie in the CharSequence.
46 * @provisional This API might change or be removed in a future release.
48 public CharsTrie(CharSequence trieChars, int offset) {
51 remainingMatchLength_=-1;
55 * Clones this trie reader object and its state,
56 * but not the char array which will be shared.
57 * @return A shallow clone of this trie.
59 * @provisional This API might change or be removed in a future release.
62 public Object clone() throws CloneNotSupportedException {
63 return super.clone(); // A shallow copy is just what we need.
67 * Resets this trie to its initial state.
70 * @provisional This API might change or be removed in a future release.
72 public CharsTrie reset() {
74 remainingMatchLength_=-1;
79 * CharsTrie state object, for saving a trie's current state
80 * and resetting the trie back to this state later.
82 * @provisional This API might change or be removed in a future release.
84 public static final class State {
86 * Constructs an empty State.
88 * @provisional This API might change or be removed in a future release.
91 private CharSequence chars;
94 private int remainingMatchLength;
98 * Saves the state of this trie.
99 * @param state The State object to hold the trie's state.
103 * @provisional This API might change or be removed in a future release.
105 public CharsTrie saveState(State state) /*const*/ {
109 state.remainingMatchLength=remainingMatchLength_;
114 * Resets this trie to the saved state.
115 * @param state The State object which holds a saved trie state.
117 * @throws IllegalArgumentException if the state object contains no state,
118 * or the state of a different trie
122 * @provisional This API might change or be removed in a future release.
124 public CharsTrie resetToState(State state) {
125 if(chars_==state.chars && chars_!=null && root_==state.root) {
127 remainingMatchLength_=state.remainingMatchLength;
129 throw new IllegalArgumentException("incompatible trie state");
135 * Determines whether the string so far matches, whether it has a value,
136 * and whether another input char can continue a matching string.
137 * @return The match/value Result.
139 * @provisional This API might change or be removed in a future release.
141 public Result current() /*const*/ {
144 return Result.NO_MATCH;
147 return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
148 valueResults_[node>>15] : Result.NO_VALUE;
153 * Traverses the trie from the initial state for this input char.
154 * Equivalent to reset().next(inUnit).
155 * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
156 * @return The match/value Result.
158 * @provisional This API might change or be removed in a future release.
160 public Result first(int inUnit) {
161 remainingMatchLength_=-1;
162 return nextImpl(root_, inUnit);
166 * Traverses the trie from the initial state for the
167 * one or two UTF-16 code units for this input code point.
168 * Equivalent to reset().nextForCodePoint(cp).
169 * @param cp A Unicode code point 0..0x10ffff.
170 * @return The match/value Result.
172 * @provisional This API might change or be removed in a future release.
174 public Result firstForCodePoint(int cp) {
177 (first(UTF16.getLeadSurrogate(cp)).hasNext() ?
178 next(UTF16.getTrailSurrogate(cp)) :
183 * Traverses the trie from the current state for this input char.
184 * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
185 * @return The match/value Result.
187 * @provisional This API might change or be removed in a future release.
189 public Result next(int inUnit) {
192 return Result.NO_MATCH;
194 int length=remainingMatchLength_; // Actual remaining match length minus 1.
196 // Remaining part of a linear-match node.
197 if(inUnit==chars_.charAt(pos++)) {
198 remainingMatchLength_=--length;
201 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
202 valueResults_[node>>15] : Result.NO_VALUE;
205 return Result.NO_MATCH;
208 return nextImpl(pos, inUnit);
212 * Traverses the trie from the current state for the
213 * one or two UTF-16 code units for this input code point.
214 * @param cp A Unicode code point 0..0x10ffff.
215 * @return The match/value Result.
217 * @provisional This API might change or be removed in a future release.
219 public Result nextForCodePoint(int cp) {
222 (next(UTF16.getLeadSurrogate(cp)).hasNext() ?
223 next(UTF16.getTrailSurrogate(cp)) :
228 * Traverses the trie from the current state for this string.
231 * Result result=current();
233 * if(!result.hasNext()) return Result.NO_MATCH;
237 * @param s Contains a string.
238 * @param sIndex The start index of the string in s.
239 * @param sLimit The (exclusive) end index of the string in s.
240 * @return The match/value Result.
242 * @provisional This API might change or be removed in a future release.
244 public Result next(CharSequence s, int sIndex, int sLimit) {
251 return Result.NO_MATCH;
253 int length=remainingMatchLength_; // Actual remaining match length minus 1.
255 // Fetch the next input unit, if there is one.
256 // Continue a linear-match node.
260 remainingMatchLength_=length;
263 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
264 valueResults_[node>>15] : Result.NO_VALUE;
266 inUnit=s.charAt(sIndex++);
268 remainingMatchLength_=length;
271 if(inUnit!=chars_.charAt(pos)) {
273 return Result.NO_MATCH;
278 int node=chars_.charAt(pos++);
280 if(node<kMinLinearMatch) {
281 Result result=branchNext(pos, node, inUnit);
282 if(result==Result.NO_MATCH) {
283 return Result.NO_MATCH;
285 // Fetch the next input unit, if there is one.
289 if(result==Result.FINAL_VALUE) {
290 // No further matching units.
292 return Result.NO_MATCH;
294 inUnit=s.charAt(sIndex++);
295 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
296 node=chars_.charAt(pos++);
297 } else if(node<kMinValueLead) {
298 // Match length+1 units.
299 length=node-kMinLinearMatch; // Actual match length minus 1.
300 if(inUnit!=chars_.charAt(pos)) {
302 return Result.NO_MATCH;
307 } else if((node&kValueIsFinal)!=0) {
308 // No further matching units.
310 return Result.NO_MATCH;
312 // Skip intermediate value.
313 pos=skipNodeValue(pos, node);
321 * Returns a matching string's value if called immediately after
322 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
323 * getValue() can be called multiple times.
325 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
326 * @return The value for the string so far.
328 * @provisional This API might change or be removed in a future release.
330 public int getValue() /*const*/ {
332 int leadUnit=chars_.charAt(pos++);
333 assert(leadUnit>=kMinValueLead);
334 return (leadUnit&kValueIsFinal)!=0 ?
335 readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit);
339 * Determines whether all strings reachable from the current state
340 * map to the same value, and if so, returns that value.
341 * @return The unique value in bits 32..1 with bit 0 set,
342 * if all strings reachable from the current state
343 * map to the same value; otherwise returns 0.
345 * @provisional This API might change or be removed in a future release.
347 public long getUniqueValue() /*const*/ {
352 // Skip the rest of a pending linear-match node.
353 long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0);
354 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
355 return (uniqueValue<<31)>>31;
359 * Finds each char which continues the string from the current state.
360 * That is, each char c for which it would be next(c)!=Result.NO_MATCH now.
361 * @param out Each next char is appended to this object.
362 * (Only uses the out.append(c) method.)
363 * @return The number of chars which continue the string from here.
365 * @provisional This API might change or be removed in a future release.
367 public int getNextChars(Appendable out) /*const*/ {
372 if(remainingMatchLength_>=0) {
373 append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node.
376 int node=chars_.charAt(pos++);
377 if(node>=kMinValueLead) {
378 if((node&kValueIsFinal)!=0) {
381 pos=skipNodeValue(pos, node);
385 if(node<kMinLinearMatch) {
387 node=chars_.charAt(pos++);
389 getNextBranchChars(chars_, pos, ++node, out);
392 // First unit of the linear-match node.
393 append(out, chars_.charAt(pos));
399 * Iterates from the current state of this trie.
400 * @return A new CharsTrie.Iterator.
402 * @provisional This API might change or be removed in a future release.
404 public Iterator iterator() {
405 return new Iterator(chars_, pos_, remainingMatchLength_, 0);
409 * Iterates from the current state of this trie.
410 * @param maxStringLength If 0, the iterator returns full strings.
411 * Otherwise, the iterator returns strings with this maximum length.
412 * @return A new CharsTrie.Iterator.
414 * @provisional This API might change or be removed in a future release.
416 public Iterator iterator(int maxStringLength) {
417 return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength);
421 * Iterates from the root of a char-serialized BytesTrie.
422 * @param trieChars CharSequence that contains the serialized trie.
423 * @param offset Root offset of the trie in the CharSequence.
424 * @param maxStringLength If 0, the iterator returns full strings.
425 * Otherwise, the iterator returns strings with this maximum length.
426 * @return A new CharsTrie.Iterator.
428 * @provisional This API might change or be removed in a future release.
430 public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) {
431 return new Iterator(trieChars, offset, -1, maxStringLength);
435 * Return value type for the Iterator.
437 * @provisional This API might change or be removed in a future release.
439 public static final class Entry {
443 * @provisional This API might change or be removed in a future release.
445 public CharSequence chars;
447 * The value associated with the string.
449 * @provisional This API might change or be removed in a future release.
458 * Iterator for all of the (string, value) pairs in a CharsTrie.
460 * @provisional This API might change or be removed in a future release.
462 public static final class Iterator implements java.util.Iterator<Entry> {
463 private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
465 pos_=initialPos_=offset;
466 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
467 maxLength_=maxStringLength;
468 int length=remainingMatchLength_; // Actual remaining match length minus 1.
470 // Pending linear-match node, append remaining bytes to str_.
472 if(maxLength_>0 && length>maxLength_) {
473 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal.
475 str_.append(chars_, pos_, pos_+length);
477 remainingMatchLength_-=length;
482 * Resets this iterator to its initial state.
485 * @provisional This API might change or be removed in a future release.
487 public Iterator reset() {
489 remainingMatchLength_=initialRemainingMatchLength_;
491 int length=remainingMatchLength_+1; // Remaining match length.
492 if(maxLength_>0 && length>maxLength_) {
495 str_.setLength(length);
497 remainingMatchLength_-=length;
503 * @return true if there are more elements.
505 * @provisional This API might change or be removed in a future release.
507 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
510 * Finds the next (string, value) pair if there is one.
512 * If the string is truncated to the maximum length and does not
513 * have a real value, then the value is set to -1.
514 * In this case, this "not a real value" is indistinguishable from
515 * a real value of -1.
516 * @return An Entry with the string and value of the next element.
517 * @throws NoSuchElementException - iteration has no more elements.
519 * @provisional This API might change or be removed in a future release.
521 public Entry next() {
524 if(stack_.isEmpty()) {
525 throw new NoSuchElementException();
527 // Pop the state off the stack and continue with the next outbound edge of
529 long top=stack_.remove(stack_.size()-1);
532 str_.setLength(length&0xffff);
535 pos=branchNext(pos, length);
537 return entry_; // Reached a final value.
540 str_.append(chars_.charAt(pos++));
543 if(remainingMatchLength_>=0) {
544 // We only get here if we started in a pending linear-match node
545 // with more than maxLength remaining units.
546 return truncateAndStop();
549 int node=chars_.charAt(pos++);
550 if(node>=kMinValueLead) {
552 pos=skipNodeValue(pos, node);
556 // Deliver value for the string so far.
557 boolean isFinal=(node&kValueIsFinal)!=0;
559 entry_.value=readValue(chars_, pos, node&0x7fff);
561 entry_.value=readNodeValue(chars_, pos, node);
563 if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
566 // We cannot skip the value right here because it shares its
567 // lead unit with a match node which we have to evaluate
569 // Instead, keep pos_ on the node lead unit itself.
577 if(maxLength_>0 && str_.length()==maxLength_) {
578 return truncateAndStop();
580 if(node<kMinLinearMatch) {
582 node=chars_.charAt(pos++);
584 pos=branchNext(pos, node+1);
586 return entry_; // Reached a final value.
589 // Linear-match node, append length units to str_.
590 int length=node-kMinLinearMatch+1;
591 if(maxLength_>0 && str_.length()+length>maxLength_) {
592 str_.append(chars_, pos, pos+maxLength_-str_.length());
593 return truncateAndStop();
595 str_.append(chars_, pos, pos+length);
602 * Iterator.remove() is not supported.
603 * @throws UnsupportedOperationException (always)
605 * @provisional This API might change or be removed in a future release.
607 public void remove() {
608 throw new UnsupportedOperationException();
611 private Entry truncateAndStop() {
613 // We reset entry_.chars every time we return entry_
614 // just because the caller might have modified the Entry.
616 entry_.value=-1; // no real value for str
620 private int branchNext(int pos, int length) {
621 while(length>kMaxBranchLinearSubNodeLength) {
622 ++pos; // ignore the comparison unit
623 // Push state for the greater-or-equal edge.
624 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length());
625 // Follow the less-than edge.
627 pos=jumpByDelta(chars_, pos);
629 // List of key-value pairs where values are either final values or jump deltas.
630 // Read the first (key, value) pair.
631 char trieUnit=chars_.charAt(pos++);
632 int node=chars_.charAt(pos++);
633 boolean isFinal=(node&kValueIsFinal)!=0;
634 int value=readValue(chars_, pos, node&=0x7fff);
635 pos=skipValue(pos, node);
636 stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length());
637 str_.append(trieUnit);
648 private CharSequence chars_;
650 private int initialPos_;
651 private int remainingMatchLength_;
652 private int initialRemainingMatchLength_;
653 private boolean skipValue_; // Skip intermediate value which was already delivered.
655 private StringBuilder str_=new StringBuilder();
656 private int maxLength_;
657 private Entry entry_=new Entry();
659 // The stack stores longs for backtracking to another
660 // outbound edge of a branch node.
661 // Each long has the offset in chars_ in bits 62..32,
662 // the str_.length() from before the node in bits 15..0,
663 // and the remaining branch length in bits 31..16.
664 // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
665 // but the code looks more confusing that way.)
666 private ArrayList<Long> stack_=new ArrayList<Long>();
669 private void stop() {
673 // Reads a compact 32-bit integer.
674 // pos is already after the leadUnit, and the lead unit has bit 15 reset.
675 private static int readValue(CharSequence chars, int pos, int leadUnit) {
677 if(leadUnit<kMinTwoUnitValueLead) {
679 } else if(leadUnit<kThreeUnitValueLead) {
680 value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos);
682 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
686 private static int skipValue(int pos, int leadUnit) {
687 if(leadUnit>=kMinTwoUnitValueLead) {
688 if(leadUnit<kThreeUnitValueLead) {
696 private static int skipValue(CharSequence chars, int pos) {
697 int leadUnit=chars.charAt(pos++);
698 return skipValue(pos, leadUnit&0x7fff);
701 private static int readNodeValue(CharSequence chars, int pos, int leadUnit) {
702 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
704 if(leadUnit<kMinTwoUnitNodeValueLead) {
705 value=(leadUnit>>6)-1;
706 } else if(leadUnit<kThreeUnitNodeValueLead) {
707 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos);
709 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
713 private static int skipNodeValue(int pos, int leadUnit) {
714 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
715 if(leadUnit>=kMinTwoUnitNodeValueLead) {
716 if(leadUnit<kThreeUnitNodeValueLead) {
725 private static int jumpByDelta(CharSequence chars, int pos) {
726 int delta=chars.charAt(pos++);
727 if(delta>=kMinTwoUnitDeltaLead) {
728 if(delta==kThreeUnitDeltaLead) {
729 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
732 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++);
738 private static int skipDelta(CharSequence chars, int pos) {
739 int delta=chars.charAt(pos++);
740 if(delta>=kMinTwoUnitDeltaLead) {
741 if(delta==kThreeUnitDeltaLead) {
750 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
752 // Handles a branch node for both next(unit) and next(string).
753 private Result branchNext(int pos, int length, int inUnit) {
754 // Branch according to the current unit.
756 length=chars_.charAt(pos++);
759 // The length of the branch is the number of units to select from.
760 // The data structure encodes a binary search.
761 while(length>kMaxBranchLinearSubNodeLength) {
762 if(inUnit<chars_.charAt(pos++)) {
764 pos=jumpByDelta(chars_, pos);
766 length=length-(length>>1);
767 pos=skipDelta(chars_, pos);
770 // Drop down to linear search for the last few units.
771 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
772 // and divides length by 2.
774 if(inUnit==chars_.charAt(pos++)) {
776 int node=chars_.charAt(pos);
777 if((node&kValueIsFinal)!=0) {
778 // Leave the final value for getValue() to read.
779 result=Result.FINAL_VALUE;
781 // Use the non-final value as the jump delta.
783 // int delta=readValue(pos, node);
785 if(node<kMinTwoUnitValueLead) {
787 } else if(node<kThreeUnitValueLead) {
788 delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++);
790 delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1);
795 node=chars_.charAt(pos);
796 result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
802 pos=skipValue(chars_, pos);
804 if(inUnit==chars_.charAt(pos++)) {
806 int node=chars_.charAt(pos);
807 return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
810 return Result.NO_MATCH;
814 // Requires remainingLength_<0.
815 private Result nextImpl(int pos, int inUnit) {
816 int node=chars_.charAt(pos++);
818 if(node<kMinLinearMatch) {
819 return branchNext(pos, node, inUnit);
820 } else if(node<kMinValueLead) {
821 // Match the first of length+1 units.
822 int length=node-kMinLinearMatch; // Actual match length minus 1.
823 if(inUnit==chars_.charAt(pos++)) {
824 remainingMatchLength_=--length;
826 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
827 valueResults_[node>>15] : Result.NO_VALUE;
832 } else if((node&kValueIsFinal)!=0) {
833 // No further matching units.
836 // Skip intermediate value.
837 pos=skipNodeValue(pos, node);
842 return Result.NO_MATCH;
845 // Helper functions for getUniqueValue().
846 // Recursively finds a unique value (or whether there is not a unique one)
848 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
849 // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
850 private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
852 while(length>kMaxBranchLinearSubNodeLength) {
853 ++pos; // ignore the comparison unit
854 uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue);
858 length=length-(length>>1);
859 pos=skipDelta(chars, pos);
862 ++pos; // ignore a comparison unit
864 int node=chars.charAt(pos++);
865 boolean isFinal=(node&kValueIsFinal)!=0;
867 int value=readValue(chars, pos, node);
868 pos=skipValue(pos, node);
871 if(value!=(int)(uniqueValue>>1)) {
875 uniqueValue=((long)value<<1)|1;
878 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
884 // ignore the last comparison byte
885 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
887 // Recursively finds a unique value (or whether there is not a unique one)
888 // starting from a position on a node lead unit.
889 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
890 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
891 private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
892 int node=chars.charAt(pos++);
894 if(node<kMinLinearMatch) {
896 node=chars.charAt(pos++);
898 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue);
902 pos=(int)(uniqueValue>>>33);
903 node=chars.charAt(pos++);
904 } else if(node<kMinValueLead) {
906 pos+=node-kMinLinearMatch+1; // Ignore the match units.
907 node=chars.charAt(pos++);
909 boolean isFinal=(node&kValueIsFinal)!=0;
912 value=readValue(chars, pos, node&0x7fff);
914 value=readNodeValue(chars, pos, node);
917 if(value!=(int)(uniqueValue>>1)) {
921 uniqueValue=((long)value<<1)|1;
926 pos=skipNodeValue(pos, node);
932 // Helper functions for getNextChars().
933 // getNextChars() when pos is on a branch node.
934 private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
935 while(length>kMaxBranchLinearSubNodeLength) {
936 ++pos; // ignore the comparison unit
937 getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out);
938 length=length-(length>>1);
939 pos=skipDelta(chars, pos);
942 append(out, chars.charAt(pos++));
943 pos=skipValue(chars, pos);
945 append(out, chars.charAt(pos));
947 private static void append(Appendable out, int c) {
950 } catch(IOException e) {
951 throw new RuntimeException(e);
955 // CharsTrie data structure
957 // The trie consists of a series of char-serialized nodes for incremental
958 // Unicode string/char sequence matching. (char=16-bit unsigned integer)
959 // The root node is at the beginning of the trie data.
961 // Types of nodes are distinguished by their node lead unit ranges.
962 // After each node, except a final-value node, another node follows to
963 // encode match values or continue matching further units.
966 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
967 // The value is for the string/char sequence so far.
968 // - Match node, optionally with an intermediate value in a different compact format.
969 // The value, if present, is for the string/char sequence so far.
971 // Aside from the value, which uses the node lead unit's high bits:
973 // - Linear-match node: Matches a number of units.
974 // - Branch node: Branches to other nodes according to the current input unit.
975 // The node unit is the length of the branch (number of units to select from)
976 // minus 1. It is followed by a sub-node:
977 // - If the length is at most kMaxBranchLinearSubNodeLength, then
978 // there are length-1 (key, value) pairs and then one more comparison unit.
979 // If one of the key units matches, then the value is either a final value for
980 // the string so far, or a "jump" delta to the next node.
981 // If the last unit matches, then matching continues with the next node.
982 // (Values have the same encoding as final-value nodes.)
983 // - If the length is greater than kMaxBranchLinearSubNodeLength, then
984 // there is one unit and one "jump" delta.
985 // If the input unit is less than the sub-node unit, then "jump" by delta to
986 // the next sub-node which will have a length of length/2.
987 // (The delta has its own compact encoding.)
988 // Otherwise, skip the "jump" delta to the next sub-node
989 // which will have a length of length-length/2.
991 // Match-node lead unit values, after masking off intermediate-value bits:
993 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
994 // the length is one more than the next unit.
996 // For a branch sub-node with at most this many entries, we drop down
997 // to a linear search.
998 /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
1000 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
1001 /*package*/ static final int kMinLinearMatch=0x30;
1002 /*package*/ static final int kMaxLinearMatchLength=0x10;
1004 // Match-node lead unit bits 14..6 for the optional intermediate value.
1005 // If these bits are 0, then there is no intermediate value.
1006 // Otherwise, see the *NodeValue* constants below.
1007 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
1008 /*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f
1010 // A final-value node has bit 15 set.
1011 /*package*/ static final int kValueIsFinal=0x8000;
1013 // Compact value: After testing and masking off bit 15, use the following thresholds.
1014 /*package*/ static final int kMaxOneUnitValue=0x3fff;
1016 /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
1017 /*package*/ static final int kThreeUnitValueLead=0x7fff;
1019 /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
1021 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
1022 /*package*/ static final int kMaxOneUnitNodeValue=0xff;
1023 /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
1024 /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
1026 /*package*/ static final int kMaxTwoUnitNodeValue=
1027 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
1029 // Compact delta integers.
1030 /*package*/ static final int kMaxOneUnitDelta=0xfbff;
1031 /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
1032 /*package*/ static final int kThreeUnitDeltaLead=0xffff;
1034 /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
1036 // Fixed value referencing the CharsTrie words.
1037 private CharSequence chars_;
1040 // Iterator variables.
1042 // Pointer to next trie unit to read. NULL if no more matches.
1044 // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1045 private int remainingMatchLength_;