2 *******************************************************************************
3 * Copyright (C) 2010-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * created on: 2010nov23
7 * created by: Markus W. Scherer
8 * ported from ICU4C bytestrie.h/.cpp
10 package com.ibm.icu.util;
12 import java.io.IOException;
13 import java.nio.ByteBuffer;
14 import java.util.ArrayList;
15 import java.util.NoSuchElementException;
18 * Light-weight, non-const reader class for a BytesTrie.
19 * Traverses a byte-serialized data structure with minimal state,
20 * for mapping byte sequences to non-negative integer values.
22 * <p>This class is not intended for public subclassing.
25 * @provisional This API might change or be removed in a future release.
26 * @author Markus W. Scherer
28 public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
30 * Constructs a BytesTrie reader instance.
32 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder,
33 * with the offset indicating the first byte of that sequence.
34 * The BytesTrie object will not read more bytes than
35 * the BytesTrieBuilder generated in the corresponding build() call.
37 * <p>The array is not copied/cloned and must not be modified while
38 * the BytesTrie object is in use.
40 * @param trieBytes Bytes array that contains the serialized trie.
41 * @param offset Root offset of the trie in the array.
43 * @provisional This API might change or be removed in a future release.
45 public BytesTrie(byte[] trieBytes, int offset) {
48 remainingMatchLength_=-1;
52 * Clones this trie reader object and its state,
53 * but not the byte array which will be shared.
54 * @return A shallow clone of this trie.
56 * @provisional This API might change or be removed in a future release.
59 public Object clone() throws CloneNotSupportedException {
60 return super.clone(); // A shallow copy is just what we need.
64 * Resets this trie to its initial state.
67 * @provisional This API might change or be removed in a future release.
69 public BytesTrie reset() {
71 remainingMatchLength_=-1;
76 * BytesTrie state object, for saving a trie's current state
77 * and resetting the trie back to this state later.
79 * @provisional This API might change or be removed in a future release.
81 public static final class State {
83 * Constructs an empty State.
85 * @provisional This API might change or be removed in a future release.
91 private int remainingMatchLength;
95 * Saves the state of this trie.
96 * @param state The State object to hold the trie's state.
100 * @provisional This API might change or be removed in a future release.
102 public BytesTrie saveState(State state) /*const*/ {
106 state.remainingMatchLength=remainingMatchLength_;
111 * Resets this trie to the saved state.
112 * @param state The State object which holds a saved trie state.
114 * @throws IllegalArgumentException if the state object contains no state,
115 * or the state of a different trie
119 * @provisional This API might change or be removed in a future release.
121 public BytesTrie resetToState(State state) {
122 if(bytes_==state.bytes && bytes_!=null && root_==state.root) {
124 remainingMatchLength_=state.remainingMatchLength;
126 throw new IllegalArgumentException("incompatible trie state");
132 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods.
134 * @provisional This API might change or be removed in a future release.
138 * The input unit(s) did not continue a matching string.
139 * Once current()/next() return NO_MATCH,
140 * all further calls to current()/next() will also return NO_MATCH,
141 * until the trie is reset to its original state or to a saved state.
143 * @provisional This API might change or be removed in a future release.
147 * The input unit(s) continued a matching string
148 * but there is no value for the string so far.
149 * (It is a prefix of a longer string.)
151 * @provisional This API might change or be removed in a future release.
155 * The input unit(s) continued a matching string
156 * and there is a value for the string so far.
157 * This value will be returned by getValue().
158 * No further input byte/unit can continue a matching string.
160 * @provisional This API might change or be removed in a future release.
164 * The input unit(s) continued a matching string
165 * and there is a value for the string so far.
166 * This value will be returned by getValue().
167 * Another input byte/unit can continue a matching string.
169 * @provisional This API might change or be removed in a future release.
173 // Note: The following methods assume the particular order
174 // of enum constants, treating the ordinal() values like bit sets.
175 // Do not reorder the enum constants!
178 * Same as (result!=NO_MATCH).
179 * @return true if the input bytes/units so far are part of a matching string/byte sequence.
181 * @provisional This API might change or be removed in a future release.
183 public boolean matches() { return this!=NO_MATCH; }
186 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE).
187 * @return true if there is a value for the input bytes/units so far.
190 * @provisional This API might change or be removed in a future release.
192 public boolean hasValue() { return ordinal()>=2; }
195 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE).
196 * @return true if another input byte/unit can continue a matching string.
198 * @provisional This API might change or be removed in a future release.
200 public boolean hasNext() { return (ordinal()&1)!=0; }
204 * Determines whether the byte sequence so far matches, whether it has a value,
205 * and whether another input byte can continue a matching byte sequence.
206 * @return The match/value Result.
208 * @provisional This API might change or be removed in a future release.
210 public Result current() /*const*/ {
213 return Result.NO_MATCH;
216 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
217 valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
222 * Traverses the trie from the initial state for this input byte.
223 * Equivalent to reset().next(inByte).
224 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
225 * Values below -0x100 and above 0xff will never match.
226 * @return The match/value Result.
228 * @provisional This API might change or be removed in a future release.
230 public Result first(int inByte) {
231 remainingMatchLength_=-1;
235 return nextImpl(root_, inByte);
239 * Traverses the trie from the current state for this input byte.
240 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
241 * Values below -0x100 and above 0xff will never match.
242 * @return The match/value Result.
244 * @provisional This API might change or be removed in a future release.
246 public Result next(int inByte) {
249 return Result.NO_MATCH;
254 int length=remainingMatchLength_; // Actual remaining match length minus 1.
256 // Remaining part of a linear-match node.
257 if(inByte==(bytes_[pos++]&0xff)) {
258 remainingMatchLength_=--length;
261 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
262 valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
265 return Result.NO_MATCH;
268 return nextImpl(pos, inByte);
272 * Traverses the trie from the current state for this byte sequence.
275 * Result result=current();
277 * if(!result.hasNext()) return Result.NO_MATCH;
281 * @param s Contains a string or byte sequence.
282 * @param sIndex The start index of the byte sequence in s.
283 * @param sLimit The (exclusive) end index of the byte sequence in s.
284 * @return The match/value Result.
286 * @provisional This API might change or be removed in a future release.
288 public Result next(byte[] s, int sIndex, int sLimit) {
295 return Result.NO_MATCH;
297 int length=remainingMatchLength_; // Actual remaining match length minus 1.
299 // Fetch the next input byte, if there is one.
300 // Continue a linear-match node.
304 remainingMatchLength_=length;
307 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ?
308 valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
312 remainingMatchLength_=length;
315 if(inByte!=bytes_[pos]) {
317 return Result.NO_MATCH;
323 int node=bytes_[pos++]&0xff;
324 if(node<kMinLinearMatch) {
325 Result result=branchNext(pos, node, inByte&0xff);
326 if(result==Result.NO_MATCH) {
327 return Result.NO_MATCH;
329 // Fetch the next input byte, if there is one.
333 if(result==Result.FINAL_VALUE) {
334 // No further matching bytes.
336 return Result.NO_MATCH;
339 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
340 } else if(node<kMinValueLead) {
341 // Match length+1 bytes.
342 length=node-kMinLinearMatch; // Actual match length minus 1.
343 if(inByte!=bytes_[pos]) {
345 return Result.NO_MATCH;
350 } else if((node&kValueIsFinal)!=0) {
351 // No further matching bytes.
353 return Result.NO_MATCH;
355 // Skip intermediate value.
356 pos=skipValue(pos, node);
357 // The next node must not also be a value node.
358 assert((bytes_[pos]&0xff)<kMinValueLead);
365 * Returns a matching byte sequence's value if called immediately after
366 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
367 * getValue() can be called multiple times.
369 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
370 * @return The value for the byte sequence so far.
372 * @provisional This API might change or be removed in a future release.
374 public int getValue() /*const*/ {
376 int leadByte=bytes_[pos++]&0xff;
377 assert(leadByte>=kMinValueLead);
378 return readValue(bytes_, pos, leadByte>>1);
382 * Determines whether all byte sequences reachable from the current state
383 * map to the same value, and if so, returns that value.
384 * @return The unique value in bits 32..1 with bit 0 set,
385 * if all byte sequences reachable from the current state
386 * map to the same value; otherwise returns 0.
388 * @provisional This API might change or be removed in a future release.
390 public long getUniqueValue() /*const*/ {
395 // Skip the rest of a pending linear-match node.
396 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0);
397 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
398 return (uniqueValue<<31)>>31;
402 * Finds each byte which continues the byte sequence from the current state.
403 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now.
404 * @param out Each next byte is 0-extended to a char and appended to this object.
405 * (Only uses the out.append(c) method.)
406 * @return The number of bytes which continue the byte sequence from here.
408 * @provisional This API might change or be removed in a future release.
410 public int getNextBytes(Appendable out) /*const*/ {
415 if(remainingMatchLength_>=0) {
416 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node.
419 int node=bytes_[pos++]&0xff;
420 if(node>=kMinValueLead) {
421 if((node&kValueIsFinal)!=0) {
424 pos=skipValue(pos, node);
425 node=bytes_[pos++]&0xff;
426 assert(node<kMinValueLead);
429 if(node<kMinLinearMatch) {
431 node=bytes_[pos++]&0xff;
433 getNextBranchBytes(bytes_, pos, ++node, out);
436 // First byte of the linear-match node.
437 append(out, bytes_[pos]&0xff);
443 * Iterates from the current state of this trie.
444 * @return A new BytesTrie.Iterator.
446 * @provisional This API might change or be removed in a future release.
448 public Iterator iterator() {
449 return new Iterator(bytes_, pos_, remainingMatchLength_, 0);
453 * Iterates from the current state of this trie.
454 * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
455 * Otherwise, the iterator returns strings with this maximum length.
456 * @return A new BytesTrie.Iterator.
458 * @provisional This API might change or be removed in a future release.
460 public Iterator iterator(int maxStringLength) {
461 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength);
465 * Iterates from the root of a byte-serialized BytesTrie.
466 * @param trieBytes Bytes array that contains the serialized trie.
467 * @param offset Root offset of the trie in the array.
468 * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
469 * Otherwise, the iterator returns strings with this maximum length.
470 * @return A new BytesTrie.Iterator.
472 * @provisional This API might change or be removed in a future release.
474 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) {
475 return new Iterator(trieBytes, offset, -1, maxStringLength);
479 * Return value type for the Iterator.
481 * @provisional This API might change or be removed in a future release.
483 public static final class Entry {
484 private Entry(int capacity) {
485 bytes=new byte[capacity];
489 * @return The length of the byte sequence.
491 * @provisional This API might change or be removed in a future release.
493 public int bytesLength() { return length; }
495 * Returns a byte of the byte sequence.
496 * @param index An index into the byte sequence.
497 * @return The index-th byte sequence byte.
499 * @provisional This API might change or be removed in a future release.
501 public byte byteAt(int index) { return bytes[index]; }
503 * Copies the byte sequence into a byte array.
504 * @param dest Destination byte array.
505 * @param destOffset Starting offset to where in dest the byte sequence is copied.
507 * @provisional This API might change or be removed in a future release.
509 public void copyBytesTo(byte[] dest, int destOffset) {
510 System.arraycopy(bytes, 0, dest, destOffset, length);
513 * @return The byte sequence as a read-only ByteBuffer.
515 * @provisional This API might change or be removed in a future release.
517 public ByteBuffer bytesAsByteBuffer() {
518 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer();
522 * The value associated with the byte sequence.
524 * @provisional This API might change or be removed in a future release.
528 private void ensureCapacity(int len) {
529 if(bytes.length<len) {
530 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)];
531 System.arraycopy(bytes, 0, newBytes, 0, length);
535 private void append(byte b) {
536 ensureCapacity(length+1);
539 private void append(byte[] b, int off, int len) {
540 ensureCapacity(length+len);
541 System.arraycopy(b, off, bytes, length, len);
544 private void truncateString(int newLength) { length=newLength; }
546 private byte[] bytes;
551 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
553 * @provisional This API might change or be removed in a future release.
555 public static final class Iterator implements java.util.Iterator<Entry> {
556 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) {
558 pos_=initialPos_=offset;
559 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
560 maxLength_=maxStringLength;
561 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32);
562 int length=remainingMatchLength_; // Actual remaining match length minus 1.
564 // Pending linear-match node, append remaining bytes to entry_.
566 if(maxLength_>0 && length>maxLength_) {
567 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal.
569 entry_.append(bytes_, pos_, length);
571 remainingMatchLength_-=length;
576 * Resets this iterator to its initial state.
579 * @provisional This API might change or be removed in a future release.
581 public Iterator reset() {
583 remainingMatchLength_=initialRemainingMatchLength_;
584 int length=remainingMatchLength_+1; // Remaining match length.
585 if(maxLength_>0 && length>maxLength_) {
588 entry_.truncateString(length);
590 remainingMatchLength_-=length;
596 * @return true if there are more elements.
598 * @provisional This API might change or be removed in a future release.
600 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
603 * Finds the next (byte sequence, value) pair if there is one.
605 * If the byte sequence is truncated to the maximum length and does not
606 * have a real value, then the value is set to -1.
607 * In this case, this "not a real value" is indistinguishable from
608 * a real value of -1.
609 * @return An Entry with the string and value of the next element.
610 * @throws NoSuchElementException - iteration has no more elements.
612 * @provisional This API might change or be removed in a future release.
614 public Entry next() {
617 if(stack_.isEmpty()) {
618 throw new NoSuchElementException();
620 // Pop the state off the stack and continue with the next outbound edge of
622 long top=stack_.remove(stack_.size()-1);
625 entry_.truncateString(length&0xffff);
628 pos=branchNext(pos, length);
630 return entry_; // Reached a final value.
633 entry_.append(bytes_[pos++]);
636 if(remainingMatchLength_>=0) {
637 // We only get here if we started in a pending linear-match node
638 // with more than maxLength remaining bytes.
639 return truncateAndStop();
642 int node=bytes_[pos++]&0xff;
643 if(node>=kMinValueLead) {
644 // Deliver value for the byte sequence so far.
645 boolean isFinal=(node&kValueIsFinal)!=0;
646 entry_.value=readValue(bytes_, pos, node>>1);
647 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) {
650 pos_=skipValue(pos, node);
654 if(maxLength_>0 && entry_.length==maxLength_) {
655 return truncateAndStop();
657 if(node<kMinLinearMatch) {
659 node=bytes_[pos++]&0xff;
661 pos=branchNext(pos, node+1);
663 return entry_; // Reached a final value.
666 // Linear-match node, append length bytes to entry_.
667 int length=node-kMinLinearMatch+1;
668 if(maxLength_>0 && entry_.length+length>maxLength_) {
669 entry_.append(bytes_, pos, maxLength_-entry_.length);
670 return truncateAndStop();
672 entry_.append(bytes_, pos, length);
679 * Iterator.remove() is not supported.
680 * @throws UnsupportedOperationException (always)
682 * @provisional This API might change or be removed in a future release.
684 public void remove() {
685 throw new UnsupportedOperationException();
688 private Entry truncateAndStop() {
690 entry_.value=-1; // no real value for str
694 private int branchNext(int pos, int length) {
695 while(length>kMaxBranchLinearSubNodeLength) {
696 ++pos; // ignore the comparison byte
697 // Push state for the greater-or-equal edge.
698 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length);
699 // Follow the less-than edge.
701 pos=jumpByDelta(bytes_, pos);
703 // List of key-value pairs where values are either final values or jump deltas.
704 // Read the first (key, value) pair.
705 byte trieByte=bytes_[pos++];
706 int node=bytes_[pos++]&0xff;
707 boolean isFinal=(node&kValueIsFinal)!=0;
708 int value=readValue(bytes_, pos, node>>1);
709 pos=skipValue(pos, node);
710 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length);
711 entry_.append(trieByte);
721 private byte[] bytes_;
723 private int initialPos_;
724 private int remainingMatchLength_;
725 private int initialRemainingMatchLength_;
727 private int maxLength_;
728 private Entry entry_;
730 // The stack stores longs for backtracking to another
731 // outbound edge of a branch node.
732 // Each long has the offset from bytes_ in bits 62..32,
733 // the entry_.stringLength() from before the node in bits 15..0,
734 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
735 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
736 // but the code looks more confusing that way.)
737 private ArrayList<Long> stack_=new ArrayList<Long>();
740 private void stop() {
744 // Reads a compact 32-bit integer.
745 // pos is already after the leadByte, and the lead byte is already shifted right by 1.
746 private static int readValue(byte[] bytes, int pos, int leadByte) {
748 if(leadByte<kMinTwoByteValueLead) {
749 value=leadByte-kMinOneByteValueLead;
750 } else if(leadByte<kMinThreeByteValueLead) {
751 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff);
752 } else if(leadByte<kFourByteValueLead) {
753 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
754 } else if(leadByte==kFourByteValueLead) {
755 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
757 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
761 private static int skipValue(int pos, int leadByte) {
762 assert(leadByte>=kMinValueLead);
763 if(leadByte>=(kMinTwoByteValueLead<<1)) {
764 if(leadByte<(kMinThreeByteValueLead<<1)) {
766 } else if(leadByte<(kFourByteValueLead<<1)) {
769 pos+=3+((leadByte>>1)&1);
774 private static int skipValue(byte[] bytes, int pos) {
775 int leadByte=bytes[pos++]&0xff;
776 return skipValue(pos, leadByte);
779 // Reads a jump delta and jumps.
780 private static int jumpByDelta(byte[] bytes, int pos) {
781 int delta=bytes[pos++]&0xff;
782 if(delta<kMinTwoByteDeltaLead) {
784 } else if(delta<kMinThreeByteDeltaLead) {
785 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff);
786 } else if(delta<kFourByteDeltaLead) {
787 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
789 } else if(delta==kFourByteDeltaLead) {
790 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
793 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
799 private static int skipDelta(byte[] bytes, int pos) {
800 int delta=bytes[pos++]&0xff;
801 if(delta>=kMinTwoByteDeltaLead) {
802 if(delta<kMinThreeByteDeltaLead) {
804 } else if(delta<kFourByteDeltaLead) {
813 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
815 // Handles a branch node for both next(byte) and next(string).
816 private Result branchNext(int pos, int length, int inByte) {
817 // Branch according to the current byte.
819 length=bytes_[pos++]&0xff;
822 // The length of the branch is the number of bytes to select from.
823 // The data structure encodes a binary search.
824 while(length>kMaxBranchLinearSubNodeLength) {
825 if(inByte<(bytes_[pos++]&0xff)) {
827 pos=jumpByDelta(bytes_, pos);
829 length=length-(length>>1);
830 pos=skipDelta(bytes_, pos);
833 // Drop down to linear search for the last few bytes.
834 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
835 // and divides length by 2.
837 if(inByte==(bytes_[pos++]&0xff)) {
839 int node=bytes_[pos]&0xff;
840 assert(node>=kMinValueLead);
841 if((node&kValueIsFinal)!=0) {
842 // Leave the final value for getValue() to read.
843 result=Result.FINAL_VALUE;
845 // Use the non-final value as the jump delta.
847 // int delta=readValue(pos, node>>1);
850 if(node<kMinTwoByteValueLead) {
851 delta=node-kMinOneByteValueLead;
852 } else if(node<kMinThreeByteValueLead) {
853 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff);
854 } else if(node<kFourByteValueLead) {
855 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff);
857 } else if(node==kFourByteValueLead) {
858 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff);
861 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff);
866 node=bytes_[pos]&0xff;
867 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
873 pos=skipValue(bytes_, pos);
875 if(inByte==(bytes_[pos++]&0xff)) {
877 int node=bytes_[pos]&0xff;
878 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
881 return Result.NO_MATCH;
885 // Requires remainingLength_<0.
886 private Result nextImpl(int pos, int inByte) {
888 int node=bytes_[pos++]&0xff;
889 if(node<kMinLinearMatch) {
890 return branchNext(pos, node, inByte);
891 } else if(node<kMinValueLead) {
892 // Match the first of length+1 bytes.
893 int length=node-kMinLinearMatch; // Actual match length minus 1.
894 if(inByte==(bytes_[pos++]&0xff)) {
895 remainingMatchLength_=--length;
897 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
898 valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
903 } else if((node&kValueIsFinal)!=0) {
904 // No further matching bytes.
907 // Skip intermediate value.
908 pos=skipValue(pos, node);
909 // The next node must not also be a value node.
910 assert((bytes_[pos]&0xff)<kMinValueLead);
914 return Result.NO_MATCH;
917 // Helper functions for getUniqueValue().
918 // Recursively finds a unique value (or whether there is not a unique one)
920 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
921 // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
922 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length,
924 while(length>kMaxBranchLinearSubNodeLength) {
925 ++pos; // ignore the comparison byte
926 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue);
930 length=length-(length>>1);
931 pos=skipDelta(bytes, pos);
934 ++pos; // ignore a comparison byte
936 int node=bytes[pos++]&0xff;
937 boolean isFinal=(node&kValueIsFinal)!=0;
938 int value=readValue(bytes, pos, node>>1);
939 pos=skipValue(pos, node);
942 if(value!=(int)(uniqueValue>>1)) {
946 uniqueValue=((long)value<<1)|1;
949 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue);
955 // ignore the last comparison byte
956 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
958 // Recursively finds a unique value (or whether there is not a unique one)
959 // starting from a position on a node lead byte.
960 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
961 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
962 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) {
964 int node=bytes[pos++]&0xff;
965 if(node<kMinLinearMatch) {
967 node=bytes[pos++]&0xff;
969 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue);
973 pos=(int)(uniqueValue>>>33);
974 } else if(node<kMinValueLead) {
976 pos+=node-kMinLinearMatch+1; // Ignore the match bytes.
978 boolean isFinal=(node&kValueIsFinal)!=0;
979 int value=readValue(bytes, pos, node>>1);
981 if(value!=(int)(uniqueValue>>1)) {
985 uniqueValue=((long)value<<1)|1;
990 pos=skipValue(pos, node);
995 // Helper functions for getNextBytes().
996 // getNextBytes() when pos is on a branch node.
997 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) {
998 while(length>kMaxBranchLinearSubNodeLength) {
999 ++pos; // ignore the comparison byte
1000 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out);
1001 length=length-(length>>1);
1002 pos=skipDelta(bytes, pos);
1005 append(out, bytes[pos++]&0xff);
1006 pos=skipValue(bytes, pos);
1007 } while(--length>1);
1008 append(out, bytes[pos]&0xff);
1010 private static void append(Appendable out, int c) {
1012 out.append((char)c);
1013 } catch(IOException e) {
1014 throw new RuntimeException(e);
1018 // BytesTrie data structure
1020 // The trie consists of a series of byte-serialized nodes for incremental
1021 // string/byte sequence matching. The root node is at the beginning of the trie data.
1023 // Types of nodes are distinguished by their node lead byte ranges.
1024 // After each node, except a final-value node, another node follows to
1025 // encode match values or continue matching further bytes.
1028 // - Value node: Stores a 32-bit integer in a compact, variable-length format.
1029 // The value is for the string/byte sequence so far.
1030 // One node bit indicates whether the value is final or whether
1031 // matching continues with the next node.
1032 // - Linear-match node: Matches a number of bytes.
1033 // - Branch node: Branches to other nodes according to the current input byte.
1034 // The node byte is the length of the branch (number of bytes to select from)
1035 // minus 1. It is followed by a sub-node:
1036 // - If the length is at most kMaxBranchLinearSubNodeLength, then
1037 // there are length-1 (key, value) pairs and then one more comparison byte.
1038 // If one of the key bytes matches, then the value is either a final value for
1039 // the string/byte sequence so far, or a "jump" delta to the next node.
1040 // If the last byte matches, then matching continues with the next node.
1041 // (Values have the same encoding as value nodes.)
1042 // - If the length is greater than kMaxBranchLinearSubNodeLength, then
1043 // there is one byte and one "jump" delta.
1044 // If the input byte is less than the sub-node byte, then "jump" by delta to
1045 // the next sub-node which will have a length of length/2.
1046 // (The delta has its own compact encoding.)
1047 // Otherwise, skip the "jump" delta to the next sub-node
1048 // which will have a length of length-length/2.
1050 // Node lead byte values.
1052 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
1053 // the length is one more than the next byte.
1055 // For a branch sub-node with at most this many entries, we drop down
1056 // to a linear search.
1057 /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
1059 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
1060 /*package*/ static final int kMinLinearMatch=0x10;
1061 /*package*/ static final int kMaxLinearMatchLength=0x10;
1063 // 20..ff: Variable-length value node.
1064 // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
1065 // Then shift-right by 1 bit.
1066 // The remaining lead byte value indicates the number of following bytes (0..4)
1067 // and contains the value's top bits.
1068 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
1069 // It is a final value if bit 0 is set.
1070 private static final int kValueIsFinal=1;
1072 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
1073 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10
1074 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
1076 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
1077 /*package*/ static final int kMaxTwoByteValue=0x1aff;
1079 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
1080 /*package*/ static final int kFourByteValueLead=0x7e;
1082 // A little more than Unicode code points. (0x11ffff)
1083 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
1085 /*package*/ static final int kFiveByteValueLead=0x7f;
1087 // Compact delta integers.
1088 /*package*/ static final int kMaxOneByteDelta=0xbf;
1089 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
1090 /*package*/ static final int kMinThreeByteDeltaLead=0xf0;
1091 /*package*/ static final int kFourByteDeltaLead=0xfe;
1092 /*package*/ static final int kFiveByteDeltaLead=0xff;
1094 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
1095 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
1097 // Fixed value referencing the BytesTrie bytes.
1098 private byte[] bytes_;
1101 // Iterator variables.
1103 // Index of next trie byte to read. Negative if no more matches.
1105 // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1106 private int remainingMatchLength_;