2 *******************************************************************************
3 * Copyright (C) 2011-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * created on: 2011jan05
7 * created by: Markus W. Scherer
8 * ported from ICU4C stringtriebuilder.h/.cpp
10 package com.ibm.icu.util;
12 import java.util.ArrayList;
13 import java.util.HashMap;
16 * Base class for string trie builder classes.
18 * <p>This class is not intended for public subclassing.
20 * @author Markus W. Scherer
23 public abstract class StringTrieBuilder {
25 * Build options for BytesTrieBuilder and CharsTrieBuilder.
30 * Builds a trie quickly.
35 * Builds a trie more slowly, attempting to generate
36 * a shorter but equivalent serialization.
37 * This build option also uses more memory.
39 * <p>This option can be effective when many integer values are the same
40 * and string/byte sequence suffixes can be shared.
41 * Runtime speed is not expected to improve.
49 * @deprecated This API is ICU internal only.
51 protected StringTrieBuilder() {}
55 * @deprecated This API is ICU internal only.
57 protected void addImpl(CharSequence s, int value) {
58 if(state!=State.ADDING) {
59 // Cannot add elements after building.
60 throw new IllegalStateException("Cannot add (string, value) pairs after build().");
62 if(s.length()>0xffff) {
63 // Too long: Limited by iterator internals, and by builder recursion depth.
64 throw new IndexOutOfBoundsException("The maximum string length is 0xffff.");
67 root=createSuffixNode(s, 0, value);
69 root=root.add(this, s, 0, value);
75 * @deprecated This API is ICU internal only.
77 protected final void buildImpl(Option buildOption) {
81 throw new IndexOutOfBoundsException("No (string, value) pairs were added.");
83 if(buildOption==Option.FAST) {
84 state=State.BUILDING_FAST;
85 // Building "fast" is somewhat faster (25..50% in some test)
86 // because it makes registerNode() return the input node
87 // rather than checking for duplicates.
88 // As a result, we sometimes write larger trie serializations.
90 // In either case we need to fix-up linear-match nodes (for their maximum length)
91 // and branch nodes (turning dynamic branch nodes into trees of
92 // runtime-equivalent nodes), but the HashMap/hashCode()/equals() are omitted for
93 // nodes other than final values.
95 state=State.BUILDING_SMALL;
100 // Building must have failed.
101 throw new IllegalStateException("Builder failed and must be clear()ed.");
103 return; // Nothing more to do.
105 // Implementation note:
106 // We really build three versions of the trie.
107 // The first is a fully dynamic trie, built successively by addImpl().
108 // Then we call root.register() to turn it into a tree of nodes
109 // which is 1:1 equivalent to the runtime data structure.
110 // Finally, root.markRightEdgesFirst() and root.write() write that serialized form.
111 root=root.register(this);
112 root.markRightEdgesFirst(-1);
119 * @deprecated This API is ICU internal only.
121 protected void clearImpl() {
122 strings.setLength(0);
129 * Makes sure that there is only one unique node registered that is
130 * equivalent to newNode, unless BUILDING_FAST.
131 * @param newNode Input node. The builder takes ownership.
132 * @return newNode if it is the first of its kind, or
133 * an equivalent node if newNode is a duplicate.
135 private final Node registerNode(Node newNode) {
136 if(state==State.BUILDING_FAST) {
140 Node oldNode=nodes.get(newNode);
144 // If put() returns a non-null value from an equivalent, previously
145 // registered node, then get() failed to find that and we will leak newNode.
146 oldNode=nodes.put(newNode, newNode);
147 assert(oldNode==null);
152 * Makes sure that there is only one unique FinalValueNode registered
154 * Avoids creating a node if the value is a duplicate.
155 * @param value A final value.
156 * @return A FinalValueNode with the given value.
158 private final ValueNode registerFinalValue(int value) {
159 // We always register final values because while ADDING
160 // we do not know yet whether we will build fast or small.
161 lookupFinalValueNode.setFinalValue(value);
162 Node oldNode=nodes.get(lookupFinalValueNode);
164 return (ValueNode)oldNode;
166 ValueNode newNode=new ValueNode(value);
167 // If put() returns a non-null value from an equivalent, previously
168 // registered node, then get() failed to find that and we will leak newNode.
169 oldNode=nodes.put(newNode, newNode);
170 assert(oldNode==null);
174 private static abstract class Node {
178 // hashCode() and equals() for use with registerNode() and the nodes hash.
180 public abstract int hashCode() /*const*/;
181 // Base class equals() compares the actual class types.
183 public boolean equals(Object other) {
184 return this==other || this.getClass()==other.getClass();
187 * Recursive method for adding a new (string, value) pair.
188 * Matches the remaining part of s from start,
189 * and adds a new node where there is a mismatch.
190 * @return this or a replacement Node
192 public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
196 * Recursive method for registering unique nodes,
197 * after all (string, value) pairs have been added.
198 * Final-value nodes are pre-registered while add()ing (string, value) pairs.
199 * Other nodes created while add()ing registerNode() themselves later
200 * and might replace themselves with new types of nodes for write()ing.
201 * @return The registered version of this node which implements write().
203 public Node register(StringTrieBuilder builder) { return this; }
205 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
206 * This is to avoid writing a duplicate node twice.
208 * Branch nodes in this trie data structure are not symmetric.
209 * Most branch edges "jump" to other nodes but the rightmost branch edges
210 * just continue without a jump.
211 * Therefore, write() must write the rightmost branch edge last
212 * (trie units are written backwards), and must write it at that point even if
213 * it is a duplicate of a node previously written elsewhere.
215 * This function visits and marks right branch edges first.
216 * Edges are numbered with increasingly negative values because we share the
217 * offset field which gets positive values when nodes are written.
218 * A branch edge also remembers the first number for any of its edges.
220 * When a further-left branch edge has a number in the range of the rightmost
221 * edge's numbers, then it will be written as part of the required right edge
222 * and we can avoid writing it first.
224 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
227 * @param edgeNumber The first edge number for this node and its sub-nodes.
228 * @return An edge number that is at least the maximum-negative
229 * of the input edge number and the numbers of this node and all of its sub-nodes.
231 public int markRightEdgesFirst(int edgeNumber) {
237 // write() must set the offset to a positive value.
238 public abstract void write(StringTrieBuilder builder);
239 // See markRightEdgesFirst.
240 public final void writeUnlessInsideRightEdge(int firstRight, int lastRight,
241 StringTrieBuilder builder) {
242 // Note: Edge numbers are negative, lastRight<=firstRight.
243 // If offset>0 then this node and its sub-nodes have been written already
244 // and we need not write them again.
245 // If this node is part of the unwritten right branch edge,
246 // then we wait until that is written.
247 if(offset<0 && (offset<lastRight || firstRight<offset)) {
251 public final int getOffset() /*const*/ { return offset; }
253 protected int offset;
256 // Used directly for final values, and as as a superclass for
257 // match nodes with intermediate values.
258 private static class ValueNode extends Node {
259 public ValueNode() {}
260 public ValueNode(int v) {
264 public final void setValue(int v) {
269 private void setFinalValue(int v) {
274 public int hashCode() /*const*/ {
282 public boolean equals(Object other) {
286 if(!super.equals(other)) {
289 ValueNode o=(ValueNode)other;
290 return hasValue==o.hasValue && (!hasValue || value==o.value);
293 public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
294 if(start==s.length()) {
295 throw new IllegalArgumentException("Duplicate string.");
297 // Replace self with a node for the remaining string suffix and value.
298 ValueNode node=builder.createSuffixNode(s, start, sValue);
299 node.setValue(value);
303 public void write(StringTrieBuilder builder) {
304 offset=builder.writeValueAndFinal(value, true);
307 protected boolean hasValue;
311 private static final class IntermediateValueNode extends ValueNode {
312 public IntermediateValueNode(int v, Node nextNode) {
317 public int hashCode() /*const*/ {
318 return (0x222222*37+value)*37+next.hashCode();
321 public boolean equals(Object other) {
325 if(!super.equals(other)) {
328 IntermediateValueNode o=(IntermediateValueNode)other;
332 public int markRightEdgesFirst(int edgeNumber) {
334 offset=edgeNumber=next.markRightEdgesFirst(edgeNumber);
339 public void write(StringTrieBuilder builder) {
341 offset=builder.writeValueAndFinal(value, false);
347 private static final class LinearMatchNode extends ValueNode {
348 public LinearMatchNode(CharSequence builderStrings, int sOffset, int len, Node nextNode) {
349 strings=builderStrings;
350 stringOffset=sOffset;
355 public int hashCode() /*const*/ { return hash; }
357 public boolean equals(Object other) {
361 if(!super.equals(other)) {
364 LinearMatchNode o=(LinearMatchNode)other;
365 if(length!=o.length || next!=o.next) {
368 for(int i=stringOffset, j=o.stringOffset, limit=stringOffset+length; i<limit; ++i, ++j) {
369 if(strings.charAt(i)!=strings.charAt(j)) {
376 public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
377 if(start==s.length()) {
379 throw new IllegalArgumentException("Duplicate string.");
385 int limit=stringOffset+length;
386 for(int i=stringOffset; i<limit; ++i, ++start) {
387 if(start==s.length()) {
388 // s is a prefix with a new value. Split self into two linear-match nodes.
389 int prefixLength=i-stringOffset;
390 LinearMatchNode suffixNode=new LinearMatchNode(strings, i, length-prefixLength, next);
391 suffixNode.setValue(sValue);
396 char thisChar=strings.charAt(i);
397 char newChar=s.charAt(start);
398 if(thisChar!=newChar) {
399 // Mismatch, insert a branch node.
400 DynamicBranchNode branchNode=new DynamicBranchNode();
401 // Reuse this node for one of the remaining substrings, if any.
402 Node result, thisSuffixNode;
403 if(i==stringOffset) {
404 // Mismatch on first character, turn this node into a suffix.
406 // Move the value for prefix length "start" to the new node.
407 branchNode.setValue(value);
413 thisSuffixNode= length>0 ? this : next;
414 // C++: if(length==0) { delete this; }
416 } else if(i==limit-1) {
417 // Mismatch on last character, keep this node for the prefix.
423 // Mismatch on intermediate character, keep this node for the prefix.
424 int prefixLength=i-stringOffset;
425 ++i; // Suffix start offset (after thisChar).
426 thisSuffixNode=new LinearMatchNode(
427 strings, i, length-(prefixLength+1), next);
432 ValueNode newSuffixNode=builder.createSuffixNode(s, start+1, sValue);
433 branchNode.add(thisChar, thisSuffixNode);
434 branchNode.add(newChar, newSuffixNode);
438 // s matches all of this node's characters.
439 next=next.add(builder, s, start, sValue);
443 public Node register(StringTrieBuilder builder) {
444 next=next.register(builder);
445 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
446 int maxLinearMatchLength=builder.getMaxLinearMatchLength();
447 while(length>maxLinearMatchLength) {
448 int nextOffset=stringOffset+length-maxLinearMatchLength;
449 length-=maxLinearMatchLength;
450 LinearMatchNode suffixNode=
451 new LinearMatchNode(strings, nextOffset, maxLinearMatchLength, next);
452 suffixNode.setHashCode();
453 next=builder.registerNode(suffixNode);
456 if(hasValue && !builder.matchNodesCanHaveValues()) {
457 int intermediateValue=value;
461 result=new IntermediateValueNode(intermediateValue, builder.registerNode(this));
466 return builder.registerNode(result);
469 public int markRightEdgesFirst(int edgeNumber) {
471 offset=edgeNumber=next.markRightEdgesFirst(edgeNumber);
476 public void write(StringTrieBuilder builder) {
478 builder.write(stringOffset, length);
479 offset=builder.writeValueAndType(hasValue, value, builder.getMinLinearMatch()+length-1);
482 // Must be called just before registerNode(this).
483 private void setHashCode() /*const*/ {
484 hash=(0x333333*37+length)*37+next.hashCode();
488 for(int i=stringOffset, limit=stringOffset+length; i<limit; ++i) {
489 hash=hash*37+strings.charAt(i);
493 private CharSequence strings;
494 private int stringOffset;
500 private static final class DynamicBranchNode extends ValueNode {
501 public DynamicBranchNode() {}
502 // c must not be in chars yet.
503 public void add(char c, Node node) {
509 public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
510 if(start==s.length()) {
512 throw new IllegalArgumentException("Duplicate string.");
518 char c=s.charAt(start++);
520 if(i<chars.length() && c==chars.charAt(i)) {
521 equal.set(i, equal.get(i).add(builder, s, start, sValue));
524 equal.add(i, builder.createSuffixNode(s, start, sValue));
529 public Node register(StringTrieBuilder builder) {
530 Node subNode=register(builder, 0, chars.length());
531 BranchHeadNode head=new BranchHeadNode(chars.length(), subNode);
534 if(builder.matchNodesCanHaveValues()) {
535 head.setValue(value);
537 result=new IntermediateValueNode(value, builder.registerNode(head));
540 return builder.registerNode(result);
542 private Node register(StringTrieBuilder builder, int start, int limit) {
543 int length=limit-start;
544 if(length>builder.getMaxBranchLinearSubNodeLength()) {
545 // Branch on the middle unit.
546 int middle=start+length/2;
547 return builder.registerNode(
549 chars.charAt(middle),
550 register(builder, start, middle),
551 register(builder, middle, limit)));
553 ListBranchNode listNode=new ListBranchNode(length);
555 char c=chars.charAt(start);
556 Node node=equal.get(start);
557 if(node.getClass()==ValueNode.class) {
559 listNode.add(c, ((ValueNode)node).value);
561 listNode.add(c, node.register(builder));
563 } while(++start<limit);
564 return builder.registerNode(listNode);
567 private int find(char c) {
569 int limit=chars.length();
571 int i=(start+limit)/2;
572 char middleChar=chars.charAt(i);
575 } else if(c==middleChar) {
584 private StringBuilder chars=new StringBuilder();
585 private ArrayList<Node> equal=new ArrayList<Node>();
588 private static abstract class BranchNode extends Node {
589 public BranchNode() {}
591 public int hashCode() /*const*/ { return hash; }
594 protected int firstEdgeNumber;
597 private static final class ListBranchNode extends BranchNode {
598 public ListBranchNode(int capacity) {
599 hash=0x444444*37+capacity;
600 equal=new Node[capacity];
601 values=new int[capacity];
602 units=new char[capacity];
605 public boolean equals(Object other) {
609 if(!super.equals(other)) {
612 ListBranchNode o=(ListBranchNode)other;
613 for(int i=0; i<length; ++i) {
614 if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
621 public int hashCode() {
622 return super.hashCode();
625 public int markRightEdgesFirst(int edgeNumber) {
627 firstEdgeNumber=edgeNumber;
631 Node edge=equal[--i];
633 edgeNumber=edge.markRightEdgesFirst(edgeNumber-step);
635 // For all but the rightmost edge, decrement the edge number.
643 public void write(StringTrieBuilder builder) {
644 // Write the sub-nodes in reverse order: The jump lengths are deltas from
645 // after their own positions, so if we wrote the minUnit sub-node first,
646 // then its jump delta would be larger.
647 // Instead we write the minUnit sub-node last, for a shorter delta.
648 int unitNumber=length-1;
649 Node rightEdge=equal[unitNumber];
650 int rightEdgeNumber= rightEdge==null ? firstEdgeNumber : rightEdge.getOffset();
653 if(equal[unitNumber]!=null) {
654 equal[unitNumber].writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
656 } while(unitNumber>0);
657 // The maxUnit sub-node is written as the very last one because we do
658 // not jump for it at all.
660 if(rightEdge==null) {
661 builder.writeValueAndFinal(values[unitNumber], true);
663 rightEdge.write(builder);
665 offset=builder.write(units[unitNumber]);
666 // Write the rest of this node's unit-value pairs.
667 while(--unitNumber>=0) {
670 if(equal[unitNumber]==null) {
671 // Write the final value for the one string ending with this unit.
672 value=values[unitNumber];
675 // Write the delta to the start position of the sub-node.
676 assert(equal[unitNumber].getOffset()>0);
677 value=offset-equal[unitNumber].getOffset();
680 builder.writeValueAndFinal(value, isFinal);
681 offset=builder.write(units[unitNumber]);
684 // Adds a unit with a final value.
685 public void add(int c, int value) {
686 units[length]=(char)c;
688 values[length]=value;
690 hash=(hash*37+c)*37+value;
692 // Adds a unit which leads to another match node.
693 public void add(int c, Node node) {
694 units[length]=(char)c;
698 hash=(hash*37+c)*37+node.hashCode();
701 // Note: We could try to reduce memory allocations
702 // by replacing these per-node arrays with per-builder ArrayLists and
703 // (for units) a StringBuilder (or even use its strings for the units too).
704 // It remains to be seen whether that would improve performance.
705 private Node[] equal; // null means "has final value".
707 private int[] values;
708 private char[] units;
711 private static final class SplitBranchNode extends BranchNode {
712 public SplitBranchNode(char middleUnit, Node lessThanNode, Node greaterOrEqualNode) {
713 hash=((0x555555*37+middleUnit)*37+
714 lessThanNode.hashCode())*37+greaterOrEqualNode.hashCode();
716 lessThan=lessThanNode;
717 greaterOrEqual=greaterOrEqualNode;
720 public boolean equals(Object other) {
724 if(!super.equals(other)) {
727 SplitBranchNode o=(SplitBranchNode)other;
728 return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
731 public int hashCode() {
732 return super.hashCode();
735 public int markRightEdgesFirst(int edgeNumber) {
737 firstEdgeNumber=edgeNumber;
738 edgeNumber=greaterOrEqual.markRightEdgesFirst(edgeNumber);
739 offset=edgeNumber=lessThan.markRightEdgesFirst(edgeNumber-1);
744 public void write(StringTrieBuilder builder) {
745 // Encode the less-than branch first.
746 lessThan.writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual.getOffset(), builder);
747 // Encode the greater-or-equal branch last because we do not jump for it at all.
748 greaterOrEqual.write(builder);
750 assert(lessThan.getOffset()>0);
751 builder.writeDeltaTo(lessThan.getOffset()); // less-than
752 offset=builder.write(unit);
756 private Node lessThan;
757 private Node greaterOrEqual;
760 // Branch head node, for writing the actual node lead unit.
761 private static final class BranchHeadNode extends ValueNode {
762 public BranchHeadNode(int len, Node subNode) {
767 public int hashCode() /*const*/ {
768 return (0x666666*37+length)*37+next.hashCode();
771 public boolean equals(Object other) {
775 if(!super.equals(other)) {
778 BranchHeadNode o=(BranchHeadNode)other;
779 return length==o.length && next==o.next;
782 public int markRightEdgesFirst(int edgeNumber) {
784 offset=edgeNumber=next.markRightEdgesFirst(edgeNumber);
789 public void write(StringTrieBuilder builder) {
791 if(length<=builder.getMinLinearMatch()) {
792 offset=builder.writeValueAndType(hasValue, value, length-1);
794 builder.write(length-1);
795 offset=builder.writeValueAndType(hasValue, value, 0);
800 private Node next; // A branch sub-node.
803 private ValueNode createSuffixNode(CharSequence s, int start, int sValue) {
804 ValueNode node=registerFinalValue(sValue);
805 if(start<s.length()) {
806 int offset=strings.length();
807 strings.append(s, start, s.length());
808 node=new LinearMatchNode(strings, offset, s.length()-start, node);
815 * @deprecated This API is ICU internal only.
817 protected abstract boolean matchNodesCanHaveValues() /*const*/;
821 * @deprecated This API is ICU internal only.
823 protected abstract int getMaxBranchLinearSubNodeLength() /*const*/;
826 * @deprecated This API is ICU internal only.
828 protected abstract int getMinLinearMatch() /*const*/;
831 * @deprecated This API is ICU internal only.
833 protected abstract int getMaxLinearMatchLength() /*const*/;
837 * @deprecated This API is ICU internal only.
839 protected abstract int write(int unit);
842 * @deprecated This API is ICU internal only.
844 protected abstract int write(int offset, int length);
847 * @deprecated This API is ICU internal only.
849 protected abstract int writeValueAndFinal(int i, boolean isFinal);
852 * @deprecated This API is ICU internal only.
854 protected abstract int writeValueAndType(boolean hasValue, int value, int node);
857 * @deprecated This API is ICU internal only.
859 protected abstract int writeDeltaTo(int jumpTarget);
862 ADDING, BUILDING_FAST, BUILDING_SMALL, BUILT
864 private State state=State.ADDING;
866 // Strings and sub-strings for linear-match nodes.
869 * @deprecated This API is ICU internal only.
871 protected StringBuilder strings=new StringBuilder();
874 // Hash set of nodes, maps from nodes to integer 1.
875 private HashMap<Node, Node> nodes=new HashMap<Node, Node>();
876 private ValueNode lookupFinalValueNode=new ValueNode();