2 **********************************************************************
\r
3 * Copyright (c) 2002-2007, International Business Machines
\r
4 * Corporation and others. All Rights Reserved.
\r
5 **********************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.util.List;
\r
11 import java.util.ArrayList;
\r
12 import java.util.Set;
\r
13 import java.util.HashSet;
\r
14 import java.util.SortedSet;
\r
15 import java.util.TreeSet;
\r
16 import java.util.Iterator;
\r
17 import java.util.Collection;
\r
19 import com.ibm.icu.impl.Assert;
\r
20 import com.ibm.icu.lang.UCharacter;
\r
21 import com.ibm.icu.lang.UProperty;
\r
24 // class RBBITableBuilder is part of the RBBI rule compiler.
\r
25 // It builds the state transition table used by the RBBI runtime
\r
26 // from the expression syntax tree generated by the rule scanner.
\r
28 // This class is part of the RBBI implementation only.
\r
29 // There is no user-visible public API here.
\r
31 class RBBITableBuilder {
\r
36 // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
\r
37 // one for each state.
\r
38 static class RBBIStateDescriptor {
\r
44 Set fPositions; // Set of parse tree positions associated
\r
45 // with this state. Unordered (it's a set).
\r
46 // UVector contents are RBBINode *
\r
48 int[] fDtran; // Transitions out of this state.
\r
49 // indexed by input character
\r
50 // contents is int index of dest state
\r
51 // in RBBITableBuilder.fDStates
\r
53 RBBIStateDescriptor(int maxInputSymbol) {
\r
54 fTagVals = new TreeSet();
\r
55 fPositions = new HashSet();
\r
56 fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
\r
57 // It is indexed by input symbols, and will
\r
58 // hold the next state number for each
\r
64 private RBBIRuleBuilder fRB;
\r
65 private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
\r
66 // for the parse tree to operate on.
\r
67 // Too bad Java can't do indirection more easily!
\r
69 private List fDStates; // D states (Aho's terminology)
\r
70 // Index is state number
\r
71 // Contents are RBBIStateDescriptor pointers.
\r
73 //-----------------------------------------------------------------------------
\r
75 // Constructor for RBBITableBuilder.
\r
77 // rootNode is an index into the array of root nodes that is held by
\r
78 // the overall RBBIRuleBuilder.
\r
79 //-----------------------------------------------------------------------------
\r
80 RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
\r
81 fRootIx = rootNodeIx;
\r
83 fDStates = new ArrayList();
\r
89 //-----------------------------------------------------------------------------
\r
91 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
\r
92 // table from the RBBI rules parse tree.
\r
94 //-----------------------------------------------------------------------------
\r
96 // If there were no rules, just return. This situation can easily arise
\r
97 // for the reverse rules.
\r
98 if (fRB.fTreeRoots[fRootIx]==null) {
\r
103 // Walk through the tree, replacing any references to $variables with a copy of the
\r
104 // parse tree for the substition expression.
\r
106 fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();
\r
107 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
\r
108 System.out.println("Parse tree after flattening variable references.");
\r
109 fRB.fTreeRoots[fRootIx].printTree(true);
\r
113 // If the rules contained any references to {bof}
\r
114 // add a {bof} <cat> <former root of tree> to the
\r
115 // tree. Means that all matches must start out with the
\r
116 // {bof} fake character.
\r
118 if (fRB.fSetBuilder.sawBOF()) {
\r
119 RBBINode bofTop = new RBBINode(RBBINode.opCat);
\r
120 RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
\r
121 bofTop.fLeftChild = bofLeaf;
\r
122 bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
\r
123 bofLeaf.fParent = bofTop;
\r
124 bofLeaf.fVal = 2; // Reserved value for {bof}.
\r
125 fRB.fTreeRoots[fRootIx] = bofTop;
\r
129 // Add a unique right-end marker to the expression.
\r
130 // Appears as a cat-node, left child being the original tree,
\r
131 // right child being the end marker.
\r
133 RBBINode cn = new RBBINode(RBBINode.opCat);
\r
134 cn.fLeftChild = fRB.fTreeRoots[fRootIx];
\r
135 fRB.fTreeRoots[fRootIx].fParent = cn;
\r
136 cn.fRightChild = new RBBINode(RBBINode.endMark);
\r
137 cn.fRightChild.fParent = cn;
\r
138 fRB.fTreeRoots[fRootIx] = cn;
\r
141 // Replace all references to UnicodeSets with the tree for the equivalent
\r
144 fRB.fTreeRoots[fRootIx].flattenSets();
\r
145 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
\r
146 System.out.println("Parse tree after flattening Unicode Set references.");
\r
147 fRB.fTreeRoots[fRootIx].printTree(true);
\r
152 // calculate the functions nullable, firstpos, lastpos and followpos on
\r
153 // nodes in the parse tree.
\r
154 // See the alogrithm description in Aho.
\r
155 // Understanding how this works by looking at the code alone will be
\r
156 // nearly impossible.
\r
158 calcNullable(fRB.fTreeRoots[fRootIx]);
\r
159 calcFirstPos(fRB.fTreeRoots[fRootIx]);
\r
160 calcLastPos(fRB.fTreeRoots[fRootIx]);
\r
161 calcFollowPos(fRB.fTreeRoots[fRootIx]);
\r
162 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
\r
163 System.out.print("\n");
\r
164 printPosSets(fRB.fTreeRoots[fRootIx]);
\r
168 // For "chained" rules, modify the followPos sets
\r
170 if (fRB.fChainRules) {
\r
171 calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
\r
175 // BOF (start of input) test fixup.
\r
177 if (fRB.fSetBuilder.sawBOF()) {
\r
182 // Build the DFA state transition tables.
\r
185 flagAcceptingStates();
\r
186 flagLookAheadStates();
\r
187 flagTaggedStates();
\r
190 // Update the global table of rule status {tag} values
\r
191 // The rule builder has a global vector of status values that are common
\r
192 // for all tables. Merge the ones from this table into the global set.
\r
194 mergeRuleStatusVals();
\r
196 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
\r
201 //-----------------------------------------------------------------------------
\r
203 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
\r
205 //-----------------------------------------------------------------------------
\r
206 void calcNullable(RBBINode n) {
\r
210 if (n.fType == RBBINode.setRef ||
\r
211 n.fType == RBBINode.endMark ) {
\r
212 // These are non-empty leaf node types.
\r
213 n.fNullable = false;
\r
217 if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
\r
218 // Lookahead marker node. It's a leaf, so no recursion on children.
\r
219 // It's nullable because it does not match any literal text from the input stream.
\r
220 n.fNullable = true;
\r
225 // The node is not a leaf.
\r
226 // Calculate nullable on its children.
\r
227 calcNullable(n.fLeftChild);
\r
228 calcNullable(n.fRightChild);
\r
230 // Apply functions from table 3.40 in Aho
\r
231 if (n.fType == RBBINode.opOr) {
\r
232 n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
\r
234 else if (n.fType == RBBINode.opCat) {
\r
235 n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
\r
237 else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
\r
238 n.fNullable = true;
\r
241 n.fNullable = false;
\r
248 //-----------------------------------------------------------------------------
\r
250 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
\r
252 //-----------------------------------------------------------------------------
\r
253 void calcFirstPos(RBBINode n) {
\r
257 if (n.fType == RBBINode.leafChar ||
\r
258 n.fType == RBBINode.endMark ||
\r
259 n.fType == RBBINode.lookAhead ||
\r
260 n.fType == RBBINode.tag) {
\r
261 // These are non-empty leaf node types.
\r
262 n.fFirstPosSet.add(n);
\r
266 // The node is not a leaf.
\r
267 // Calculate firstPos on its children.
\r
268 calcFirstPos(n.fLeftChild);
\r
269 calcFirstPos(n.fRightChild);
\r
271 // Apply functions from table 3.40 in Aho
\r
272 if (n.fType == RBBINode.opOr) {
\r
273 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
\r
274 n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
\r
276 else if (n.fType == RBBINode.opCat) {
\r
277 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
\r
278 if (n.fLeftChild.fNullable) {
\r
279 n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
\r
282 else if (n.fType == RBBINode.opStar ||
\r
283 n.fType == RBBINode.opQuestion ||
\r
284 n.fType == RBBINode.opPlus) {
\r
285 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
\r
291 //-----------------------------------------------------------------------------
\r
293 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
\r
295 //-----------------------------------------------------------------------------
\r
296 void calcLastPos(RBBINode n) {
\r
300 if (n.fType == RBBINode.leafChar ||
\r
301 n.fType == RBBINode.endMark ||
\r
302 n.fType == RBBINode.lookAhead ||
\r
303 n.fType == RBBINode.tag) {
\r
304 // These are non-empty leaf node types.
\r
305 n.fLastPosSet.add(n);
\r
309 // The node is not a leaf.
\r
310 // Calculate lastPos on its children.
\r
311 calcLastPos(n.fLeftChild);
\r
312 calcLastPos(n.fRightChild);
\r
314 // Apply functions from table 3.40 in Aho
\r
315 if (n.fType == RBBINode.opOr) {
\r
316 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
\r
317 n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
\r
319 else if (n.fType == RBBINode.opCat) {
\r
320 n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
\r
321 if (n.fRightChild.fNullable) {
\r
322 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
\r
325 else if (n.fType == RBBINode.opStar ||
\r
326 n.fType == RBBINode.opQuestion ||
\r
327 n.fType == RBBINode.opPlus) {
\r
328 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
\r
334 //-----------------------------------------------------------------------------
\r
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
\r
338 //-----------------------------------------------------------------------------
\r
339 void calcFollowPos(RBBINode n) {
\r
341 n.fType == RBBINode.leafChar ||
\r
342 n.fType == RBBINode.endMark) {
\r
346 calcFollowPos(n.fLeftChild);
\r
347 calcFollowPos(n.fRightChild);
\r
350 if (n.fType == RBBINode.opCat) {
\r
351 RBBINode i; // is 'i' in Aho's description
\r
353 Set LastPosOfLeftChild = n.fLeftChild.fLastPosSet;
\r
355 Iterator ix = LastPosOfLeftChild.iterator();
\r
356 while (ix.hasNext()) {
\r
357 i = (RBBINode )ix.next();
\r
358 i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
\r
363 if (n.fType == RBBINode.opStar ||
\r
364 n.fType == RBBINode.opPlus) {
\r
365 RBBINode i; // again, n and i are the names from Aho's description.
\r
366 Iterator ix = n.fLastPosSet.iterator();
\r
367 while (ix.hasNext()) {
\r
368 i = (RBBINode) ix.next();
\r
369 i.fFollowPos.addAll(n.fFirstPosSet);
\r
379 //-----------------------------------------------------------------------------
\r
381 // calcChainedFollowPos. Modify the previously calculated followPos sets
\r
382 // to implement rule chaining. NOT described by Aho
\r
384 //-----------------------------------------------------------------------------
\r
385 void calcChainedFollowPos(RBBINode tree) {
\r
387 List endMarkerNodes = new ArrayList();
\r
388 List leafNodes = new ArrayList();
\r
390 // get a list of all endmarker nodes.
\r
391 tree.findNodes(endMarkerNodes, RBBINode.endMark);
\r
393 // get a list all leaf nodes
\r
394 tree.findNodes(leafNodes, RBBINode.leafChar);
\r
396 // Get all nodes that can be the start a match, which is FirstPosition()
\r
397 // of the portion of the tree corresponding to user-written rules.
\r
398 // See the tree description in bofFixup().
\r
399 RBBINode userRuleRoot = tree;
\r
400 if (fRB.fSetBuilder.sawBOF()) {
\r
401 userRuleRoot = tree.fLeftChild.fRightChild;
\r
403 Assert.assrt(userRuleRoot != null);
\r
404 Set matchStartNodes = userRuleRoot.fFirstPosSet;
\r
407 // Iteratate over all leaf nodes,
\r
409 Iterator endNodeIx = leafNodes.iterator();
\r
411 while (endNodeIx.hasNext()) {
\r
412 RBBINode tNode = (RBBINode)endNodeIx.next();
\r
413 RBBINode endNode = null;
\r
415 // Identify leaf nodes that correspond to overall rule match positions.
\r
416 // These include an endMarkerNode in their followPos sets.
\r
417 Iterator i = endMarkerNodes.iterator();
\r
418 while (i.hasNext()) {
\r
419 RBBINode endMarkerNode = (RBBINode)i.next();
\r
420 if (tNode.fFollowPos.contains(endMarkerNode)) {
\r
425 if (endNode == null) {
\r
426 // node wasn't an end node. Try again with the next.
\r
430 // We've got a node that can end a match.
\r
432 // Line Break Specific hack: If this node's val correspond to the $CM char class,
\r
433 // don't chain from it.
\r
434 // TODO: Add rule syntax for this behavior, get specifics out of here and
\r
435 // into the rule file.
\r
436 if (fRB.fLBCMNoChain) {
\r
437 int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
\r
439 // c == -1 occurs with sets containing only the {eof} marker string.
\r
440 int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
\r
441 if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
\r
448 // Now iterate over the nodes that can start a match, looking for ones
\r
449 // with the same char class as our ending node.
\r
450 RBBINode startNode;
\r
451 Iterator startNodeIx = matchStartNodes.iterator();
\r
452 while (startNodeIx.hasNext()) {
\r
453 startNode = (RBBINode )startNodeIx.next();
\r
454 if (startNode.fType != RBBINode.leafChar) {
\r
458 if (endNode.fVal == startNode.fVal) {
\r
459 // The end val (character class) of one possible match is the
\r
460 // same as the start of another.
\r
462 // Add all nodes from the followPos of the start node to the
\r
463 // followPos set of the end node, which will have the effect of
\r
464 // letting matches transition from a match state at endNode
\r
465 // to the second char of a match starting with startNode.
\r
466 endNode.fFollowPos.addAll(startNode.fFollowPos);
\r
473 //-----------------------------------------------------------------------------
\r
475 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
\r
476 // Do an swizzle similar to chaining, modifying the followPos set of
\r
477 // the bofNode to include the followPos nodes from other {bot} nodes
\r
478 // scattered through the tree.
\r
480 // This function has much in common with calcChainedFollowPos().
\r
482 //-----------------------------------------------------------------------------
\r
485 // The parse tree looks like this ...
\r
486 // fTree root --. <cat>
\r
488 // <cat> <#end node>
\r
493 // We will be adding things to the followPos set of the <bofNode>
\r
495 RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
\r
496 Assert.assrt(bofNode.fType == RBBINode.leafChar);
\r
497 Assert.assrt(bofNode.fVal == 2);
\r
499 // Get all nodes that can be the start a match of the user-written rules
\r
500 // (excluding the fake bofNode)
\r
501 // We want the nodes that can start a match in the
\r
502 // part labeled "rest of tree"
\r
504 Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
\r
505 Iterator startNodeIt = matchStartNodes.iterator();
\r
506 while (startNodeIt.hasNext()) {
\r
507 RBBINode startNode = (RBBINode)startNodeIt.next();
\r
508 if (startNode.fType != RBBINode.leafChar) {
\r
512 if (startNode.fVal == bofNode.fVal) {
\r
513 // We found a leaf node corresponding to a {bof} that was
\r
514 // explicitly written into a rule.
\r
515 // Add everything from the followPos set of this node to the
\r
516 // followPos set of the fake bofNode at the start of the tree.
\r
518 bofNode.fFollowPos.addAll(startNode.fFollowPos);
\r
523 //-----------------------------------------------------------------------------
\r
525 // buildStateTable() Determine the set of runtime DFA states and the
\r
526 // transition tables for these states, by the algorithm
\r
527 // of fig. 3.44 in Aho.
\r
529 // Most of the comments are quotes of Aho's psuedo-code.
\r
531 //-----------------------------------------------------------------------------
\r
532 void buildStateTable() {
\r
534 // Add a dummy state 0 - the stop state. Not from Aho.
\r
535 int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
\r
536 RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
\r
537 fDStates.add(failState);
\r
539 // initially, the only unmarked state in Dstates is firstpos(root),
\r
540 // where toot is the root of the syntax tree for (r)#;
\r
541 RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
\r
542 initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
\r
543 fDStates.add(initialState);
\r
545 // while there is an unmarked state T in Dstates do begin
\r
547 RBBIStateDescriptor T = null;
\r
549 for (tx=1; tx<fDStates.size(); tx++) {
\r
550 RBBIStateDescriptor temp = (RBBIStateDescriptor )fDStates.get(tx);
\r
551 if (temp.fMarked == false) {
\r
563 // for each input symbol a do begin
\r
565 for (a = 1; a<=lastInputSymbol; a++) {
\r
566 // let U be the set of positions that are in followpos(p)
\r
567 // for some position p in T
\r
568 // such that the symbol at position p is a;
\r
571 Iterator pit = T.fPositions.iterator();
\r
572 while (pit.hasNext()) {
\r
573 p = (RBBINode )pit.next();
\r
574 if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
\r
578 U.addAll(p.fFollowPos);
\r
582 // if U is not empty and not in DStates then
\r
584 boolean UinDstates = false;
\r
586 Assert.assrt(U.size() > 0);
\r
588 for (ix=0; ix<fDStates.size(); ix++) {
\r
589 RBBIStateDescriptor temp2;
\r
590 temp2 = (RBBIStateDescriptor )fDStates.get(ix);
\r
591 if (U.equals(temp2.fPositions)) {
\r
592 U = temp2.fPositions;
\r
599 // Add U as an unmarked state to Dstates
\r
602 RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
\r
603 newState.fPositions = U;
\r
604 fDStates.add(newState);
\r
605 ux = fDStates.size()-1;
\r
608 // Dtran[T, a] := U;
\r
617 //-----------------------------------------------------------------------------
\r
619 // flagAcceptingStates Identify accepting states.
\r
620 // First get a list of all of the end marker nodes.
\r
621 // Then, for each state s,
\r
622 // if s contains one of the end marker nodes in its list of tree positions then
\r
623 // s is an accepting state.
\r
625 //-----------------------------------------------------------------------------
\r
626 void flagAcceptingStates() {
\r
627 List endMarkerNodes = new ArrayList();
\r
628 RBBINode endMarker;
\r
632 fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
\r
634 for (i=0; i<endMarkerNodes.size(); i++) {
\r
635 endMarker = (RBBINode)endMarkerNodes.get(i);
\r
636 for (n=0; n<fDStates.size(); n++) {
\r
637 RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
\r
638 //if (sd.fPositions.indexOf(endMarker) >= 0) {
\r
639 if (sd.fPositions.contains(endMarker)) {
\r
640 // Any non-zero value for fAccepting means this is an accepting node.
\r
641 // The value is what will be returned to the user as the break status.
\r
642 // If no other value was specified, force it to -1.
\r
644 if (sd.fAccepting==0) {
\r
645 // State hasn't been marked as accepting yet. Do it now.
\r
646 sd.fAccepting = endMarker.fVal;
\r
647 if (sd.fAccepting == 0) {
\r
648 sd.fAccepting = -1;
\r
651 if (sd.fAccepting==-1 && endMarker.fVal != 0) {
\r
652 // Both lookahead and non-lookahead accepting for this state.
\r
653 // Favor the look-ahead. Expedient for line break.
\r
654 // TODO: need a more elegant resolution for conflicting rules.
\r
655 sd.fAccepting = endMarker.fVal;
\r
658 // if sd.fAccepting already had a value other than 0 or -1, leave it be.
\r
660 // If the end marker node is from a look-ahead rule, set
\r
661 // the fLookAhead field or this state also.
\r
662 if (endMarker.fLookAheadEnd) {
\r
663 // TODO: don't change value if already set?
\r
664 // TODO: allow for more than one active look-ahead rule in engine.
\r
665 // Make value here an index to a side array in engine?
\r
666 sd.fLookAhead = sd.fAccepting;
\r
674 //-----------------------------------------------------------------------------
\r
676 // flagLookAheadStates Very similar to flagAcceptingStates, above.
\r
678 //-----------------------------------------------------------------------------
\r
679 void flagLookAheadStates() {
\r
680 List lookAheadNodes = new ArrayList();
\r
681 RBBINode lookAheadNode;
\r
685 fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
\r
686 for (i=0; i<lookAheadNodes.size(); i++) {
\r
687 lookAheadNode = (RBBINode )lookAheadNodes.get(i);
\r
689 for (n=0; n<fDStates.size(); n++) {
\r
690 RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
\r
691 if (sd.fPositions.contains(lookAheadNode)) {
\r
692 sd.fLookAhead = lookAheadNode.fVal;
\r
701 //-----------------------------------------------------------------------------
\r
703 // flagTaggedStates
\r
705 //-----------------------------------------------------------------------------
\r
706 void flagTaggedStates() {
\r
707 List tagNodes = new ArrayList();
\r
712 fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
\r
713 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
\r
714 tagNode = (RBBINode )tagNodes.get(i);
\r
716 for (n=0; n<fDStates.size(); n++) { // For each state s (row in the state table)
\r
717 RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
\r
718 if (sd.fPositions.contains(tagNode)) { // if s include the tag node t
\r
719 sd.fTagVals.add(new Integer(tagNode.fVal));
\r
727 //-----------------------------------------------------------------------------
\r
729 // mergeRuleStatusVals
\r
731 // Allocate positions in the global array of rule status {tag} values
\r
733 // The RBBI runtime uses an array of {sets of status values} that can
\r
734 // be returned for boundaries. Each accepting state that has non-zero
\r
735 // status includes an index into this array. The format of the array
\r
737 // Num of status values in group 1
\r
741 // Num of status vals in group 2
\r
748 //-----------------------------------------------------------------------------
\r
750 void mergeRuleStatusVals() {
\r
752 // The basic outline of what happens here is this...
\r
754 // for each state in this state table
\r
755 // if the status tag list for this state is in the global statuses list
\r
756 // record where and
\r
757 // continue with the next state
\r
759 // add the tag list for this state to the global list.
\r
763 // Pre-load a single tag of {0} into the table.
\r
764 // We will need this as a default, for rule sets with no explicit tagging,
\r
765 // or with explicit tagging of {0}.
\r
766 if (fRB.fRuleStatusVals.size() == 0) {
\r
767 fRB.fRuleStatusVals.add(new Integer(1)); // Num of statuses in group
\r
768 fRB.fRuleStatusVals.add(new Integer(0)); // and our single status of zero
\r
770 SortedSet s0 = new TreeSet();
\r
771 Integer izero = new Integer(0);
\r
772 fRB.fStatusSets.put(s0, izero);
\r
773 SortedSet s1 = new TreeSet();
\r
775 fRB.fStatusSets.put(s0, izero);
\r
778 // For each state, check whether the state's status tag values are
\r
779 // already entered into the status values array, and add them if not.
\r
780 for (n=0; n<fDStates.size(); n++) {
\r
781 RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(n);
\r
782 Set statusVals = sd.fTagVals;
\r
783 Integer arrayIndexI = (Integer)fRB.fStatusSets.get(statusVals);
\r
784 if (arrayIndexI == null) {
\r
785 // This is the first encounter of this set of status values.
\r
786 // Add them to the statusSets map, This map associates
\r
787 // the set of status values with an index in the runtime status
\r
789 arrayIndexI = new Integer(fRB.fRuleStatusVals.size());
\r
790 fRB.fStatusSets.put(statusVals, arrayIndexI);
\r
792 // Add the new set of status values to the vector of values that
\r
793 // will eventually become the array used by the runtime engine.
\r
794 fRB.fRuleStatusVals.add(new Integer(statusVals.size()));
\r
795 Iterator it = statusVals.iterator();
\r
796 while (it.hasNext()) {
\r
797 fRB.fRuleStatusVals.add(it.next());
\r
802 // Save the runtime array index back into the state descriptor.
\r
803 sd.fTagsIdx = arrayIndexI.intValue();
\r
813 //-----------------------------------------------------------------------------
\r
815 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
\r
816 // for each node in the tree.
\r
818 //-----------------------------------------------------------------------------
\r
820 void printPosSets(RBBINode n) {
\r
824 RBBINode.printNode(n);
\r
825 System.out.print(" Nullable: " + n.fNullable);
\r
827 System.out.print(" firstpos: ");
\r
828 printSet(n.fFirstPosSet);
\r
830 System.out.print(" lastpos: ");
\r
831 printSet(n.fLastPosSet);
\r
833 System.out.print(" followpos: ");
\r
834 printSet(n.fFollowPos);
\r
836 printPosSets(n.fLeftChild);
\r
837 printPosSets(n.fRightChild);
\r
843 //-----------------------------------------------------------------------------
\r
845 // getTableSize() Calculate the size in bytes of the runtime form of this
\r
846 // state transition table.
\r
848 // Note: Refer to common/rbbidata.h from ICU4C for the declarations
\r
849 // of the structures being matched by this calculation.
\r
851 //-----------------------------------------------------------------------------
\r
852 int getTableSize() {
\r
858 if (fRB.fTreeRoots[fRootIx] == null) {
\r
862 size = /*sizeof(RBBIStateTable) - 4 */ 16; // The header, with no rows to the table.
\r
864 numRows = fDStates.size();
\r
865 numCols = fRB.fSetBuilder.getNumCharCategories();
\r
867 // Note The declaration of RBBIStateTableRow is for a table of two columns.
\r
868 // Therefore we subtract two from numCols when determining
\r
869 // how much storage to add to a row for the total columns.
\r
870 // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
\r
871 rowSize = 8 + 2*numCols;
\r
872 size += numRows * rowSize;
\r
873 while (size % 8 > 0) { // Size must be multiple of 8 bytes in size.
\r
882 //-----------------------------------------------------------------------------
\r
884 // exportTable() export the state transition table in the ICU4C format.
\r
886 // Most of the table is 16 bit shorts. This function exports
\r
887 // the whole thing as an array of shorts.
\r
889 // The size of the array must be rounded up to a multiple of
\r
892 // See struct RBBIStateTable in ICU4C, common/rbbidata.h
\r
894 //-----------------------------------------------------------------------------
\r
896 short [] exportTable() {
\r
900 if (fRB.fTreeRoots[fRootIx] == null) {
\r
901 return new short[0];
\r
904 Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
\r
905 fDStates.size() < 0x7fff);
\r
907 int numStates = fDStates.size();
\r
909 // Size of table size in shorts.
\r
910 // the "4" is the size of struct RBBIStateTableRow, the row header part only.
\r
911 int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
\r
912 int tableSize = getTableSize() / 2;
\r
915 short [] table = new short[tableSize];
\r
918 // Fill in the header fields.
\r
919 // Annoying because they really want to be ints, not shorts.
\r
921 // RBBIStateTable.fNumStates
\r
922 table[RBBIDataWrapper.NUMSTATES] = (short)(numStates >>> 16);
\r
923 table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);
\r
925 // RBBIStateTable.fRowLen
\r
926 table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16);
\r
927 table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
\r
929 // RBBIStateTable.fFlags
\r
931 if (fRB.fLookAheadHardBreak) {
\r
932 flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
\r
934 if (fRB.fSetBuilder.sawBOF()) {
\r
935 flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
\r
937 table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16);
\r
938 table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
\r
940 int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
\r
941 for (state=0; state<numStates; state++) {
\r
942 RBBIStateDescriptor sd = (RBBIStateDescriptor )fDStates.get(state);
\r
943 int row = 8 + state*rowLen;
\r
944 Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
\r
945 Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
\r
946 table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
\r
947 table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
\r
948 table[row + RBBIDataWrapper.TAGIDX] = (short)sd.fTagsIdx;
\r
949 for (col=0; col<numCharCategories; col++) {
\r
950 table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
\r
958 //-----------------------------------------------------------------------------
\r
960 // printSet Debug function. Print the contents of a set of Nodes
\r
962 //-----------------------------------------------------------------------------
\r
964 void printSet(Collection s) {
\r
965 Iterator it = s.iterator();
\r
966 while (it.hasNext()) {
\r
967 RBBINode n = (RBBINode)it.next();
\r
968 RBBINode.printInt(n.fSerialNum, 8);
\r
970 System.out.println();
\r
975 //-----------------------------------------------------------------------------
\r
977 // printStates Debug Function. Dump the fully constructed state transition table.
\r
979 //-----------------------------------------------------------------------------
\r
981 void printStates() {
\r
982 int c; // input "character"
\r
983 int n; // state number
\r
985 System.out.print("state | i n p u t s y m b o l s \n");
\r
986 System.out.print(" | Acc LA Tag");
\r
987 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
\r
988 RBBINode.printInt((int)c, 3);
\r
990 System.out.print("\n");
\r
991 System.out.print(" |---------------");
\r
992 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
\r
993 System.out.print("---");
\r
995 System.out.print("\n");
\r
997 for (n=0; n<fDStates.size(); n++) {
\r
998 RBBIStateDescriptor sd = (RBBIStateDescriptor)fDStates.get(n);
\r
999 RBBINode.printInt(n, 5);
\r
1000 System.out.print(" | ");
\r
1002 RBBINode.printInt(sd.fAccepting, 3);
\r
1003 RBBINode.printInt(sd.fLookAhead, 4);
\r
1004 RBBINode.printInt(sd.fTagsIdx, 6);
\r
1005 System.out.print(" ");
\r
1006 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
\r
1007 RBBINode.printInt(sd.fDtran[c], 3);
\r
1009 System.out.print("\n");
\r
1011 System.out.print("\n\n");
\r
1017 //-----------------------------------------------------------------------------
\r
1019 // printRuleStatusTable Debug Function. Dump the common rule status table
\r
1021 //-----------------------------------------------------------------------------
\r
1023 void printRuleStatusTable() {
\r
1024 int thisRecord = 0;
\r
1025 int nextRecord = 0;
\r
1027 List tbl = fRB.fRuleStatusVals;
\r
1029 System.out.print("index | tags \n");
\r
1030 System.out.print("-------------------\n");
\r
1032 while (nextRecord < tbl.size()) {
\r
1033 thisRecord = nextRecord;
\r
1034 nextRecord = thisRecord + ((Integer)tbl.get(thisRecord)).intValue() + 1;
\r
1035 RBBINode.printInt(thisRecord, 7);
\r
1036 for (i=thisRecord+1; i<nextRecord; i++) {
\r
1037 int val = ((Integer)tbl.get(i)).intValue();
\r
1038 RBBINode.printInt(val, 7);
\r
1040 System.out.print("\n");
\r
1042 System.out.print("\n\n");
\r