2 **********************************************************************
\r
3 * Copyright (c) 2002-2009, International Business Machines
\r
4 * Corporation and others. All Rights Reserved.
\r
5 **********************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.util.ArrayList;
\r
11 import java.util.Collection;
\r
12 import java.util.HashSet;
\r
13 import java.util.List;
\r
14 import java.util.Set;
\r
15 import java.util.SortedSet;
\r
16 import java.util.TreeSet;
\r
18 import com.ibm.icu.impl.Assert;
\r
19 import com.ibm.icu.lang.UCharacter;
\r
20 import com.ibm.icu.lang.UProperty;
\r
23 // class RBBITableBuilder is part of the RBBI rule compiler.
\r
24 // It builds the state transition table used by the RBBI runtime
\r
25 // from the expression syntax tree generated by the rule scanner.
\r
27 // This class is part of the RBBI implementation only.
\r
28 // There is no user-visible public API here.
\r
30 class RBBITableBuilder {
\r
35 // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
\r
36 // one for each state.
\r
37 static class RBBIStateDescriptor {
\r
41 SortedSet<Integer> fTagVals;
\r
43 Set<RBBINode> fPositions; // Set of parse tree positions associated
\r
44 // with this state. Unordered (it's a set).
\r
45 // UVector contents are RBBINode *
\r
47 int[] fDtran; // Transitions out of this state.
\r
48 // indexed by input character
\r
49 // contents is int index of dest state
\r
50 // in RBBITableBuilder.fDStates
\r
52 RBBIStateDescriptor(int maxInputSymbol) {
\r
53 fTagVals = new TreeSet<Integer>();
\r
54 fPositions = new HashSet<RBBINode>();
\r
55 fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
\r
56 // It is indexed by input symbols, and will
\r
57 // hold the next state number for each
\r
63 private RBBIRuleBuilder fRB;
\r
64 private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
\r
65 // for the parse tree to operate on.
\r
66 // Too bad Java can't do indirection more easily!
\r
68 private List<RBBIStateDescriptor> fDStates; // D states (Aho's terminology)
\r
69 // Index is state number
\r
70 // Contents are RBBIStateDescriptor pointers.
\r
72 //-----------------------------------------------------------------------------
\r
74 // Constructor for RBBITableBuilder.
\r
76 // rootNode is an index into the array of root nodes that is held by
\r
77 // the overall RBBIRuleBuilder.
\r
78 //-----------------------------------------------------------------------------
\r
79 RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
\r
80 fRootIx = rootNodeIx;
\r
82 fDStates = new ArrayList<RBBIStateDescriptor>();
\r
88 //-----------------------------------------------------------------------------
\r
90 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
\r
91 // table from the RBBI rules parse tree.
\r
93 //-----------------------------------------------------------------------------
\r
95 // If there were no rules, just return. This situation can easily arise
\r
96 // for the reverse rules.
\r
97 if (fRB.fTreeRoots[fRootIx]==null) {
\r
102 // Walk through the tree, replacing any references to $variables with a copy of the
\r
103 // parse tree for the substition expression.
\r
105 fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();
\r
106 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
\r
107 System.out.println("Parse tree after flattening variable references.");
\r
108 fRB.fTreeRoots[fRootIx].printTree(true);
\r
112 // If the rules contained any references to {bof}
\r
113 // add a {bof} <cat> <former root of tree> to the
\r
114 // tree. Means that all matches must start out with the
\r
115 // {bof} fake character.
\r
117 if (fRB.fSetBuilder.sawBOF()) {
\r
118 RBBINode bofTop = new RBBINode(RBBINode.opCat);
\r
119 RBBINode bofLeaf = new RBBINode(RBBINode.leafChar);
\r
120 bofTop.fLeftChild = bofLeaf;
\r
121 bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
\r
122 bofLeaf.fParent = bofTop;
\r
123 bofLeaf.fVal = 2; // Reserved value for {bof}.
\r
124 fRB.fTreeRoots[fRootIx] = bofTop;
\r
128 // Add a unique right-end marker to the expression.
\r
129 // Appears as a cat-node, left child being the original tree,
\r
130 // right child being the end marker.
\r
132 RBBINode cn = new RBBINode(RBBINode.opCat);
\r
133 cn.fLeftChild = fRB.fTreeRoots[fRootIx];
\r
134 fRB.fTreeRoots[fRootIx].fParent = cn;
\r
135 cn.fRightChild = new RBBINode(RBBINode.endMark);
\r
136 cn.fRightChild.fParent = cn;
\r
137 fRB.fTreeRoots[fRootIx] = cn;
\r
140 // Replace all references to UnicodeSets with the tree for the equivalent
\r
143 fRB.fTreeRoots[fRootIx].flattenSets();
\r
144 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
\r
145 System.out.println("Parse tree after flattening Unicode Set references.");
\r
146 fRB.fTreeRoots[fRootIx].printTree(true);
\r
151 // calculate the functions nullable, firstpos, lastpos and followpos on
\r
152 // nodes in the parse tree.
\r
153 // See the alogrithm description in Aho.
\r
154 // Understanding how this works by looking at the code alone will be
\r
155 // nearly impossible.
\r
157 calcNullable(fRB.fTreeRoots[fRootIx]);
\r
158 calcFirstPos(fRB.fTreeRoots[fRootIx]);
\r
159 calcLastPos(fRB.fTreeRoots[fRootIx]);
\r
160 calcFollowPos(fRB.fTreeRoots[fRootIx]);
\r
161 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
\r
162 System.out.print("\n");
\r
163 printPosSets(fRB.fTreeRoots[fRootIx]);
\r
167 // For "chained" rules, modify the followPos sets
\r
169 if (fRB.fChainRules) {
\r
170 calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
\r
174 // BOF (start of input) test fixup.
\r
176 if (fRB.fSetBuilder.sawBOF()) {
\r
181 // Build the DFA state transition tables.
\r
184 flagAcceptingStates();
\r
185 flagLookAheadStates();
\r
186 flagTaggedStates();
\r
189 // Update the global table of rule status {tag} values
\r
190 // The rule builder has a global vector of status values that are common
\r
191 // for all tables. Merge the ones from this table into the global set.
\r
193 mergeRuleStatusVals();
\r
195 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
\r
200 //-----------------------------------------------------------------------------
\r
202 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
\r
204 //-----------------------------------------------------------------------------
\r
205 void calcNullable(RBBINode n) {
\r
209 if (n.fType == RBBINode.setRef ||
\r
210 n.fType == RBBINode.endMark ) {
\r
211 // These are non-empty leaf node types.
\r
212 n.fNullable = false;
\r
216 if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
\r
217 // Lookahead marker node. It's a leaf, so no recursion on children.
\r
218 // It's nullable because it does not match any literal text from the input stream.
\r
219 n.fNullable = true;
\r
224 // The node is not a leaf.
\r
225 // Calculate nullable on its children.
\r
226 calcNullable(n.fLeftChild);
\r
227 calcNullable(n.fRightChild);
\r
229 // Apply functions from table 3.40 in Aho
\r
230 if (n.fType == RBBINode.opOr) {
\r
231 n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
\r
233 else if (n.fType == RBBINode.opCat) {
\r
234 n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
\r
236 else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
\r
237 n.fNullable = true;
\r
240 n.fNullable = false;
\r
247 //-----------------------------------------------------------------------------
\r
249 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
\r
251 //-----------------------------------------------------------------------------
\r
252 void calcFirstPos(RBBINode n) {
\r
256 if (n.fType == RBBINode.leafChar ||
\r
257 n.fType == RBBINode.endMark ||
\r
258 n.fType == RBBINode.lookAhead ||
\r
259 n.fType == RBBINode.tag) {
\r
260 // These are non-empty leaf node types.
\r
261 n.fFirstPosSet.add(n);
\r
265 // The node is not a leaf.
\r
266 // Calculate firstPos on its children.
\r
267 calcFirstPos(n.fLeftChild);
\r
268 calcFirstPos(n.fRightChild);
\r
270 // Apply functions from table 3.40 in Aho
\r
271 if (n.fType == RBBINode.opOr) {
\r
272 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
\r
273 n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
\r
275 else if (n.fType == RBBINode.opCat) {
\r
276 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
\r
277 if (n.fLeftChild.fNullable) {
\r
278 n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
\r
281 else if (n.fType == RBBINode.opStar ||
\r
282 n.fType == RBBINode.opQuestion ||
\r
283 n.fType == RBBINode.opPlus) {
\r
284 n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
\r
290 //-----------------------------------------------------------------------------
\r
292 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
\r
294 //-----------------------------------------------------------------------------
\r
295 void calcLastPos(RBBINode n) {
\r
299 if (n.fType == RBBINode.leafChar ||
\r
300 n.fType == RBBINode.endMark ||
\r
301 n.fType == RBBINode.lookAhead ||
\r
302 n.fType == RBBINode.tag) {
\r
303 // These are non-empty leaf node types.
\r
304 n.fLastPosSet.add(n);
\r
308 // The node is not a leaf.
\r
309 // Calculate lastPos on its children.
\r
310 calcLastPos(n.fLeftChild);
\r
311 calcLastPos(n.fRightChild);
\r
313 // Apply functions from table 3.40 in Aho
\r
314 if (n.fType == RBBINode.opOr) {
\r
315 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
\r
316 n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
\r
318 else if (n.fType == RBBINode.opCat) {
\r
319 n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
\r
320 if (n.fRightChild.fNullable) {
\r
321 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
\r
324 else if (n.fType == RBBINode.opStar ||
\r
325 n.fType == RBBINode.opQuestion ||
\r
326 n.fType == RBBINode.opPlus) {
\r
327 n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
\r
333 //-----------------------------------------------------------------------------
\r
335 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
\r
337 //-----------------------------------------------------------------------------
\r
338 void calcFollowPos(RBBINode n) {
\r
340 n.fType == RBBINode.leafChar ||
\r
341 n.fType == RBBINode.endMark) {
\r
345 calcFollowPos(n.fLeftChild);
\r
346 calcFollowPos(n.fRightChild);
\r
349 if (n.fType == RBBINode.opCat) {
\r
350 for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {
\r
351 i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
\r
356 if (n.fType == RBBINode.opStar ||
\r
357 n.fType == RBBINode.opPlus) {
\r
358 for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {
\r
359 i.fFollowPos.addAll(n.fFirstPosSet);
\r
365 //-----------------------------------------------------------------------------
\r
367 // calcChainedFollowPos. Modify the previously calculated followPos sets
\r
368 // to implement rule chaining. NOT described by Aho
\r
370 //-----------------------------------------------------------------------------
\r
371 void calcChainedFollowPos(RBBINode tree) {
\r
373 List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
\r
374 List<RBBINode> leafNodes = new ArrayList<RBBINode>();
\r
376 // get a list of all endmarker nodes.
\r
377 tree.findNodes(endMarkerNodes, RBBINode.endMark);
\r
379 // get a list all leaf nodes
\r
380 tree.findNodes(leafNodes, RBBINode.leafChar);
\r
382 // Get all nodes that can be the start a match, which is FirstPosition()
\r
383 // of the portion of the tree corresponding to user-written rules.
\r
384 // See the tree description in bofFixup().
\r
385 RBBINode userRuleRoot = tree;
\r
386 if (fRB.fSetBuilder.sawBOF()) {
\r
387 userRuleRoot = tree.fLeftChild.fRightChild;
\r
389 Assert.assrt(userRuleRoot != null);
\r
390 Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;
\r
392 // Iteratate over all leaf nodes,
\r
394 for (RBBINode tNode : leafNodes) {
\r
395 RBBINode endNode = null;
\r
397 // Identify leaf nodes that correspond to overall rule match positions.
\r
398 // These include an endMarkerNode in their followPos sets.
\r
399 for (RBBINode endMarkerNode : endMarkerNodes) {
\r
400 if (tNode.fFollowPos.contains(endMarkerNode)) {
\r
405 if (endNode == null) {
\r
406 // node wasn't an end node. Try again with the next.
\r
410 // We've got a node that can end a match.
\r
412 // Line Break Specific hack: If this node's val correspond to the $CM char class,
\r
413 // don't chain from it.
\r
414 // TODO: Add rule syntax for this behavior, get specifics out of here and
\r
415 // into the rule file.
\r
416 if (fRB.fLBCMNoChain) {
\r
417 int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
\r
419 // c == -1 occurs with sets containing only the {eof} marker string.
\r
420 int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
\r
421 if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
\r
428 // Now iterate over the nodes that can start a match, looking for ones
\r
429 // with the same char class as our ending node.
\r
430 for (RBBINode startNode : matchStartNodes) {
\r
431 if (startNode.fType != RBBINode.leafChar) {
\r
435 if (endNode.fVal == startNode.fVal) {
\r
436 // The end val (character class) of one possible match is the
\r
437 // same as the start of another.
\r
439 // Add all nodes from the followPos of the start node to the
\r
440 // followPos set of the end node, which will have the effect of
\r
441 // letting matches transition from a match state at endNode
\r
442 // to the second char of a match starting with startNode.
\r
443 endNode.fFollowPos.addAll(startNode.fFollowPos);
\r
450 //-----------------------------------------------------------------------------
\r
452 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
\r
453 // Do an swizzle similar to chaining, modifying the followPos set of
\r
454 // the bofNode to include the followPos nodes from other {bot} nodes
\r
455 // scattered through the tree.
\r
457 // This function has much in common with calcChainedFollowPos().
\r
459 //-----------------------------------------------------------------------------
\r
462 // The parse tree looks like this ...
\r
463 // fTree root --. <cat>
\r
465 // <cat> <#end node>
\r
470 // We will be adding things to the followPos set of the <bofNode>
\r
472 RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
\r
473 Assert.assrt(bofNode.fType == RBBINode.leafChar);
\r
474 Assert.assrt(bofNode.fVal == 2);
\r
476 // Get all nodes that can be the start a match of the user-written rules
\r
477 // (excluding the fake bofNode)
\r
478 // We want the nodes that can start a match in the
\r
479 // part labeled "rest of tree"
\r
481 Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
\r
482 for (RBBINode startNode : matchStartNodes) {
\r
483 if (startNode.fType != RBBINode.leafChar) {
\r
487 if (startNode.fVal == bofNode.fVal) {
\r
488 // We found a leaf node corresponding to a {bof} that was
\r
489 // explicitly written into a rule.
\r
490 // Add everything from the followPos set of this node to the
\r
491 // followPos set of the fake bofNode at the start of the tree.
\r
493 bofNode.fFollowPos.addAll(startNode.fFollowPos);
\r
498 //-----------------------------------------------------------------------------
\r
500 // buildStateTable() Determine the set of runtime DFA states and the
\r
501 // transition tables for these states, by the algorithm
\r
502 // of fig. 3.44 in Aho.
\r
504 // Most of the comments are quotes of Aho's psuedo-code.
\r
506 //-----------------------------------------------------------------------------
\r
507 void buildStateTable() {
\r
509 // Add a dummy state 0 - the stop state. Not from Aho.
\r
510 int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
\r
511 RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
\r
512 fDStates.add(failState);
\r
514 // initially, the only unmarked state in Dstates is firstpos(root),
\r
515 // where toot is the root of the syntax tree for (r)#;
\r
516 RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
\r
517 initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
\r
518 fDStates.add(initialState);
\r
520 // while there is an unmarked state T in Dstates do begin
\r
522 RBBIStateDescriptor T = null;
\r
524 for (tx=1; tx<fDStates.size(); tx++) {
\r
525 RBBIStateDescriptor temp = fDStates.get(tx);
\r
526 if (temp.fMarked == false) {
\r
538 // for each input symbol a do begin
\r
540 for (a = 1; a<=lastInputSymbol; a++) {
\r
541 // let U be the set of positions that are in followpos(p)
\r
542 // for some position p in T
\r
543 // such that the symbol at position p is a;
\r
544 Set<RBBINode> U = null;
\r
545 for (RBBINode p : T.fPositions) {
\r
546 if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
\r
548 U = new HashSet<RBBINode>();
\r
550 U.addAll(p.fFollowPos);
\r
554 // if U is not empty and not in DStates then
\r
556 boolean UinDstates = false;
\r
558 Assert.assrt(U.size() > 0);
\r
560 for (ix=0; ix<fDStates.size(); ix++) {
\r
561 RBBIStateDescriptor temp2;
\r
562 temp2 = fDStates.get(ix);
\r
563 if (U.equals(temp2.fPositions)) {
\r
564 U = temp2.fPositions;
\r
571 // Add U as an unmarked state to Dstates
\r
574 RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
\r
575 newState.fPositions = U;
\r
576 fDStates.add(newState);
\r
577 ux = fDStates.size()-1;
\r
580 // Dtran[T, a] := U;
\r
589 //-----------------------------------------------------------------------------
\r
591 // flagAcceptingStates Identify accepting states.
\r
592 // First get a list of all of the end marker nodes.
\r
593 // Then, for each state s,
\r
594 // if s contains one of the end marker nodes in its list of tree positions then
\r
595 // s is an accepting state.
\r
597 //-----------------------------------------------------------------------------
\r
598 void flagAcceptingStates() {
\r
599 List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
\r
600 RBBINode endMarker;
\r
604 fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
\r
606 for (i=0; i<endMarkerNodes.size(); i++) {
\r
607 endMarker = endMarkerNodes.get(i);
\r
608 for (n=0; n<fDStates.size(); n++) {
\r
609 RBBIStateDescriptor sd = fDStates.get(n);
\r
610 //if (sd.fPositions.indexOf(endMarker) >= 0) {
\r
611 if (sd.fPositions.contains(endMarker)) {
\r
612 // Any non-zero value for fAccepting means this is an accepting node.
\r
613 // The value is what will be returned to the user as the break status.
\r
614 // If no other value was specified, force it to -1.
\r
616 if (sd.fAccepting==0) {
\r
617 // State hasn't been marked as accepting yet. Do it now.
\r
618 sd.fAccepting = endMarker.fVal;
\r
619 if (sd.fAccepting == 0) {
\r
620 sd.fAccepting = -1;
\r
623 if (sd.fAccepting==-1 && endMarker.fVal != 0) {
\r
624 // Both lookahead and non-lookahead accepting for this state.
\r
625 // Favor the look-ahead. Expedient for line break.
\r
626 // TODO: need a more elegant resolution for conflicting rules.
\r
627 sd.fAccepting = endMarker.fVal;
\r
630 // if sd.fAccepting already had a value other than 0 or -1, leave it be.
\r
632 // If the end marker node is from a look-ahead rule, set
\r
633 // the fLookAhead field or this state also.
\r
634 if (endMarker.fLookAheadEnd) {
\r
635 // TODO: don't change value if already set?
\r
636 // TODO: allow for more than one active look-ahead rule in engine.
\r
637 // Make value here an index to a side array in engine?
\r
638 sd.fLookAhead = sd.fAccepting;
\r
646 //-----------------------------------------------------------------------------
\r
648 // flagLookAheadStates Very similar to flagAcceptingStates, above.
\r
650 //-----------------------------------------------------------------------------
\r
651 void flagLookAheadStates() {
\r
652 List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();
\r
653 RBBINode lookAheadNode;
\r
657 fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
\r
658 for (i=0; i<lookAheadNodes.size(); i++) {
\r
659 lookAheadNode = lookAheadNodes.get(i);
\r
661 for (n=0; n<fDStates.size(); n++) {
\r
662 RBBIStateDescriptor sd = fDStates.get(n);
\r
663 if (sd.fPositions.contains(lookAheadNode)) {
\r
664 sd.fLookAhead = lookAheadNode.fVal;
\r
673 //-----------------------------------------------------------------------------
\r
675 // flagTaggedStates
\r
677 //-----------------------------------------------------------------------------
\r
678 void flagTaggedStates() {
\r
679 List<RBBINode> tagNodes = new ArrayList<RBBINode>();
\r
684 fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
\r
685 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
\r
686 tagNode = tagNodes.get(i);
\r
688 for (n=0; n<fDStates.size(); n++) { // For each state s (row in the state table)
\r
689 RBBIStateDescriptor sd = fDStates.get(n);
\r
690 if (sd.fPositions.contains(tagNode)) { // if s include the tag node t
\r
691 sd.fTagVals.add(Integer.valueOf(tagNode.fVal));
\r
699 //-----------------------------------------------------------------------------
\r
701 // mergeRuleStatusVals
\r
703 // Allocate positions in the global array of rule status {tag} values
\r
705 // The RBBI runtime uses an array of {sets of status values} that can
\r
706 // be returned for boundaries. Each accepting state that has non-zero
\r
707 // status includes an index into this array. The format of the array
\r
709 // Num of status values in group 1
\r
713 // Num of status vals in group 2
\r
720 //-----------------------------------------------------------------------------
\r
722 void mergeRuleStatusVals() {
\r
724 // The basic outline of what happens here is this...
\r
726 // for each state in this state table
\r
727 // if the status tag list for this state is in the global statuses list
\r
728 // record where and
\r
729 // continue with the next state
\r
731 // add the tag list for this state to the global list.
\r
735 // Pre-load a single tag of {0} into the table.
\r
736 // We will need this as a default, for rule sets with no explicit tagging,
\r
737 // or with explicit tagging of {0}.
\r
738 if (fRB.fRuleStatusVals.size() == 0) {
\r
739 fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group
\r
740 fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero
\r
742 SortedSet<Integer> s0 = new TreeSet<Integer>();
\r
743 Integer izero = Integer.valueOf(0);
\r
744 fRB.fStatusSets.put(s0, izero);
\r
745 SortedSet<Integer> s1 = new TreeSet<Integer>();
\r
747 fRB.fStatusSets.put(s0, izero);
\r
750 // For each state, check whether the state's status tag values are
\r
751 // already entered into the status values array, and add them if not.
\r
752 for (n=0; n<fDStates.size(); n++) {
\r
753 RBBIStateDescriptor sd = fDStates.get(n);
\r
754 Set<Integer> statusVals = sd.fTagVals;
\r
755 Integer arrayIndexI = fRB.fStatusSets.get(statusVals);
\r
756 if (arrayIndexI == null) {
\r
757 // This is the first encounter of this set of status values.
\r
758 // Add them to the statusSets map, This map associates
\r
759 // the set of status values with an index in the runtime status
\r
761 arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());
\r
762 fRB.fStatusSets.put(statusVals, arrayIndexI);
\r
764 // Add the new set of status values to the vector of values that
\r
765 // will eventually become the array used by the runtime engine.
\r
766 fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));
\r
767 fRB.fRuleStatusVals.addAll(statusVals);
\r
770 // Save the runtime array index back into the state descriptor.
\r
771 sd.fTagsIdx = arrayIndexI.intValue();
\r
781 //-----------------------------------------------------------------------------
\r
783 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
\r
784 // for each node in the tree.
\r
786 //-----------------------------------------------------------------------------
\r
788 void printPosSets(RBBINode n) {
\r
792 RBBINode.printNode(n);
\r
793 System.out.print(" Nullable: " + n.fNullable);
\r
795 System.out.print(" firstpos: ");
\r
796 printSet(n.fFirstPosSet);
\r
798 System.out.print(" lastpos: ");
\r
799 printSet(n.fLastPosSet);
\r
801 System.out.print(" followpos: ");
\r
802 printSet(n.fFollowPos);
\r
804 printPosSets(n.fLeftChild);
\r
805 printPosSets(n.fRightChild);
\r
811 //-----------------------------------------------------------------------------
\r
813 // getTableSize() Calculate the size in bytes of the runtime form of this
\r
814 // state transition table.
\r
816 // Note: Refer to common/rbbidata.h from ICU4C for the declarations
\r
817 // of the structures being matched by this calculation.
\r
819 //-----------------------------------------------------------------------------
\r
820 int getTableSize() {
\r
826 if (fRB.fTreeRoots[fRootIx] == null) {
\r
830 size = /*sizeof(RBBIStateTable) - 4 */ 16; // The header, with no rows to the table.
\r
832 numRows = fDStates.size();
\r
833 numCols = fRB.fSetBuilder.getNumCharCategories();
\r
835 // Note The declaration of RBBIStateTableRow is for a table of two columns.
\r
836 // Therefore we subtract two from numCols when determining
\r
837 // how much storage to add to a row for the total columns.
\r
838 // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
\r
839 rowSize = 8 + 2*numCols;
\r
840 size += numRows * rowSize;
\r
841 while (size % 8 > 0) { // Size must be multiple of 8 bytes in size.
\r
850 //-----------------------------------------------------------------------------
\r
852 // exportTable() export the state transition table in the ICU4C format.
\r
854 // Most of the table is 16 bit shorts. This function exports
\r
855 // the whole thing as an array of shorts.
\r
857 // The size of the array must be rounded up to a multiple of
\r
860 // See struct RBBIStateTable in ICU4C, common/rbbidata.h
\r
862 //-----------------------------------------------------------------------------
\r
864 short [] exportTable() {
\r
868 if (fRB.fTreeRoots[fRootIx] == null) {
\r
869 return new short[0];
\r
872 Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
\r
873 fDStates.size() < 0x7fff);
\r
875 int numStates = fDStates.size();
\r
877 // Size of table size in shorts.
\r
878 // the "4" is the size of struct RBBIStateTableRow, the row header part only.
\r
879 int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
\r
880 int tableSize = getTableSize() / 2;
\r
883 short [] table = new short[tableSize];
\r
886 // Fill in the header fields.
\r
887 // Annoying because they really want to be ints, not shorts.
\r
889 // RBBIStateTable.fNumStates
\r
890 table[RBBIDataWrapper.NUMSTATES] = (short)(numStates >>> 16);
\r
891 table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);
\r
893 // RBBIStateTable.fRowLen
\r
894 table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16);
\r
895 table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
\r
897 // RBBIStateTable.fFlags
\r
899 if (fRB.fLookAheadHardBreak) {
\r
900 flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
\r
902 if (fRB.fSetBuilder.sawBOF()) {
\r
903 flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
\r
905 table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16);
\r
906 table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
\r
908 int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
\r
909 for (state=0; state<numStates; state++) {
\r
910 RBBIStateDescriptor sd = fDStates.get(state);
\r
911 int row = 8 + state*rowLen;
\r
912 Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
\r
913 Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
\r
914 table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
\r
915 table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
\r
916 table[row + RBBIDataWrapper.TAGIDX] = (short)sd.fTagsIdx;
\r
917 for (col=0; col<numCharCategories; col++) {
\r
918 table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
\r
926 //-----------------------------------------------------------------------------
\r
928 // printSet Debug function. Print the contents of a set of Nodes
\r
930 //-----------------------------------------------------------------------------
\r
932 void printSet(Collection<RBBINode> s) {
\r
933 for (RBBINode n : s) {
\r
934 RBBINode.printInt(n.fSerialNum, 8);
\r
936 System.out.println();
\r
941 //-----------------------------------------------------------------------------
\r
943 // printStates Debug Function. Dump the fully constructed state transition table.
\r
945 //-----------------------------------------------------------------------------
\r
947 void printStates() {
\r
948 int c; // input "character"
\r
949 int n; // state number
\r
951 System.out.print("state | i n p u t s y m b o l s \n");
\r
952 System.out.print(" | Acc LA Tag");
\r
953 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
\r
954 RBBINode.printInt(c, 3);
\r
956 System.out.print("\n");
\r
957 System.out.print(" |---------------");
\r
958 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
\r
959 System.out.print("---");
\r
961 System.out.print("\n");
\r
963 for (n=0; n<fDStates.size(); n++) {
\r
964 RBBIStateDescriptor sd = fDStates.get(n);
\r
965 RBBINode.printInt(n, 5);
\r
966 System.out.print(" | ");
\r
968 RBBINode.printInt(sd.fAccepting, 3);
\r
969 RBBINode.printInt(sd.fLookAhead, 4);
\r
970 RBBINode.printInt(sd.fTagsIdx, 6);
\r
971 System.out.print(" ");
\r
972 for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
\r
973 RBBINode.printInt(sd.fDtran[c], 3);
\r
975 System.out.print("\n");
\r
977 System.out.print("\n\n");
\r
983 //-----------------------------------------------------------------------------
\r
985 // printRuleStatusTable Debug Function. Dump the common rule status table
\r
987 //-----------------------------------------------------------------------------
\r
989 void printRuleStatusTable() {
\r
990 int thisRecord = 0;
\r
991 int nextRecord = 0;
\r
993 List<Integer> tbl = fRB.fRuleStatusVals;
\r
995 System.out.print("index | tags \n");
\r
996 System.out.print("-------------------\n");
\r
998 while (nextRecord < tbl.size()) {
\r
999 thisRecord = nextRecord;
\r
1000 nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;
\r
1001 RBBINode.printInt(thisRecord, 7);
\r
1002 for (i=thisRecord+1; i<nextRecord; i++) {
\r
1003 int val = tbl.get(i).intValue();
\r
1004 RBBINode.printInt(val, 7);
\r
1006 System.out.print("\n");
\r
1008 System.out.print("\n\n");
\r