]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / main / classes / core / src / com / ibm / icu / text / RBBITableBuilder.java
1 /*
2 **********************************************************************
3 *   Copyright (c) 2002-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8 package com.ibm.icu.text;
9
10 import java.util.ArrayList;
11 import java.util.Collection;
12 import java.util.HashSet;
13 import java.util.List;
14 import java.util.Set;
15 import java.util.SortedSet;
16 import java.util.TreeSet;
17
18 import com.ibm.icu.impl.Assert;
19 import com.ibm.icu.lang.UCharacter;
20 import com.ibm.icu.lang.UProperty;
21
22 //
23 //  class RBBITableBuilder is part of the RBBI rule compiler.
24 //                         It builds the state transition table used by the RBBI runtime
25 //                         from the expression syntax tree generated by the rule scanner.
26 //
27 //                         This class is part of the RBBI implementation only.
28 //                         There is no user-visible public API here.
29 //
30 class RBBITableBuilder {
31     
32     
33     
34     //
35     //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
36     //                        one for each state.
37     static class RBBIStateDescriptor {
38         boolean      fMarked;
39         int          fAccepting;
40         int          fLookAhead;
41         SortedSet<Integer> fTagVals;
42         int          fTagsIdx;
43         Set<RBBINode> fPositions;                 // Set of parse tree positions associated
44                                                   //   with this state.  Unordered (it's a set).
45                                                   //   UVector contents are RBBINode *
46
47         int[]        fDtran;                      // Transitions out of this state.
48                                                    //   indexed by input character
49                                                    //   contents is int index of dest state
50                                                    //   in RBBITableBuilder.fDStates
51
52         RBBIStateDescriptor(int maxInputSymbol) {
53             fTagVals = new TreeSet<Integer>();
54             fPositions = new HashSet<RBBINode>();
55             fDtran = new int[maxInputSymbol+1];    // fDtran needs to be pre-sized.
56                                                     //   It is indexed by input symbols, and will
57                                                     //   hold  the next state number for each
58                                                     //   symbol.
59         }
60     }
61     
62     
63     private  RBBIRuleBuilder  fRB;
64     private  int             fRootIx;             // The array index into RBBIRuleBuilder.fTreeRoots
65                                                    //   for the parse tree to operate on.
66                                                    //   Too bad Java can't do indirection more easily!
67
68     private  List<RBBIStateDescriptor> fDStates;    //  D states (Aho's terminology)
69                                                     //  Index is state number
70                                                     //  Contents are RBBIStateDescriptor pointers.
71
72     //-----------------------------------------------------------------------------
73     //
74     //  Constructor    for RBBITableBuilder.
75     //
76     //                 rootNode is an index into the array of root nodes that is held by
77     //                          the overall RBBIRuleBuilder.
78     //-----------------------------------------------------------------------------
79     RBBITableBuilder(RBBIRuleBuilder rb,  int rootNodeIx)  {
80            fRootIx     = rootNodeIx;
81            fRB         = rb;
82            fDStates    = new ArrayList<RBBIStateDescriptor>();
83         }
84
85
86
87  
88        //-----------------------------------------------------------------------------
89        //
90        //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
91        //                               table from the RBBI rules parse tree.
92        //
93        //-----------------------------------------------------------------------------
94        void  build() {
95            // If there were no rules, just return.  This situation can easily arise
96            //   for the reverse rules.
97            if (fRB.fTreeRoots[fRootIx]==null) {
98                return;
99            }
100
101            //
102            // Walk through the tree, replacing any references to $variables with a copy of the
103            //   parse tree for the substition expression.
104            //
105            fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();
106            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
107                System.out.println("Parse tree after flattening variable references.");
108                fRB.fTreeRoots[fRootIx].printTree(true);
109            }
110
111            //
112            // If the rules contained any references to {bof} 
113            //   add a {bof} <cat> <former root of tree> to the
114            //   tree.  Means that all matches must start out with the 
115            //   {bof} fake character.
116            // 
117            if (fRB.fSetBuilder.sawBOF()) {
118                RBBINode bofTop    = new RBBINode(RBBINode.opCat);
119                RBBINode bofLeaf   = new RBBINode(RBBINode.leafChar);
120                bofTop.fLeftChild  = bofLeaf;
121                bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
122                bofLeaf.fParent    = bofTop;
123                bofLeaf.fVal       = 2;      // Reserved value for {bof}.
124                fRB.fTreeRoots[fRootIx] = bofTop;
125            }
126
127            //
128            // Add a unique right-end marker to the expression.
129            //   Appears as a cat-node, left child being the original tree,
130            //   right child being the end marker.
131            //
132            RBBINode cn = new RBBINode(RBBINode.opCat);
133            cn.fLeftChild = fRB.fTreeRoots[fRootIx];
134            fRB.fTreeRoots[fRootIx].fParent = cn;
135            cn.fRightChild = new RBBINode(RBBINode.endMark);
136            cn.fRightChild.fParent = cn;
137            fRB.fTreeRoots[fRootIx] = cn;
138
139            //
140            //  Replace all references to UnicodeSets with the tree for the equivalent
141            //      expression.
142            //
143            fRB.fTreeRoots[fRootIx].flattenSets();
144            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
145                System.out.println("Parse tree after flattening Unicode Set references.");
146                fRB.fTreeRoots[fRootIx].printTree(true);
147            }
148
149
150            //
151            // calculate the functions nullable, firstpos, lastpos and followpos on
152            // nodes in the parse tree.
153            //    See the alogrithm description in Aho.
154            //    Understanding how this works by looking at the code alone will be
155            //       nearly impossible.
156            //
157            calcNullable(fRB.fTreeRoots[fRootIx]);
158            calcFirstPos(fRB.fTreeRoots[fRootIx]);
159            calcLastPos(fRB.fTreeRoots[fRootIx]);
160            calcFollowPos(fRB.fTreeRoots[fRootIx]);
161            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
162                System.out.print("\n");
163                printPosSets(fRB.fTreeRoots[fRootIx]);
164            }
165
166            //
167            //  For "chained" rules, modify the followPos sets
168            //
169            if (fRB.fChainRules) {
170                calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
171            }
172
173            //
174            //  BOF (start of input) test fixup.
175            //
176            if (fRB.fSetBuilder.sawBOF()) {
177                bofFixup();
178            }
179
180            //
181            // Build the DFA state transition tables.
182            //
183            buildStateTable();
184            flagAcceptingStates();
185            flagLookAheadStates();
186            flagTaggedStates();
187
188            //
189            // Update the global table of rule status {tag} values
190            // The rule builder has a global vector of status values that are common
191            //    for all tables.  Merge the ones from this table into the global set.
192            //
193            mergeRuleStatusVals();
194
195            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
196        }
197
198
199
200        //-----------------------------------------------------------------------------
201        //
202        //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
203        //
204        //-----------------------------------------------------------------------------
205        void calcNullable(RBBINode n) {
206            if (n == null) {
207                return;
208            }
209            if (n.fType == RBBINode.setRef ||
210                n.fType == RBBINode.endMark ) {
211                // These are non-empty leaf node types.
212                n.fNullable = false;
213                return;
214            }
215
216            if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
217                // Lookahead marker node.  It's a leaf, so no recursion on children.
218                // It's nullable because it does not match any literal text from the input stream.
219                n.fNullable = true;
220                return;
221            }
222
223
224            // The node is not a leaf.
225            //  Calculate nullable on its children.
226            calcNullable(n.fLeftChild);
227            calcNullable(n.fRightChild);
228
229            // Apply functions from table 3.40 in Aho
230            if (n.fType == RBBINode.opOr) {
231                n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
232            }
233            else if (n.fType == RBBINode.opCat) {
234                n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
235            }
236            else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
237                n.fNullable = true;
238            }
239            else {
240                n.fNullable = false;
241            }
242        }
243
244
245
246
247        //-----------------------------------------------------------------------------
248        //
249        //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
250        //
251        //-----------------------------------------------------------------------------
252        void calcFirstPos(RBBINode n) {
253            if (n == null) {
254                return;
255            }
256            if (n.fType == RBBINode.leafChar  ||
257                n.fType == RBBINode.endMark   ||
258                n.fType == RBBINode.lookAhead ||
259                n.fType == RBBINode.tag) {
260                // These are non-empty leaf node types.
261                n.fFirstPosSet.add(n);
262                return;
263            }
264
265            // The node is not a leaf.
266            //  Calculate firstPos on its children.
267            calcFirstPos(n.fLeftChild);
268            calcFirstPos(n.fRightChild);
269
270            // Apply functions from table 3.40 in Aho
271            if (n.fType == RBBINode.opOr) {
272                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
273                n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
274            }
275            else if (n.fType == RBBINode.opCat) {
276                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
277                if (n.fLeftChild.fNullable) {
278                    n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
279                }
280            }
281            else if (n.fType == RBBINode.opStar ||
282                     n.fType == RBBINode.opQuestion ||
283                     n.fType == RBBINode.opPlus) {
284                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
285            }
286        }
287
288
289
290        //-----------------------------------------------------------------------------
291        //
292        //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
293        //
294        //-----------------------------------------------------------------------------
295        void calcLastPos(RBBINode n) {
296            if (n == null) {
297                return;
298            }
299            if (n.fType == RBBINode.leafChar  ||
300                n.fType == RBBINode.endMark   ||
301                n.fType == RBBINode.lookAhead ||
302                n.fType == RBBINode.tag) {
303                // These are non-empty leaf node types.
304                n.fLastPosSet.add(n);
305                return;
306            }
307
308            // The node is not a leaf.
309            //  Calculate lastPos on its children.
310            calcLastPos(n.fLeftChild);
311            calcLastPos(n.fRightChild);
312
313            // Apply functions from table 3.40 in Aho
314            if (n.fType == RBBINode.opOr) {
315                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
316                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
317            }
318            else if (n.fType == RBBINode.opCat) {
319                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
320                if (n.fRightChild.fNullable) {
321                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
322                }
323            }
324            else if (n.fType == RBBINode.opStar     ||
325                     n.fType == RBBINode.opQuestion ||
326                     n.fType == RBBINode.opPlus) {
327                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
328            }
329        }
330
331
332
333        //-----------------------------------------------------------------------------
334        //
335        //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
336        //
337        //-----------------------------------------------------------------------------
338        void calcFollowPos(RBBINode n) {
339            if (n == null ||
340                n.fType == RBBINode.leafChar ||
341                n.fType == RBBINode.endMark) {
342                return;
343            }
344
345            calcFollowPos(n.fLeftChild);
346            calcFollowPos(n.fRightChild);
347
348            // Aho rule #1
349            if (n.fType == RBBINode.opCat) {
350                for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {
351                    i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
352                }
353            }
354
355            // Aho rule #2
356            if (n.fType == RBBINode.opStar ||
357                n.fType == RBBINode.opPlus) {
358                for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {
359                    i.fFollowPos.addAll(n.fFirstPosSet);
360                }
361            }
362        }
363
364
365        //-----------------------------------------------------------------------------
366        //
367        //   calcChainedFollowPos.    Modify the previously calculated followPos sets
368        //                            to implement rule chaining.  NOT described by Aho
369        //
370        //-----------------------------------------------------------------------------
371        void calcChainedFollowPos(RBBINode tree) {
372
373            List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
374            List<RBBINode> leafNodes      = new ArrayList<RBBINode>();
375
376             // get a list of all endmarker nodes.
377            tree.findNodes(endMarkerNodes, RBBINode.endMark);
378
379            // get a list all leaf nodes
380            tree.findNodes(leafNodes, RBBINode.leafChar);
381
382            // Get all nodes that can be the start a match, which is FirstPosition()
383            // of the portion of the tree corresponding to user-written rules.
384            // See the tree description in bofFixup().
385            RBBINode userRuleRoot = tree;
386            if (fRB.fSetBuilder.sawBOF()) {
387                userRuleRoot = tree.fLeftChild.fRightChild;
388            }
389            Assert.assrt(userRuleRoot != null);
390            Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;
391
392            // Iteratate over all leaf nodes,
393            //
394            for (RBBINode tNode : leafNodes) {
395                RBBINode endNode = null;
396
397                // Identify leaf nodes that correspond to overall rule match positions.
398                //   These include an endMarkerNode in their followPos sets.
399                for (RBBINode endMarkerNode : endMarkerNodes) {
400                    if (tNode.fFollowPos.contains(endMarkerNode)) {
401                        endNode = tNode;
402                        break;
403                    }
404                }
405                if (endNode == null) {
406                    // node wasn't an end node.  Try again with the next.
407                    continue;
408                }
409
410                // We've got a node that can end a match.
411
412                // Line Break Specific hack:  If this node's val correspond to the $CM char class,
413                //                            don't chain from it.
414                // TODO:  Add rule syntax for this behavior, get specifics out of here and
415                //        into the rule file.
416                if (fRB.fLBCMNoChain) {
417                    int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
418                    if (c != -1) {
419                        // c == -1 occurs with sets containing only the {eof} marker string.
420                        int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
421                        if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {
422                            continue;
423                        }
424                    }
425                }
426
427
428                // Now iterate over the nodes that can start a match, looking for ones
429                //   with the same char class as our ending node.
430                for (RBBINode startNode : matchStartNodes) {
431                    if (startNode.fType != RBBINode.leafChar) {
432                        continue;
433                    }
434
435                    if (endNode.fVal == startNode.fVal) {
436                        // The end val (character class) of one possible match is the
437                        //   same as the start of another.
438
439                        // Add all nodes from the followPos of the start node to the
440                        //  followPos set of the end node, which will have the effect of
441                        //  letting matches transition from a match state at endNode
442                        //  to the second char of a match starting with startNode.
443                        endNode.fFollowPos.addAll(startNode.fFollowPos);
444                    }
445                }
446            }
447        }
448
449
450        //-----------------------------------------------------------------------------
451        //
452        //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
453        //                Do an swizzle similar to chaining, modifying the followPos set of
454        //                the bofNode to include the followPos nodes from other {bot} nodes
455        //                scattered through the tree.
456        //
457        //                This function has much in common with calcChainedFollowPos().
458        //
459        //-----------------------------------------------------------------------------
460        void bofFixup() {
461            //
462            //   The parse tree looks like this ...
463            //         fTree root  --.       <cat>
464            //                               /     \   
465            //                            <cat>   <#end node>
466            //                           /     \   
467            //                     <bofNode>   rest
468            //                               of tree
469            //
470            //    We will be adding things to the followPos set of the <bofNode>
471            //
472            RBBINode  bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
473            Assert.assrt(bofNode.fType == RBBINode.leafChar);
474            Assert.assrt(bofNode.fVal == 2);
475
476            // Get all nodes that can be the start a match of the user-written rules
477            //  (excluding the fake bofNode)
478            //  We want the nodes that can start a match in the
479            //     part labeled "rest of tree"
480            // 
481            Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
482            for (RBBINode startNode : matchStartNodes) {
483                if (startNode.fType != RBBINode.leafChar) {
484                    continue;
485                }
486
487                if (startNode.fVal == bofNode.fVal) {
488                    //  We found a leaf node corresponding to a {bof} that was
489                    //    explicitly written into a rule.
490                    //  Add everything from the followPos set of this node to the
491                    //    followPos set of the fake bofNode at the start of the tree.
492                    //  
493                    bofNode.fFollowPos.addAll(startNode.fFollowPos);
494                }
495            }
496        }
497
498        //-----------------------------------------------------------------------------
499        //
500        //   buildStateTable()    Determine the set of runtime DFA states and the
501        //                        transition tables for these states, by the algorithm
502        //                        of fig. 3.44 in Aho.
503        //
504        //                        Most of the comments are quotes of Aho's psuedo-code.
505        //
506        //-----------------------------------------------------------------------------
507        void buildStateTable() {
508            //
509            // Add a dummy state 0 - the stop state.  Not from Aho.
510            int      lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
511            RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
512            fDStates.add(failState);
513
514            // initially, the only unmarked state in Dstates is firstpos(root),
515            //       where toot is the root of the syntax tree for (r)#;
516            RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
517            initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
518            fDStates.add(initialState);
519
520            // while there is an unmarked state T in Dstates do begin
521            for (;;) {
522                RBBIStateDescriptor T = null;
523                int              tx;
524                for (tx=1; tx<fDStates.size(); tx++) {
525                    RBBIStateDescriptor temp  = fDStates.get(tx);
526                    if (temp.fMarked == false) {
527                        T = temp;
528                        break;
529                    }
530                }
531                if (T == null) {
532                    break;
533                }
534
535                // mark T;
536                T.fMarked = true;
537
538                // for each input symbol a do begin
539                int  a;
540                for (a = 1; a<=lastInputSymbol; a++) {
541                    // let U be the set of positions that are in followpos(p)
542                    //    for some position p in T
543                    //    such that the symbol at position p is a;
544                    Set<RBBINode> U = null;
545                    for (RBBINode p : T.fPositions) {
546                        if ((p.fType == RBBINode.leafChar) &&  (p.fVal == a)) {
547                            if (U == null) {
548                                U = new HashSet<RBBINode>();
549                            }
550                            U.addAll(p.fFollowPos);
551                        }
552                    }
553
554                    // if U is not empty and not in DStates then
555                    int  ux = 0;
556                    boolean    UinDstates = false;
557                    if (U != null) {
558                        Assert.assrt(U.size() > 0);
559                        int  ix;
560                        for (ix=0; ix<fDStates.size(); ix++) {
561                            RBBIStateDescriptor temp2;
562                            temp2 = fDStates.get(ix);
563                            if (U.equals(temp2.fPositions)) {
564                                U  = temp2.fPositions;
565                                ux = ix;
566                                UinDstates = true;
567                                break;
568                            }
569                        }
570
571                        // Add U as an unmarked state to Dstates
572                        if (!UinDstates)
573                        {
574                            RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
575                            newState.fPositions = U;
576                            fDStates.add(newState);
577                            ux = fDStates.size()-1;
578                        }
579
580                        // Dtran[T, a] := U;
581                        T.fDtran[a] = ux;
582                    }
583                }
584            }
585        }
586
587
588
589        //-----------------------------------------------------------------------------
590        //
591        //   flagAcceptingStates    Identify accepting states.
592        //                          First get a list of all of the end marker nodes.
593        //                          Then, for each state s,
594        //                              if s contains one of the end marker nodes in its list of tree positions then
595        //                                  s is an accepting state.
596        //
597        //-----------------------------------------------------------------------------
598        void     flagAcceptingStates() {
599            List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
600            RBBINode    endMarker;
601            int     i;
602            int     n;
603
604            fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
605
606            for (i=0; i<endMarkerNodes.size(); i++) {
607                endMarker = endMarkerNodes.get(i);
608                for (n=0; n<fDStates.size(); n++) {
609                    RBBIStateDescriptor sd = fDStates.get(n);
610                    //if (sd.fPositions.indexOf(endMarker) >= 0) {
611                    if (sd.fPositions.contains(endMarker)) {
612                        // Any non-zero value for fAccepting means this is an accepting node.
613                        // The value is what will be returned to the user as the break status.
614                        // If no other value was specified, force it to -1.
615
616                        if (sd.fAccepting==0) {
617                         // State hasn't been marked as accepting yet.  Do it now.
618                            sd.fAccepting = endMarker.fVal;
619                            if (sd.fAccepting == 0) {
620                                sd.fAccepting = -1;
621                         }
622                        }
623                        if (sd.fAccepting==-1 && endMarker.fVal != 0) {
624                         // Both lookahead and non-lookahead accepting for this state.
625                         // Favor the look-ahead.  Expedient for line break.
626                         // TODO:  need a more elegant resolution for conflicting rules.
627                         sd.fAccepting = endMarker.fVal;
628                     }
629                         // implicit else:
630                         // if sd.fAccepting already had a value other than 0 or -1, leave it be.
631
632                        // If the end marker node is from a look-ahead rule, set
633                        //   the fLookAhead field or this state also.
634                        if (endMarker.fLookAheadEnd) {
635                         // TODO:  don't change value if already set?
636                         // TODO:  allow for more than one active look-ahead rule in engine.
637                         //        Make value here an index to a side array in engine?
638                            sd.fLookAhead = sd.fAccepting;
639                        }
640                    }
641                }
642            }
643        }
644
645
646        //-----------------------------------------------------------------------------
647        //
648        //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
649        //
650        //-----------------------------------------------------------------------------
651        void     flagLookAheadStates() {
652            List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();
653            RBBINode    lookAheadNode;
654            int     i;
655            int     n;
656
657            fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
658            for (i=0; i<lookAheadNodes.size(); i++) {
659                lookAheadNode = lookAheadNodes.get(i);
660
661                for (n=0; n<fDStates.size(); n++) {
662                    RBBIStateDescriptor sd = fDStates.get(n);
663                    if (sd.fPositions.contains(lookAheadNode)) {
664                        sd.fLookAhead = lookAheadNode.fVal;
665                    }
666                }
667            }
668        }
669
670
671
672
673        //-----------------------------------------------------------------------------
674        //
675        //    flagTaggedStates
676        //
677        //-----------------------------------------------------------------------------
678        void     flagTaggedStates() {
679            List<RBBINode> tagNodes = new ArrayList<RBBINode>();
680            RBBINode    tagNode;
681            int     i;
682            int     n;
683
684            fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
685            for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
686                tagNode = tagNodes.get(i);
687
688                for (n=0; n<fDStates.size(); n++) {              //    For each state  s (row in the state table)
689                    RBBIStateDescriptor sd = fDStates.get(n);
690                    if (sd.fPositions.contains(tagNode)) {       //       if  s include the tag node t
691                        sd.fTagVals.add(Integer.valueOf(tagNode.fVal));
692                    }
693                }
694            }
695        }
696
697
698
699        //-----------------------------------------------------------------------------
700        //
701        //  mergeRuleStatusVals
702        //
703        //      Allocate positions in the  global array of rule status {tag} values
704        //
705        //      The RBBI runtime uses an array of {sets of status values} that can
706        //      be returned for boundaries.  Each accepting state that has non-zero
707        //      status includes an index into this array.  The format of the array
708        //      is 
709        //           Num of status values in group 1
710        //              status val
711        //              status val
712        //              ...
713        //           Num of status vals in group 2
714        //              status val
715        //              status val
716        //              ...
717        //           etc.
718        //
719        //
720        //-----------------------------------------------------------------------------
721        
722        void  mergeRuleStatusVals() {
723            //
724            //  The basic outline of what happens here is this...
725            //
726            //    for each state in this state table
727            //       if the status tag list for this state is in the global statuses list
728            //           record where and
729            //           continue with the next state
730            //       else
731            //           add the tag list for this state to the global list.
732            //
733            int n;
734            
735            // Pre-load a single tag of {0} into the table.
736            //   We will need this as a default, for rule sets with no explicit tagging,
737            //   or with explicit tagging of {0}.
738            if (fRB.fRuleStatusVals.size() == 0) {
739                fRB.fRuleStatusVals.add(Integer.valueOf(1));    // Num of statuses in group
740                fRB.fRuleStatusVals.add(Integer.valueOf(0));    //   and our single status of zero
741                
742                SortedSet<Integer> s0 = new TreeSet<Integer>();
743                Integer izero = Integer.valueOf(0);
744                fRB.fStatusSets.put(s0, izero);
745                SortedSet<Integer> s1 = new TreeSet<Integer>();
746                s1.add(izero);
747                fRB.fStatusSets.put(s0, izero);
748            }
749
750            //    For each state, check whether the state's status tag values are
751            //       already entered into the status values array, and add them if not.
752            for (n=0; n<fDStates.size(); n++) {
753                RBBIStateDescriptor sd = fDStates.get(n);
754                Set<Integer> statusVals = sd.fTagVals;
755                Integer arrayIndexI = fRB.fStatusSets.get(statusVals);
756                if (arrayIndexI == null) {
757                    // This is the first encounter of this set of status values.
758                    //   Add them to the statusSets map, This map associates
759                    //   the set of status values with an index in the runtime status 
760                    //   values array.
761                    arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());
762                    fRB.fStatusSets.put(statusVals, arrayIndexI);
763                    
764                    // Add the new set of status values to the vector of values that
765                    //   will eventually become the array used by the runtime engine.
766                    fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));
767                    fRB.fRuleStatusVals.addAll(statusVals);
768                }
769                
770                // Save the runtime array index back into the state descriptor.
771                sd.fTagsIdx = arrayIndexI.intValue();
772            }
773        }
774
775
776
777
778
779
780
781        //-----------------------------------------------------------------------------
782        //
783        //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
784        //                 for each node in the tree.
785        //
786        //-----------------------------------------------------------------------------
787        
788        void printPosSets(RBBINode n) {
789            if (n==null) {
790                return;
791            }
792            RBBINode.printNode(n);
793            System.out.print("         Nullable:  " + n.fNullable);
794
795            System.out.print("         firstpos:  ");
796            printSet(n.fFirstPosSet);
797
798            System.out.print("         lastpos:   ");
799            printSet(n.fLastPosSet);
800
801            System.out.print("         followpos: ");
802            printSet(n.fFollowPos);
803
804            printPosSets(n.fLeftChild);
805            printPosSets(n.fRightChild);
806        }
807        
808
809
810
811        //-----------------------------------------------------------------------------
812        //
813        //   getTableSize()    Calculate the size in bytes of the runtime form of this
814        //                     state transition table.
815        //
816        //          Note:  Refer to common/rbbidata.h from ICU4C for the declarations
817        //                 of the structures being matched by this calculation.
818        //
819        //-----------------------------------------------------------------------------
820        int  getTableSize()  {
821            int    size = 0;
822            int    numRows;
823            int    numCols;
824            int    rowSize;
825
826            if (fRB.fTreeRoots[fRootIx] == null) {
827                return 0;
828            }
829
830            size    = /*sizeof(RBBIStateTable) - 4 */ 16;    // The header, with no rows to the table.
831
832            numRows = fDStates.size();
833            numCols = fRB.fSetBuilder.getNumCharCategories();
834
835            //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
836            //        Therefore we subtract two from numCols when determining
837            //        how much storage to add to a row for the total columns.
838            // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
839            rowSize = 8 + 2*numCols;
840            size   += numRows * rowSize;
841            while (size % 8 > 0) {    // Size must be multiple of 8 bytes in size.
842                size++;
843            }
844
845            return size;
846        }
847
848
849
850        //-----------------------------------------------------------------------------
851        //
852        //   exportTable()    export the state transition table in the ICU4C format.
853        //
854        //                    Most of the table is 16 bit shorts.  This function exports
855        //                    the whole thing as an array of shorts.
856        //
857        //                    The size of the array must be rounded up to a multiple of
858        //                    8 bytes.
859        //
860        //                    See struct RBBIStateTable in ICU4C, common/rbbidata.h
861        //
862        //-----------------------------------------------------------------------------
863        
864        short [] exportTable() {
865            int                state;
866            int                col;
867
868            if (fRB.fTreeRoots[fRootIx] == null) {
869                return new short[0];
870            }
871
872            Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
873                fDStates.size() < 0x7fff); 
874
875            int numStates = fDStates.size();
876     
877            // Size of table size in shorts.
878            //  the "4" is the size of struct RBBIStateTableRow, the row header part only.
879            int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();
880            int tableSize = getTableSize() / 2;
881
882            
883            short [] table = new short[tableSize];
884            
885            //
886            // Fill in the header fields.
887            //      Annoying because they really want to be ints, not shorts.
888            //
889            // RBBIStateTable.fNumStates
890            table[RBBIDataWrapper.NUMSTATES]   = (short)(numStates >>> 16);
891            table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);
892
893            // RBBIStateTable.fRowLen
894            table[RBBIDataWrapper.ROWLEN]   = (short)(rowLen >>> 16);
895            table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);
896            
897            // RBBIStateTable.fFlags
898            int flags = 0;
899            if (fRB.fLookAheadHardBreak) {
900                flags  |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
901            }
902            if (fRB.fSetBuilder.sawBOF()) {
903                flags  |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
904            }
905            table[RBBIDataWrapper.FLAGS]   = (short)(flags >>> 16);
906            table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);
907            
908            int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
909            for (state=0; state<numStates; state++) {
910                RBBIStateDescriptor sd = fDStates.get(state);
911                int                row = 8 + state*rowLen;
912                Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);
913                Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);
914                table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;
915                table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;
916                table[row + RBBIDataWrapper.TAGIDX]    = (short)sd.fTagsIdx;
917                for (col=0; col<numCharCategories; col++) {
918                    table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];
919                }
920            }
921            return table;
922        }
923
924
925
926        //-----------------------------------------------------------------------------
927        //
928        //   printSet    Debug function.   Print the contents of a set of Nodes
929        //
930        //-----------------------------------------------------------------------------
931        
932        void printSet(Collection<RBBINode> s) {
933            for (RBBINode n : s) {
934                RBBINode.printInt(n.fSerialNum, 8);
935            }
936            System.out.println();
937        }
938        
939
940
941        //-----------------------------------------------------------------------------
942        //
943        //   printStates    Debug Function.  Dump the fully constructed state transition table.
944        //
945        //-----------------------------------------------------------------------------
946        
947        void printStates() {
948            int     c;    // input "character"
949            int     n;    // state number
950
951            System.out.print("state |           i n p u t     s y m b o l s \n");
952            System.out.print("      | Acc  LA    Tag");
953            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
954                RBBINode.printInt(c, 3);
955            }
956            System.out.print("\n");
957            System.out.print("      |---------------");
958            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
959                System.out.print("---");
960            }
961            System.out.print("\n");
962
963            for (n=0; n<fDStates.size(); n++) {
964                RBBIStateDescriptor sd = fDStates.get(n);
965                RBBINode.printInt(n, 5);
966                System.out.print(" | ");
967                
968                RBBINode.printInt(sd.fAccepting, 3);
969                RBBINode.printInt(sd.fLookAhead, 4);
970                RBBINode.printInt(sd.fTagsIdx, 6);
971                System.out.print(" ");
972                for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
973                    RBBINode.printInt(sd.fDtran[c], 3);
974                }
975                System.out.print("\n");
976            }
977            System.out.print("\n\n");
978        }
979        
980
981
982
983        //-----------------------------------------------------------------------------
984        //
985        //   printRuleStatusTable    Debug Function.  Dump the common rule status table
986        //
987        //-----------------------------------------------------------------------------
988        
989        void printRuleStatusTable() {
990            int  thisRecord = 0;
991            int  nextRecord = 0;
992            int      i;
993            List<Integer> tbl = fRB.fRuleStatusVals;
994
995            System.out.print("index |  tags \n");
996            System.out.print("-------------------\n");
997
998            while (nextRecord < tbl.size()) {
999                thisRecord = nextRecord;
1000                nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;
1001                RBBINode.printInt(thisRecord, 7);
1002                for (i=thisRecord+1; i<nextRecord; i++) {
1003                    int val = tbl.get(i).intValue();
1004                    RBBINode.printInt(val, 7);
1005                }
1006                System.out.print("\n");
1007            }
1008            System.out.print("\n\n");
1009        }
1010        
1011
1012
1013 }