jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java

   1 /*\r
   2 **********************************************************************\r
   3 *   Copyright (c) 2002-2009, International Business Machines\r
   4 *   Corporation and others.  All Rights Reserved.\r
   5 **********************************************************************\r
   6 */\r
   7 \r
   8 package com.ibm.icu.text;\r
   9 \r
  10 import java.util.ArrayList;\r
  11 import java.util.Collection;\r
  12 import java.util.HashSet;\r
  13 import java.util.List;\r
  14 import java.util.Set;\r
  15 import java.util.SortedSet;\r
  16 import java.util.TreeSet;\r
  17 \r
  18 import com.ibm.icu.impl.Assert;\r
  19 import com.ibm.icu.lang.UCharacter;\r
  20 import com.ibm.icu.lang.UProperty;\r
  21 \r
  22 //\r
  23 //  class RBBITableBuilder is part of the RBBI rule compiler.\r
  24 //                         It builds the state transition table used by the RBBI runtime\r
  25 //                         from the expression syntax tree generated by the rule scanner.\r
  26 //\r
  27 //                         This class is part of the RBBI implementation only.\r
  28 //                         There is no user-visible public API here.\r
  29 //\r
  30 class RBBITableBuilder {\r
  31     \r
  32     \r
  33     \r
  34     //\r
  35     //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,\r
  36     //                        one for each state.\r
  37     static class RBBIStateDescriptor {\r
  38         boolean      fMarked;\r
  39         int          fAccepting;\r
  40         int          fLookAhead;\r
  41         SortedSet<Integer> fTagVals;\r
  42         int          fTagsIdx;\r
  43         Set<RBBINode> fPositions;                 // Set of parse tree positions associated\r
  44                                                   //   with this state.  Unordered (it's a set).\r
  45                                                   //   UVector contents are RBBINode *\r
  46 \r
  47         int[]        fDtran;                      // Transitions out of this state.\r
  48                                                    //   indexed by input character\r
  49                                                    //   contents is int index of dest state\r
  50                                                    //   in RBBITableBuilder.fDStates\r
  51 \r
  52         RBBIStateDescriptor(int maxInputSymbol) {\r
  53             fTagVals = new TreeSet<Integer>();\r
  54             fPositions = new HashSet<RBBINode>();\r
  55             fDtran = new int[maxInputSymbol+1];    // fDtran needs to be pre-sized.\r
  56                                                     //   It is indexed by input symbols, and will\r
  57                                                     //   hold  the next state number for each\r
  58                                                     //   symbol.\r
  59         }\r
  60     }\r
  61     \r
  62     \r
  63     private  RBBIRuleBuilder  fRB;\r
  64     private  int             fRootIx;             // The array index into RBBIRuleBuilder.fTreeRoots\r
  65                                                    //   for the parse tree to operate on.\r
  66                                                    //   Too bad Java can't do indirection more easily!\r
  67 \r
  68     private  List<RBBIStateDescriptor> fDStates;    //  D states (Aho's terminology)\r
  69                                                     //  Index is state number\r
  70                                                     //  Contents are RBBIStateDescriptor pointers.\r
  71 \r
  72     //-----------------------------------------------------------------------------\r
  73     //\r
  74     //  Constructor    for RBBITableBuilder.\r
  75     //\r
  76     //                 rootNode is an index into the array of root nodes that is held by\r
  77     //                          the overall RBBIRuleBuilder.\r
  78     //-----------------------------------------------------------------------------\r
  79     RBBITableBuilder(RBBIRuleBuilder rb,  int rootNodeIx)  {\r
  80            fRootIx     = rootNodeIx;\r
  81            fRB         = rb;\r
  82            fDStates    = new ArrayList<RBBIStateDescriptor>();\r
  83         }\r
  84 \r
  85 \r
  86 \r
  87  \r
  88        //-----------------------------------------------------------------------------\r
  89        //\r
  90        //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion\r
  91        //                               table from the RBBI rules parse tree.\r
  92        //\r
  93        //-----------------------------------------------------------------------------\r
  94        void  build() {\r
  95            // If there were no rules, just return.  This situation can easily arise\r
  96            //   for the reverse rules.\r
  97            if (fRB.fTreeRoots[fRootIx]==null) {\r
  98                return;\r
  99            }\r
 100 \r
 101            //\r
 102            // Walk through the tree, replacing any references to $variables with a copy of the\r
 103            //   parse tree for the substition expression.\r
 104            //\r
 105            fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables();\r
 106            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {\r
 107                System.out.println("Parse tree after flattening variable references.");\r
 108                fRB.fTreeRoots[fRootIx].printTree(true);\r
 109            }\r
 110 \r
 111            //\r
 112            // If the rules contained any references to {bof} \r
 113            //   add a {bof} <cat> <former root of tree> to the\r
 114            //   tree.  Means that all matches must start out with the \r
 115            //   {bof} fake character.\r
 116            // \r
 117            if (fRB.fSetBuilder.sawBOF()) {\r
 118                RBBINode bofTop    = new RBBINode(RBBINode.opCat);\r
 119                RBBINode bofLeaf   = new RBBINode(RBBINode.leafChar);\r
 120                bofTop.fLeftChild  = bofLeaf;\r
 121                bofTop.fRightChild = fRB.fTreeRoots[fRootIx];\r
 122                bofLeaf.fParent    = bofTop;\r
 123                bofLeaf.fVal       = 2;      // Reserved value for {bof}.\r
 124                fRB.fTreeRoots[fRootIx] = bofTop;\r
 125            }\r
 126 \r
 127            //\r
 128            // Add a unique right-end marker to the expression.\r
 129            //   Appears as a cat-node, left child being the original tree,\r
 130            //   right child being the end marker.\r
 131            //\r
 132            RBBINode cn = new RBBINode(RBBINode.opCat);\r
 133            cn.fLeftChild = fRB.fTreeRoots[fRootIx];\r
 134            fRB.fTreeRoots[fRootIx].fParent = cn;\r
 135            cn.fRightChild = new RBBINode(RBBINode.endMark);\r
 136            cn.fRightChild.fParent = cn;\r
 137            fRB.fTreeRoots[fRootIx] = cn;\r
 138 \r
 139            //\r
 140            //  Replace all references to UnicodeSets with the tree for the equivalent\r
 141            //      expression.\r
 142            //\r
 143            fRB.fTreeRoots[fRootIx].flattenSets();\r
 144            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {\r
 145                System.out.println("Parse tree after flattening Unicode Set references.");\r
 146                fRB.fTreeRoots[fRootIx].printTree(true);\r
 147            }\r
 148 \r
 149 \r
 150            //\r
 151            // calculate the functions nullable, firstpos, lastpos and followpos on\r
 152            // nodes in the parse tree.\r
 153            //    See the alogrithm description in Aho.\r
 154            //    Understanding how this works by looking at the code alone will be\r
 155            //       nearly impossible.\r
 156            //\r
 157            calcNullable(fRB.fTreeRoots[fRootIx]);\r
 158            calcFirstPos(fRB.fTreeRoots[fRootIx]);\r
 159            calcLastPos(fRB.fTreeRoots[fRootIx]);\r
 160            calcFollowPos(fRB.fTreeRoots[fRootIx]);\r
 161            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {\r
 162                System.out.print("\n");\r
 163                printPosSets(fRB.fTreeRoots[fRootIx]);\r
 164            }\r
 165 \r
 166            //\r
 167            //  For "chained" rules, modify the followPos sets\r
 168            //\r
 169            if (fRB.fChainRules) {\r
 170                calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);\r
 171            }\r
 172 \r
 173            //\r
 174            //  BOF (start of input) test fixup.\r
 175            //\r
 176            if (fRB.fSetBuilder.sawBOF()) {\r
 177                bofFixup();\r
 178            }\r
 179 \r
 180            //\r
 181            // Build the DFA state transition tables.\r
 182            //\r
 183            buildStateTable();\r
 184            flagAcceptingStates();\r
 185            flagLookAheadStates();\r
 186            flagTaggedStates();\r
 187 \r
 188            //\r
 189            // Update the global table of rule status {tag} values\r
 190            // The rule builder has a global vector of status values that are common\r
 191            //    for all tables.  Merge the ones from this table into the global set.\r
 192            //\r
 193            mergeRuleStatusVals();\r
 194 \r
 195            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}\r
 196        }\r
 197 \r
 198 \r
 199 \r
 200        //-----------------------------------------------------------------------------\r
 201        //\r
 202        //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9\r
 203        //\r
 204        //-----------------------------------------------------------------------------\r
 205        void calcNullable(RBBINode n) {\r
 206            if (n == null) {\r
 207                return;\r
 208            }\r
 209            if (n.fType == RBBINode.setRef ||\r
 210                n.fType == RBBINode.endMark ) {\r
 211                // These are non-empty leaf node types.\r
 212                n.fNullable = false;\r
 213                return;\r
 214            }\r
 215 \r
 216            if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {\r
 217                // Lookahead marker node.  It's a leaf, so no recursion on children.\r
 218                // It's nullable because it does not match any literal text from the input stream.\r
 219                n.fNullable = true;\r
 220                return;\r
 221            }\r
 222 \r
 223 \r
 224            // The node is not a leaf.\r
 225            //  Calculate nullable on its children.\r
 226            calcNullable(n.fLeftChild);\r
 227            calcNullable(n.fRightChild);\r
 228 \r
 229            // Apply functions from table 3.40 in Aho\r
 230            if (n.fType == RBBINode.opOr) {\r
 231                n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;\r
 232            }\r
 233            else if (n.fType == RBBINode.opCat) {\r
 234                n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;\r
 235            }\r
 236            else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {\r
 237                n.fNullable = true;\r
 238            }\r
 239            else {\r
 240                n.fNullable = false;\r
 241            }\r
 242        }\r
 243 \r
 244 \r
 245 \r
 246 \r
 247        //-----------------------------------------------------------------------------\r
 248        //\r
 249        //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9\r
 250        //\r
 251        //-----------------------------------------------------------------------------\r
 252        void calcFirstPos(RBBINode n) {\r
 253            if (n == null) {\r
 254                return;\r
 255            }\r
 256            if (n.fType == RBBINode.leafChar  ||\r
 257                n.fType == RBBINode.endMark   ||\r
 258                n.fType == RBBINode.lookAhead ||\r
 259                n.fType == RBBINode.tag) {\r
 260                // These are non-empty leaf node types.\r
 261                n.fFirstPosSet.add(n);\r
 262                return;\r
 263            }\r
 264 \r
 265            // The node is not a leaf.\r
 266            //  Calculate firstPos on its children.\r
 267            calcFirstPos(n.fLeftChild);\r
 268            calcFirstPos(n.fRightChild);\r
 269 \r
 270            // Apply functions from table 3.40 in Aho\r
 271            if (n.fType == RBBINode.opOr) {\r
 272                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);\r
 273                n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);\r
 274            }\r
 275            else if (n.fType == RBBINode.opCat) {\r
 276                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);\r
 277                if (n.fLeftChild.fNullable) {\r
 278                    n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);\r
 279                }\r
 280            }\r
 281            else if (n.fType == RBBINode.opStar ||\r
 282                     n.fType == RBBINode.opQuestion ||\r
 283                     n.fType == RBBINode.opPlus) {\r
 284                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);\r
 285            }\r
 286        }\r
 287 \r
 288 \r
 289 \r
 290        //-----------------------------------------------------------------------------\r
 291        //\r
 292        //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9\r
 293        //\r
 294        //-----------------------------------------------------------------------------\r
 295        void calcLastPos(RBBINode n) {\r
 296            if (n == null) {\r
 297                return;\r
 298            }\r
 299            if (n.fType == RBBINode.leafChar  ||\r
 300                n.fType == RBBINode.endMark   ||\r
 301                n.fType == RBBINode.lookAhead ||\r
 302                n.fType == RBBINode.tag) {\r
 303                // These are non-empty leaf node types.\r
 304                n.fLastPosSet.add(n);\r
 305                return;\r
 306            }\r
 307 \r
 308            // The node is not a leaf.\r
 309            //  Calculate lastPos on its children.\r
 310            calcLastPos(n.fLeftChild);\r
 311            calcLastPos(n.fRightChild);\r
 312 \r
 313            // Apply functions from table 3.40 in Aho\r
 314            if (n.fType == RBBINode.opOr) {\r
 315                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);\r
 316                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);\r
 317            }\r
 318            else if (n.fType == RBBINode.opCat) {\r
 319                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);\r
 320                if (n.fRightChild.fNullable) {\r
 321                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);\r
 322                }\r
 323            }\r
 324            else if (n.fType == RBBINode.opStar     ||\r
 325                     n.fType == RBBINode.opQuestion ||\r
 326                     n.fType == RBBINode.opPlus) {\r
 327                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);\r
 328            }\r
 329        }\r
 330 \r
 331 \r
 332 \r
 333        //-----------------------------------------------------------------------------\r
 334        //\r
 335        //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9\r
 336        //\r
 337        //-----------------------------------------------------------------------------\r
 338        void calcFollowPos(RBBINode n) {\r
 339            if (n == null ||\r
 340                n.fType == RBBINode.leafChar ||\r
 341                n.fType == RBBINode.endMark) {\r
 342                return;\r
 343            }\r
 344 \r
 345            calcFollowPos(n.fLeftChild);\r
 346            calcFollowPos(n.fRightChild);\r
 347 \r
 348            // Aho rule #1\r
 349            if (n.fType == RBBINode.opCat) {\r
 350                for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {\r
 351                    i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);\r
 352                }\r
 353            }\r
 354 \r
 355            // Aho rule #2\r
 356            if (n.fType == RBBINode.opStar ||\r
 357                n.fType == RBBINode.opPlus) {\r
 358                for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {\r
 359                    i.fFollowPos.addAll(n.fFirstPosSet);\r
 360                }\r
 361            }\r
 362        }\r
 363 \r
 364 \r
 365        //-----------------------------------------------------------------------------\r
 366        //\r
 367        //   calcChainedFollowPos.    Modify the previously calculated followPos sets\r
 368        //                            to implement rule chaining.  NOT described by Aho\r
 369        //\r
 370        //-----------------------------------------------------------------------------\r
 371        void calcChainedFollowPos(RBBINode tree) {\r
 372 \r
 373            List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();\r
 374            List<RBBINode> leafNodes      = new ArrayList<RBBINode>();\r
 375 \r
 376             // get a list of all endmarker nodes.\r
 377            tree.findNodes(endMarkerNodes, RBBINode.endMark);\r
 378 \r
 379            // get a list all leaf nodes\r
 380            tree.findNodes(leafNodes, RBBINode.leafChar);\r
 381 \r
 382            // Get all nodes that can be the start a match, which is FirstPosition()\r
 383            // of the portion of the tree corresponding to user-written rules.\r
 384            // See the tree description in bofFixup().\r
 385            RBBINode userRuleRoot = tree;\r
 386            if (fRB.fSetBuilder.sawBOF()) {\r
 387                userRuleRoot = tree.fLeftChild.fRightChild;\r
 388            }\r
 389            Assert.assrt(userRuleRoot != null);\r
 390            Set<RBBINode> matchStartNodes = userRuleRoot.fFirstPosSet;\r
 391 \r
 392            // Iteratate over all leaf nodes,\r
 393            //\r
 394            for (RBBINode tNode : leafNodes) {\r
 395                RBBINode endNode = null;\r
 396 \r
 397                // Identify leaf nodes that correspond to overall rule match positions.\r
 398                //   These include an endMarkerNode in their followPos sets.\r
 399                for (RBBINode endMarkerNode : endMarkerNodes) {\r
 400                    if (tNode.fFollowPos.contains(endMarkerNode)) {\r
 401                        endNode = tNode;\r
 402                        break;\r
 403                    }\r
 404                }\r
 405                if (endNode == null) {\r
 406                    // node wasn't an end node.  Try again with the next.\r
 407                    continue;\r
 408                }\r
 409 \r
 410                // We've got a node that can end a match.\r
 411 \r
 412                // Line Break Specific hack:  If this node's val correspond to the $CM char class,\r
 413                //                            don't chain from it.\r
 414                // TODO:  Add rule syntax for this behavior, get specifics out of here and\r
 415                //        into the rule file.\r
 416                if (fRB.fLBCMNoChain) {\r
 417                    int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);\r
 418                    if (c != -1) {\r
 419                        // c == -1 occurs with sets containing only the {eof} marker string.\r
 420                        int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);\r
 421                        if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) {\r
 422                            continue;\r
 423                        }\r
 424                    }\r
 425                }\r
 426 \r
 427 \r
 428                // Now iterate over the nodes that can start a match, looking for ones\r
 429                //   with the same char class as our ending node.\r
 430                for (RBBINode startNode : matchStartNodes) {\r
 431                    if (startNode.fType != RBBINode.leafChar) {\r
 432                        continue;\r
 433                    }\r
 434 \r
 435                    if (endNode.fVal == startNode.fVal) {\r
 436                        // The end val (character class) of one possible match is the\r
 437                        //   same as the start of another.\r
 438 \r
 439                        // Add all nodes from the followPos of the start node to the\r
 440                        //  followPos set of the end node, which will have the effect of\r
 441                        //  letting matches transition from a match state at endNode\r
 442                        //  to the second char of a match starting with startNode.\r
 443                        endNode.fFollowPos.addAll(startNode.fFollowPos);\r
 444                    }\r
 445                }\r
 446            }\r
 447        }\r
 448 \r
 449 \r
 450        //-----------------------------------------------------------------------------\r
 451        //\r
 452        //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.\r
 453        //                Do an swizzle similar to chaining, modifying the followPos set of\r
 454        //                the bofNode to include the followPos nodes from other {bot} nodes\r
 455        //                scattered through the tree.\r
 456        //\r
 457        //                This function has much in common with calcChainedFollowPos().\r
 458        //\r
 459        //-----------------------------------------------------------------------------\r
 460        void bofFixup() {\r
 461            //\r
 462            //   The parse tree looks like this ...\r
 463            //         fTree root  --.       <cat>\r
 464            //                               /     \   \r
 465            //                            <cat>   <#end node>\r
 466            //                           /     \   \r
 467            //                     <bofNode>   rest\r
 468            //                               of tree\r
 469            //\r
 470            //    We will be adding things to the followPos set of the <bofNode>\r
 471            //\r
 472            RBBINode  bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;\r
 473            Assert.assrt(bofNode.fType == RBBINode.leafChar);\r
 474            Assert.assrt(bofNode.fVal == 2);\r
 475 \r
 476            // Get all nodes that can be the start a match of the user-written rules\r
 477            //  (excluding the fake bofNode)\r
 478            //  We want the nodes that can start a match in the\r
 479            //     part labeled "rest of tree"\r
 480            // \r
 481            Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;\r
 482            for (RBBINode startNode : matchStartNodes) {\r
 483                if (startNode.fType != RBBINode.leafChar) {\r
 484                    continue;\r
 485                }\r
 486 \r
 487                if (startNode.fVal == bofNode.fVal) {\r
 488                    //  We found a leaf node corresponding to a {bof} that was\r
 489                    //    explicitly written into a rule.\r
 490                    //  Add everything from the followPos set of this node to the\r
 491                    //    followPos set of the fake bofNode at the start of the tree.\r
 492                    //  \r
 493                    bofNode.fFollowPos.addAll(startNode.fFollowPos);\r
 494                }\r
 495            }\r
 496        }\r
 497 \r
 498        //-----------------------------------------------------------------------------\r
 499        //\r
 500        //   buildStateTable()    Determine the set of runtime DFA states and the\r
 501        //                        transition tables for these states, by the algorithm\r
 502        //                        of fig. 3.44 in Aho.\r
 503        //\r
 504        //                        Most of the comments are quotes of Aho's psuedo-code.\r
 505        //\r
 506        //-----------------------------------------------------------------------------\r
 507        void buildStateTable() {\r
 508            //\r
 509            // Add a dummy state 0 - the stop state.  Not from Aho.\r
 510            int      lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;\r
 511            RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);\r
 512            fDStates.add(failState);\r
 513 \r
 514            // initially, the only unmarked state in Dstates is firstpos(root),\r
 515            //       where toot is the root of the syntax tree for (r)#;\r
 516            RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);\r
 517            initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);\r
 518            fDStates.add(initialState);\r
 519 \r
 520            // while there is an unmarked state T in Dstates do begin\r
 521            for (;;) {\r
 522                RBBIStateDescriptor T = null;\r
 523                int              tx;\r
 524                for (tx=1; tx<fDStates.size(); tx++) {\r
 525                    RBBIStateDescriptor temp  = fDStates.get(tx);\r
 526                    if (temp.fMarked == false) {\r
 527                        T = temp;\r
 528                        break;\r
 529                    }\r
 530                }\r
 531                if (T == null) {\r
 532                    break;\r
 533                }\r
 534 \r
 535                // mark T;\r
 536                T.fMarked = true;\r
 537 \r
 538                // for each input symbol a do begin\r
 539                int  a;\r
 540                for (a = 1; a<=lastInputSymbol; a++) {\r
 541                    // let U be the set of positions that are in followpos(p)\r
 542                    //    for some position p in T\r
 543                    //    such that the symbol at position p is a;\r
 544                    Set<RBBINode> U = null;\r
 545                    for (RBBINode p : T.fPositions) {\r
 546                        if ((p.fType == RBBINode.leafChar) &&  (p.fVal == a)) {\r
 547                            if (U == null) {\r
 548                                U = new HashSet<RBBINode>();\r
 549                            }\r
 550                            U.addAll(p.fFollowPos);\r
 551                        }\r
 552                    }\r
 553 \r
 554                    // if U is not empty and not in DStates then\r
 555                    int  ux = 0;\r
 556                    boolean    UinDstates = false;\r
 557                    if (U != null) {\r
 558                        Assert.assrt(U.size() > 0);\r
 559                        int  ix;\r
 560                        for (ix=0; ix<fDStates.size(); ix++) {\r
 561                            RBBIStateDescriptor temp2;\r
 562                            temp2 = fDStates.get(ix);\r
 563                            if (U.equals(temp2.fPositions)) {\r
 564                                U  = temp2.fPositions;\r
 565                                ux = ix;\r
 566                                UinDstates = true;\r
 567                                break;\r
 568                            }\r
 569                        }\r
 570 \r
 571                        // Add U as an unmarked state to Dstates\r
 572                        if (!UinDstates)\r
 573                        {\r
 574                            RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);\r
 575                            newState.fPositions = U;\r
 576                            fDStates.add(newState);\r
 577                            ux = fDStates.size()-1;\r
 578                        }\r
 579 \r
 580                        // Dtran[T, a] := U;\r
 581                        T.fDtran[a] = ux;\r
 582                    }\r
 583                }\r
 584            }\r
 585        }\r
 586 \r
 587 \r
 588 \r
 589        //-----------------------------------------------------------------------------\r
 590        //\r
 591        //   flagAcceptingStates    Identify accepting states.\r
 592        //                          First get a list of all of the end marker nodes.\r
 593        //                          Then, for each state s,\r
 594        //                              if s contains one of the end marker nodes in its list of tree positions then\r
 595        //                                  s is an accepting state.\r
 596        //\r
 597        //-----------------------------------------------------------------------------\r
 598        void     flagAcceptingStates() {\r
 599            List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();\r
 600            RBBINode    endMarker;\r
 601            int     i;\r
 602            int     n;\r
 603 \r
 604            fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);\r
 605 \r
 606            for (i=0; i<endMarkerNodes.size(); i++) {\r
 607                endMarker = endMarkerNodes.get(i);\r
 608                for (n=0; n<fDStates.size(); n++) {\r
 609                    RBBIStateDescriptor sd = fDStates.get(n);\r
 610                    //if (sd.fPositions.indexOf(endMarker) >= 0) {\r
 611                    if (sd.fPositions.contains(endMarker)) {\r
 612                        // Any non-zero value for fAccepting means this is an accepting node.\r
 613                        // The value is what will be returned to the user as the break status.\r
 614                        // If no other value was specified, force it to -1.\r
 615 \r
 616                        if (sd.fAccepting==0) {\r
 617                         // State hasn't been marked as accepting yet.  Do it now.\r
 618                            sd.fAccepting = endMarker.fVal;\r
 619                            if (sd.fAccepting == 0) {\r
 620                                sd.fAccepting = -1;\r
 621                         }\r
 622                        }\r
 623                        if (sd.fAccepting==-1 && endMarker.fVal != 0) {\r
 624                         // Both lookahead and non-lookahead accepting for this state.\r
 625                         // Favor the look-ahead.  Expedient for line break.\r
 626                         // TODO:  need a more elegant resolution for conflicting rules.\r
 627                         sd.fAccepting = endMarker.fVal;\r
 628                     }\r
 629                         // implicit else:\r
 630                         // if sd.fAccepting already had a value other than 0 or -1, leave it be.\r
 631 \r
 632                        // If the end marker node is from a look-ahead rule, set\r
 633                        //   the fLookAhead field or this state also.\r
 634                        if (endMarker.fLookAheadEnd) {\r
 635                         // TODO:  don't change value if already set?\r
 636                         // TODO:  allow for more than one active look-ahead rule in engine.\r
 637                         //        Make value here an index to a side array in engine?\r
 638                            sd.fLookAhead = sd.fAccepting;\r
 639                        }\r
 640                    }\r
 641                }\r
 642            }\r
 643        }\r
 644 \r
 645 \r
 646        //-----------------------------------------------------------------------------\r
 647        //\r
 648        //    flagLookAheadStates   Very similar to flagAcceptingStates, above.\r
 649        //\r
 650        //-----------------------------------------------------------------------------\r
 651        void     flagLookAheadStates() {\r
 652            List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();\r
 653            RBBINode    lookAheadNode;\r
 654            int     i;\r
 655            int     n;\r
 656 \r
 657            fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);\r
 658            for (i=0; i<lookAheadNodes.size(); i++) {\r
 659                lookAheadNode = lookAheadNodes.get(i);\r
 660 \r
 661                for (n=0; n<fDStates.size(); n++) {\r
 662                    RBBIStateDescriptor sd = fDStates.get(n);\r
 663                    if (sd.fPositions.contains(lookAheadNode)) {\r
 664                        sd.fLookAhead = lookAheadNode.fVal;\r
 665                    }\r
 666                }\r
 667            }\r
 668        }\r
 669 \r
 670 \r
 671 \r
 672 \r
 673        //-----------------------------------------------------------------------------\r
 674        //\r
 675        //    flagTaggedStates\r
 676        //\r
 677        //-----------------------------------------------------------------------------\r
 678        void     flagTaggedStates() {\r
 679            List<RBBINode> tagNodes = new ArrayList<RBBINode>();\r
 680            RBBINode    tagNode;\r
 681            int     i;\r
 682            int     n;\r
 683 \r
 684            fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);\r
 685            for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)\r
 686                tagNode = tagNodes.get(i);\r
 687 \r
 688                for (n=0; n<fDStates.size(); n++) {              //    For each state  s (row in the state table)\r
 689                    RBBIStateDescriptor sd = fDStates.get(n);\r
 690                    if (sd.fPositions.contains(tagNode)) {       //       if  s include the tag node t\r
 691                        sd.fTagVals.add(Integer.valueOf(tagNode.fVal));\r
 692                    }\r
 693                }\r
 694            }\r
 695        }\r
 696 \r
 697 \r
 698 \r
 699        //-----------------------------------------------------------------------------\r
 700        //\r
 701        //  mergeRuleStatusVals\r
 702        //\r
 703        //      Allocate positions in the  global array of rule status {tag} values\r
 704        //\r
 705        //      The RBBI runtime uses an array of {sets of status values} that can\r
 706        //      be returned for boundaries.  Each accepting state that has non-zero\r
 707        //      status includes an index into this array.  The format of the array\r
 708        //      is \r
 709        //           Num of status values in group 1\r
 710        //              status val\r
 711        //              status val\r
 712        //              ...\r
 713        //           Num of status vals in group 2\r
 714        //              status val\r
 715        //              status val\r
 716        //              ...\r
 717        //           etc.\r
 718        //\r
 719        //\r
 720        //-----------------------------------------------------------------------------\r
 721        \r
 722        void  mergeRuleStatusVals() {\r
 723            //\r
 724            //  The basic outline of what happens here is this...\r
 725            //\r
 726            //    for each state in this state table\r
 727            //       if the status tag list for this state is in the global statuses list\r
 728            //           record where and\r
 729            //           continue with the next state\r
 730            //       else\r
 731            //           add the tag list for this state to the global list.\r
 732            //\r
 733            int n;\r
 734            \r
 735            // Pre-load a single tag of {0} into the table.\r
 736            //   We will need this as a default, for rule sets with no explicit tagging,\r
 737            //   or with explicit tagging of {0}.\r
 738            if (fRB.fRuleStatusVals.size() == 0) {\r
 739                fRB.fRuleStatusVals.add(Integer.valueOf(1));    // Num of statuses in group\r
 740                fRB.fRuleStatusVals.add(Integer.valueOf(0));    //   and our single status of zero\r
 741                \r
 742                SortedSet<Integer> s0 = new TreeSet<Integer>();\r
 743                Integer izero = Integer.valueOf(0);\r
 744                fRB.fStatusSets.put(s0, izero);\r
 745                SortedSet<Integer> s1 = new TreeSet<Integer>();\r
 746                s1.add(izero);\r
 747                fRB.fStatusSets.put(s0, izero);\r
 748            }\r
 749 \r
 750            //    For each state, check whether the state's status tag values are\r
 751            //       already entered into the status values array, and add them if not.\r
 752            for (n=0; n<fDStates.size(); n++) {\r
 753                RBBIStateDescriptor sd = fDStates.get(n);\r
 754                Set<Integer> statusVals = sd.fTagVals;\r
 755                Integer arrayIndexI = fRB.fStatusSets.get(statusVals);\r
 756                if (arrayIndexI == null) {\r
 757                    // This is the first encounter of this set of status values.\r
 758                    //   Add them to the statusSets map, This map associates\r
 759                    //   the set of status values with an index in the runtime status \r
 760                    //   values array.\r
 761                    arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size());\r
 762                    fRB.fStatusSets.put(statusVals, arrayIndexI);\r
 763                    \r
 764                    // Add the new set of status values to the vector of values that\r
 765                    //   will eventually become the array used by the runtime engine.\r
 766                    fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size()));\r
 767                    fRB.fRuleStatusVals.addAll(statusVals);\r
 768                }\r
 769                \r
 770                // Save the runtime array index back into the state descriptor.\r
 771                sd.fTagsIdx = arrayIndexI.intValue();\r
 772            }\r
 773        }\r
 774 \r
 775 \r
 776 \r
 777 \r
 778 \r
 779 \r
 780 \r
 781        //-----------------------------------------------------------------------------\r
 782        //\r
 783        //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos\r
 784        //                 for each node in the tree.\r
 785        //\r
 786        //-----------------------------------------------------------------------------\r
 787        \r
 788        void printPosSets(RBBINode n) {\r
 789            if (n==null) {\r
 790                return;\r
 791            }\r
 792            RBBINode.printNode(n);\r
 793            System.out.print("         Nullable:  " + n.fNullable);\r
 794 \r
 795            System.out.print("         firstpos:  ");\r
 796            printSet(n.fFirstPosSet);\r
 797 \r
 798            System.out.print("         lastpos:   ");\r
 799            printSet(n.fLastPosSet);\r
 800 \r
 801            System.out.print("         followpos: ");\r
 802            printSet(n.fFollowPos);\r
 803 \r
 804            printPosSets(n.fLeftChild);\r
 805            printPosSets(n.fRightChild);\r
 806        }\r
 807        \r
 808 \r
 809 \r
 810 \r
 811        //-----------------------------------------------------------------------------\r
 812        //\r
 813        //   getTableSize()    Calculate the size in bytes of the runtime form of this\r
 814        //                     state transition table.\r
 815        //\r
 816        //          Note:  Refer to common/rbbidata.h from ICU4C for the declarations\r
 817        //                 of the structures being matched by this calculation.\r
 818        //\r
 819        //-----------------------------------------------------------------------------\r
 820        int  getTableSize()  {\r
 821            int    size = 0;\r
 822            int    numRows;\r
 823            int    numCols;\r
 824            int    rowSize;\r
 825 \r
 826            if (fRB.fTreeRoots[fRootIx] == null) {\r
 827                return 0;\r
 828            }\r
 829 \r
 830            size    = /*sizeof(RBBIStateTable) - 4 */ 16;    // The header, with no rows to the table.\r
 831 \r
 832            numRows = fDStates.size();\r
 833            numCols = fRB.fSetBuilder.getNumCharCategories();\r
 834 \r
 835            //  Note  The declaration of RBBIStateTableRow is for a table of two columns.\r
 836            //        Therefore we subtract two from numCols when determining\r
 837            //        how much storage to add to a row for the total columns.\r
 838            // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);\r
 839            rowSize = 8 + 2*numCols;\r
 840            size   += numRows * rowSize;\r
 841            while (size % 8 > 0) {    // Size must be multiple of 8 bytes in size.\r
 842                size++;\r
 843            }\r
 844 \r
 845            return size;\r
 846        }\r
 847 \r
 848 \r
 849 \r
 850        //-----------------------------------------------------------------------------\r
 851        //\r
 852        //   exportTable()    export the state transition table in the ICU4C format.\r
 853        //\r
 854        //                    Most of the table is 16 bit shorts.  This function exports\r
 855        //                    the whole thing as an array of shorts.\r
 856        //\r
 857        //                    The size of the array must be rounded up to a multiple of\r
 858        //                    8 bytes.\r
 859        //\r
 860        //                    See struct RBBIStateTable in ICU4C, common/rbbidata.h\r
 861        //\r
 862        //-----------------------------------------------------------------------------\r
 863        \r
 864        short [] exportTable() {\r
 865            int                state;\r
 866            int                col;\r
 867 \r
 868            if (fRB.fTreeRoots[fRootIx] == null) {\r
 869                return new short[0];\r
 870            }\r
 871 \r
 872            Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&\r
 873                fDStates.size() < 0x7fff); \r
 874 \r
 875            int numStates = fDStates.size();\r
 876     \r
 877            // Size of table size in shorts.\r
 878            //  the "4" is the size of struct RBBIStateTableRow, the row header part only.\r
 879            int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories();\r
 880            int tableSize = getTableSize() / 2;\r
 881 \r
 882            \r
 883            short [] table = new short[tableSize];\r
 884            \r
 885            //\r
 886            // Fill in the header fields.\r
 887            //      Annoying because they really want to be ints, not shorts.\r
 888            //\r
 889            // RBBIStateTable.fNumStates\r
 890            table[RBBIDataWrapper.NUMSTATES]   = (short)(numStates >>> 16);\r
 891            table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff);\r
 892 \r
 893            // RBBIStateTable.fRowLen\r
 894            table[RBBIDataWrapper.ROWLEN]   = (short)(rowLen >>> 16);\r
 895            table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff);\r
 896            \r
 897            // RBBIStateTable.fFlags\r
 898            int flags = 0;\r
 899            if (fRB.fLookAheadHardBreak) {\r
 900                flags  |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;\r
 901            }\r
 902            if (fRB.fSetBuilder.sawBOF()) {\r
 903                flags  |= RBBIDataWrapper.RBBI_BOF_REQUIRED;\r
 904            }\r
 905            table[RBBIDataWrapper.FLAGS]   = (short)(flags >>> 16);\r
 906            table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff);\r
 907            \r
 908            int numCharCategories = fRB.fSetBuilder.getNumCharCategories();\r
 909            for (state=0; state<numStates; state++) {\r
 910                RBBIStateDescriptor sd = fDStates.get(state);\r
 911                int                row = 8 + state*rowLen;\r
 912                Assert.assrt (-32768 < sd.fAccepting && sd.fAccepting <= 32767);\r
 913                Assert.assrt (-32768 < sd.fLookAhead && sd.fLookAhead <= 32767);\r
 914                table[row + RBBIDataWrapper.ACCEPTING] = (short)sd.fAccepting;\r
 915                table[row + RBBIDataWrapper.LOOKAHEAD] = (short)sd.fLookAhead;\r
 916                table[row + RBBIDataWrapper.TAGIDX]    = (short)sd.fTagsIdx;\r
 917                for (col=0; col<numCharCategories; col++) {\r
 918                    table[row + RBBIDataWrapper.NEXTSTATES + col] = (short)sd.fDtran[col];\r
 919                }\r
 920            }\r
 921            return table;\r
 922        }\r
 923 \r
 924 \r
 925 \r
 926        //-----------------------------------------------------------------------------\r
 927        //\r
 928        //   printSet    Debug function.   Print the contents of a set of Nodes\r
 929        //\r
 930        //-----------------------------------------------------------------------------\r
 931        \r
 932        void printSet(Collection<RBBINode> s) {\r
 933            for (RBBINode n : s) {\r
 934                RBBINode.printInt(n.fSerialNum, 8);\r
 935            }\r
 936            System.out.println();\r
 937        }\r
 938        \r
 939 \r
 940 \r
 941        //-----------------------------------------------------------------------------\r
 942        //\r
 943        //   printStates    Debug Function.  Dump the fully constructed state transition table.\r
 944        //\r
 945        //-----------------------------------------------------------------------------\r
 946        \r
 947        void printStates() {\r
 948            int     c;    // input "character"\r
 949            int     n;    // state number\r
 950 \r
 951            System.out.print("state |           i n p u t     s y m b o l s \n");\r
 952            System.out.print("      | Acc  LA    Tag");\r
 953            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {\r
 954                RBBINode.printInt(c, 3);\r
 955            }\r
 956            System.out.print("\n");\r
 957            System.out.print("      |---------------");\r
 958            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {\r
 959                System.out.print("---");\r
 960            }\r
 961            System.out.print("\n");\r
 962 \r
 963            for (n=0; n<fDStates.size(); n++) {\r
 964                RBBIStateDescriptor sd = fDStates.get(n);\r
 965                RBBINode.printInt(n, 5);\r
 966                System.out.print(" | ");\r
 967                \r
 968                RBBINode.printInt(sd.fAccepting, 3);\r
 969                RBBINode.printInt(sd.fLookAhead, 4);\r
 970                RBBINode.printInt(sd.fTagsIdx, 6);\r
 971                System.out.print(" ");\r
 972                for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {\r
 973                    RBBINode.printInt(sd.fDtran[c], 3);\r
 974                }\r
 975                System.out.print("\n");\r
 976            }\r
 977            System.out.print("\n\n");\r
 978        }\r
 979        \r
 980 \r
 981 \r
 982 \r
 983        //-----------------------------------------------------------------------------\r
 984        //\r
 985        //   printRuleStatusTable    Debug Function.  Dump the common rule status table\r
 986        //\r
 987        //-----------------------------------------------------------------------------\r
 988        \r
 989        void printRuleStatusTable() {\r
 990            int  thisRecord = 0;\r
 991            int  nextRecord = 0;\r
 992            int      i;\r
 993            List<Integer> tbl = fRB.fRuleStatusVals;\r
 994 \r
 995            System.out.print("index |  tags \n");\r
 996            System.out.print("-------------------\n");\r
 997 \r
 998            while (nextRecord < tbl.size()) {\r
 999                thisRecord = nextRecord;\r
1000                nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;\r
1001                RBBINode.printInt(thisRecord, 7);\r
1002                for (i=thisRecord+1; i<nextRecord; i++) {\r
1003                    int val = tbl.get(i).intValue();\r
1004                    RBBINode.printInt(val, 7);\r
1005                }\r
1006                System.out.print("\n");\r
1007            }\r
1008            System.out.print("\n\n");\r
1009        }\r
1010        \r
1011 \r
1012 \r
1013 }\r