/* ********************************************************************** * Copyright (c) 2002-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ package com.ibm.icu.text; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import com.ibm.icu.impl.Assert; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; // // class RBBITableBuilder is part of the RBBI rule compiler. // It builds the state transition table used by the RBBI runtime // from the expression syntax tree generated by the rule scanner. // // This class is part of the RBBI implementation only. // There is no user-visible public API here. // class RBBITableBuilder { // // RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors, // one for each state. static class RBBIStateDescriptor { boolean fMarked; int fAccepting; int fLookAhead; SortedSet fTagVals; int fTagsIdx; Set fPositions; // Set of parse tree positions associated // with this state. Unordered (it's a set). // UVector contents are RBBINode * int[] fDtran; // Transitions out of this state. // indexed by input character // contents is int index of dest state // in RBBITableBuilder.fDStates RBBIStateDescriptor(int maxInputSymbol) { fTagVals = new TreeSet(); fPositions = new HashSet(); fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized. // It is indexed by input symbols, and will // hold the next state number for each // symbol. } } private RBBIRuleBuilder fRB; private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots // for the parse tree to operate on. // Too bad Java can't do indirection more easily! private List fDStates; // D states (Aho's terminology) // Index is state number // Contents are RBBIStateDescriptor pointers. //----------------------------------------------------------------------------- // // Constructor for RBBITableBuilder. // // rootNode is an index into the array of root nodes that is held by // the overall RBBIRuleBuilder. //----------------------------------------------------------------------------- RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) { fRootIx = rootNodeIx; fRB = rb; fDStates = new ArrayList(); } //----------------------------------------------------------------------------- // // RBBITableBuilder::build - This is the main function for building the DFA state transtion // table from the RBBI rules parse tree. // //----------------------------------------------------------------------------- void build() { // If there were no rules, just return. This situation can easily arise // for the reverse rules. if (fRB.fTreeRoots[fRootIx]==null) { return; } // // Walk through the tree, replacing any references to $variables with a copy of the // parse tree for the substition expression. // fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables(); if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) { System.out.println("Parse tree after flattening variable references."); fRB.fTreeRoots[fRootIx].printTree(true); } // // If the rules contained any references to {bof} // add a {bof} to the // tree. Means that all matches must start out with the // {bof} fake character. // if (fRB.fSetBuilder.sawBOF()) { RBBINode bofTop = new RBBINode(RBBINode.opCat); RBBINode bofLeaf = new RBBINode(RBBINode.leafChar); bofTop.fLeftChild = bofLeaf; bofTop.fRightChild = fRB.fTreeRoots[fRootIx]; bofLeaf.fParent = bofTop; bofLeaf.fVal = 2; // Reserved value for {bof}. fRB.fTreeRoots[fRootIx] = bofTop; } // // Add a unique right-end marker to the expression. // Appears as a cat-node, left child being the original tree, // right child being the end marker. // RBBINode cn = new RBBINode(RBBINode.opCat); cn.fLeftChild = fRB.fTreeRoots[fRootIx]; fRB.fTreeRoots[fRootIx].fParent = cn; cn.fRightChild = new RBBINode(RBBINode.endMark); cn.fRightChild.fParent = cn; fRB.fTreeRoots[fRootIx] = cn; // // Replace all references to UnicodeSets with the tree for the equivalent // expression. // fRB.fTreeRoots[fRootIx].flattenSets(); if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) { System.out.println("Parse tree after flattening Unicode Set references."); fRB.fTreeRoots[fRootIx].printTree(true); } // // calculate the functions nullable, firstpos, lastpos and followpos on // nodes in the parse tree. // See the alogrithm description in Aho. // Understanding how this works by looking at the code alone will be // nearly impossible. // calcNullable(fRB.fTreeRoots[fRootIx]); calcFirstPos(fRB.fTreeRoots[fRootIx]); calcLastPos(fRB.fTreeRoots[fRootIx]); calcFollowPos(fRB.fTreeRoots[fRootIx]); if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) { System.out.print("\n"); printPosSets(fRB.fTreeRoots[fRootIx]); } // // For "chained" rules, modify the followPos sets // if (fRB.fChainRules) { calcChainedFollowPos(fRB.fTreeRoots[fRootIx]); } // // BOF (start of input) test fixup. // if (fRB.fSetBuilder.sawBOF()) { bofFixup(); } // // Build the DFA state transition tables. // buildStateTable(); flagAcceptingStates(); flagLookAheadStates(); flagTaggedStates(); // // Update the global table of rule status {tag} values // The rule builder has a global vector of status values that are common // for all tables. Merge the ones from this table into the global set. // mergeRuleStatusVals(); if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();} } //----------------------------------------------------------------------------- // // calcNullable. Impossible to explain succinctly. See Aho, section 3.9 // //----------------------------------------------------------------------------- void calcNullable(RBBINode n) { if (n == null) { return; } if (n.fType == RBBINode.setRef || n.fType == RBBINode.endMark ) { // These are non-empty leaf node types. n.fNullable = false; return; } if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) { // Lookahead marker node. It's a leaf, so no recursion on children. // It's nullable because it does not match any literal text from the input stream. n.fNullable = true; return; } // The node is not a leaf. // Calculate nullable on its children. calcNullable(n.fLeftChild); calcNullable(n.fRightChild); // Apply functions from table 3.40 in Aho if (n.fType == RBBINode.opOr) { n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable; } else if (n.fType == RBBINode.opCat) { n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable; } else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) { n.fNullable = true; } else { n.fNullable = false; } } //----------------------------------------------------------------------------- // // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 // //----------------------------------------------------------------------------- void calcFirstPos(RBBINode n) { if (n == null) { return; } if (n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark || n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) { // These are non-empty leaf node types. n.fFirstPosSet.add(n); return; } // The node is not a leaf. // Calculate firstPos on its children. calcFirstPos(n.fLeftChild); calcFirstPos(n.fRightChild); // Apply functions from table 3.40 in Aho if (n.fType == RBBINode.opOr) { n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet); } else if (n.fType == RBBINode.opCat) { n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); if (n.fLeftChild.fNullable) { n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet); } } else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion || n.fType == RBBINode.opPlus) { n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet); } } //----------------------------------------------------------------------------- // // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 // //----------------------------------------------------------------------------- void calcLastPos(RBBINode n) { if (n == null) { return; } if (n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark || n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) { // These are non-empty leaf node types. n.fLastPosSet.add(n); return; } // The node is not a leaf. // Calculate lastPos on its children. calcLastPos(n.fLeftChild); calcLastPos(n.fRightChild); // Apply functions from table 3.40 in Aho if (n.fType == RBBINode.opOr) { n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); n.fLastPosSet.addAll(n.fRightChild.fLastPosSet); } else if (n.fType == RBBINode.opCat) { n.fLastPosSet.addAll(n.fRightChild.fLastPosSet); if (n.fRightChild.fNullable) { n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); } } else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion || n.fType == RBBINode.opPlus) { n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet); } } //----------------------------------------------------------------------------- // // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 // //----------------------------------------------------------------------------- void calcFollowPos(RBBINode n) { if (n == null || n.fType == RBBINode.leafChar || n.fType == RBBINode.endMark) { return; } calcFollowPos(n.fLeftChild); calcFollowPos(n.fRightChild); // Aho rule #1 if (n.fType == RBBINode.opCat) { for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) { i.fFollowPos.addAll(n.fRightChild.fFirstPosSet); } } // Aho rule #2 if (n.fType == RBBINode.opStar || n.fType == RBBINode.opPlus) { for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) { i.fFollowPos.addAll(n.fFirstPosSet); } } } //----------------------------------------------------------------------------- // // calcChainedFollowPos. Modify the previously calculated followPos sets // to implement rule chaining. NOT described by Aho // //----------------------------------------------------------------------------- void calcChainedFollowPos(RBBINode tree) { List endMarkerNodes = new ArrayList(); List leafNodes = new ArrayList(); // get a list of all endmarker nodes. tree.findNodes(endMarkerNodes, RBBINode.endMark); // get a list all leaf nodes tree.findNodes(leafNodes, RBBINode.leafChar); // Get all nodes that can be the start a match, which is FirstPosition() // of the portion of the tree corresponding to user-written rules. // See the tree description in bofFixup(). RBBINode userRuleRoot = tree; if (fRB.fSetBuilder.sawBOF()) { userRuleRoot = tree.fLeftChild.fRightChild; } Assert.assrt(userRuleRoot != null); Set matchStartNodes = userRuleRoot.fFirstPosSet; // Iteratate over all leaf nodes, // for (RBBINode tNode : leafNodes) { RBBINode endNode = null; // Identify leaf nodes that correspond to overall rule match positions. // These include an endMarkerNode in their followPos sets. for (RBBINode endMarkerNode : endMarkerNodes) { if (tNode.fFollowPos.contains(endMarkerNode)) { endNode = tNode; break; } } if (endNode == null) { // node wasn't an end node. Try again with the next. continue; } // We've got a node that can end a match. // Line Break Specific hack: If this node's val correspond to the $CM char class, // don't chain from it. // TODO: Add rule syntax for this behavior, get specifics out of here and // into the rule file. if (fRB.fLBCMNoChain) { int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal); if (c != -1) { // c == -1 occurs with sets containing only the {eof} marker string. int cLBProp = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); if (cLBProp == UCharacter.LineBreak.COMBINING_MARK) { continue; } } } // Now iterate over the nodes that can start a match, looking for ones // with the same char class as our ending node. for (RBBINode startNode : matchStartNodes) { if (startNode.fType != RBBINode.leafChar) { continue; } if (endNode.fVal == startNode.fVal) { // The end val (character class) of one possible match is the // same as the start of another. // Add all nodes from the followPos of the start node to the // followPos set of the end node, which will have the effect of // letting matches transition from a match state at endNode // to the second char of a match starting with startNode. endNode.fFollowPos.addAll(startNode.fFollowPos); } } } } //----------------------------------------------------------------------------- // // bofFixup. Fixup for state tables that include {bof} beginning of input testing. // Do an swizzle similar to chaining, modifying the followPos set of // the bofNode to include the followPos nodes from other {bot} nodes // scattered through the tree. // // This function has much in common with calcChainedFollowPos(). // //----------------------------------------------------------------------------- void bofFixup() { // // The parse tree looks like this ... // fTree root --. // / \ // <#end node> // / \ // rest // of tree // // We will be adding things to the followPos set of the // RBBINode bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild; Assert.assrt(bofNode.fType == RBBINode.leafChar); Assert.assrt(bofNode.fVal == 2); // Get all nodes that can be the start a match of the user-written rules // (excluding the fake bofNode) // We want the nodes that can start a match in the // part labeled "rest of tree" // Set matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet; for (RBBINode startNode : matchStartNodes) { if (startNode.fType != RBBINode.leafChar) { continue; } if (startNode.fVal == bofNode.fVal) { // We found a leaf node corresponding to a {bof} that was // explicitly written into a rule. // Add everything from the followPos set of this node to the // followPos set of the fake bofNode at the start of the tree. // bofNode.fFollowPos.addAll(startNode.fFollowPos); } } } //----------------------------------------------------------------------------- // // buildStateTable() Determine the set of runtime DFA states and the // transition tables for these states, by the algorithm // of fig. 3.44 in Aho. // // Most of the comments are quotes of Aho's psuedo-code. // //----------------------------------------------------------------------------- void buildStateTable() { // // Add a dummy state 0 - the stop state. Not from Aho. int lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1; RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol); fDStates.add(failState); // initially, the only unmarked state in Dstates is firstpos(root), // where toot is the root of the syntax tree for (r)#; RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol); initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet); fDStates.add(initialState); // while there is an unmarked state T in Dstates do begin for (;;) { RBBIStateDescriptor T = null; int tx; for (tx=1; tx U = null; for (RBBINode p : T.fPositions) { if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) { if (U == null) { U = new HashSet(); } U.addAll(p.fFollowPos); } } // if U is not empty and not in DStates then int ux = 0; boolean UinDstates = false; if (U != null) { Assert.assrt(U.size() > 0); int ix; for (ix=0; ix endMarkerNodes = new ArrayList(); RBBINode endMarker; int i; int n; fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark); for (i=0; i= 0) { if (sd.fPositions.contains(endMarker)) { // Any non-zero value for fAccepting means this is an accepting node. // The value is what will be returned to the user as the break status. // If no other value was specified, force it to -1. if (sd.fAccepting==0) { // State hasn't been marked as accepting yet. Do it now. sd.fAccepting = endMarker.fVal; if (sd.fAccepting == 0) { sd.fAccepting = -1; } } if (sd.fAccepting==-1 && endMarker.fVal != 0) { // Both lookahead and non-lookahead accepting for this state. // Favor the look-ahead. Expedient for line break. // TODO: need a more elegant resolution for conflicting rules. sd.fAccepting = endMarker.fVal; } // implicit else: // if sd.fAccepting already had a value other than 0 or -1, leave it be. // If the end marker node is from a look-ahead rule, set // the fLookAhead field or this state also. if (endMarker.fLookAheadEnd) { // TODO: don't change value if already set? // TODO: allow for more than one active look-ahead rule in engine. // Make value here an index to a side array in engine? sd.fLookAhead = sd.fAccepting; } } } } } //----------------------------------------------------------------------------- // // flagLookAheadStates Very similar to flagAcceptingStates, above. // //----------------------------------------------------------------------------- void flagLookAheadStates() { List lookAheadNodes = new ArrayList(); RBBINode lookAheadNode; int i; int n; fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead); for (i=0; i tagNodes = new ArrayList(); RBBINode tagNode; int i; int n; fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag); for (i=0; i s0 = new TreeSet(); Integer izero = Integer.valueOf(0); fRB.fStatusSets.put(s0, izero); SortedSet s1 = new TreeSet(); s1.add(izero); fRB.fStatusSets.put(s0, izero); } // For each state, check whether the state's status tag values are // already entered into the status values array, and add them if not. for (n=0; n statusVals = sd.fTagVals; Integer arrayIndexI = fRB.fStatusSets.get(statusVals); if (arrayIndexI == null) { // This is the first encounter of this set of status values. // Add them to the statusSets map, This map associates // the set of status values with an index in the runtime status // values array. arrayIndexI = Integer.valueOf(fRB.fRuleStatusVals.size()); fRB.fStatusSets.put(statusVals, arrayIndexI); // Add the new set of status values to the vector of values that // will eventually become the array used by the runtime engine. fRB.fRuleStatusVals.add(Integer.valueOf(statusVals.size())); fRB.fRuleStatusVals.addAll(statusVals); } // Save the runtime array index back into the state descriptor. sd.fTagsIdx = arrayIndexI.intValue(); } } //----------------------------------------------------------------------------- // // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos // for each node in the tree. // //----------------------------------------------------------------------------- void printPosSets(RBBINode n) { if (n==null) { return; } RBBINode.printNode(n); System.out.print(" Nullable: " + n.fNullable); System.out.print(" firstpos: "); printSet(n.fFirstPosSet); System.out.print(" lastpos: "); printSet(n.fLastPosSet); System.out.print(" followpos: "); printSet(n.fFollowPos); printPosSets(n.fLeftChild); printPosSets(n.fRightChild); } //----------------------------------------------------------------------------- // // getTableSize() Calculate the size in bytes of the runtime form of this // state transition table. // // Note: Refer to common/rbbidata.h from ICU4C for the declarations // of the structures being matched by this calculation. // //----------------------------------------------------------------------------- int getTableSize() { int size = 0; int numRows; int numCols; int rowSize; if (fRB.fTreeRoots[fRootIx] == null) { return 0; } size = /*sizeof(RBBIStateTable) - 4 */ 16; // The header, with no rows to the table. numRows = fDStates.size(); numCols = fRB.fSetBuilder.getNumCharCategories(); // Note The declaration of RBBIStateTableRow is for a table of two columns. // Therefore we subtract two from numCols when determining // how much storage to add to a row for the total columns. // rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); rowSize = 8 + 2*numCols; size += numRows * rowSize; while (size % 8 > 0) { // Size must be multiple of 8 bytes in size. size++; } return size; } //----------------------------------------------------------------------------- // // exportTable() export the state transition table in the ICU4C format. // // Most of the table is 16 bit shorts. This function exports // the whole thing as an array of shorts. // // The size of the array must be rounded up to a multiple of // 8 bytes. // // See struct RBBIStateTable in ICU4C, common/rbbidata.h // //----------------------------------------------------------------------------- short [] exportTable() { int state; int col; if (fRB.fTreeRoots[fRootIx] == null) { return new short[0]; } Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff && fDStates.size() < 0x7fff); int numStates = fDStates.size(); // Size of table size in shorts. // the "4" is the size of struct RBBIStateTableRow, the row header part only. int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); int tableSize = getTableSize() / 2; short [] table = new short[tableSize]; // // Fill in the header fields. // Annoying because they really want to be ints, not shorts. // // RBBIStateTable.fNumStates table[RBBIDataWrapper.NUMSTATES] = (short)(numStates >>> 16); table[RBBIDataWrapper.NUMSTATES+1] = (short)(numStates & 0x0000ffff); // RBBIStateTable.fRowLen table[RBBIDataWrapper.ROWLEN] = (short)(rowLen >>> 16); table[RBBIDataWrapper.ROWLEN+1] = (short)(rowLen & 0x0000ffff); // RBBIStateTable.fFlags int flags = 0; if (fRB.fLookAheadHardBreak) { flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK; } if (fRB.fSetBuilder.sawBOF()) { flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED; } table[RBBIDataWrapper.FLAGS] = (short)(flags >>> 16); table[RBBIDataWrapper.FLAGS+1] = (short)(flags & 0x0000ffff); int numCharCategories = fRB.fSetBuilder.getNumCharCategories(); for (state=0; state s) { for (RBBINode n : s) { RBBINode.printInt(n.fSerialNum, 8); } System.out.println(); } //----------------------------------------------------------------------------- // // printStates Debug Function. Dump the fully constructed state transition table. // //----------------------------------------------------------------------------- void printStates() { int c; // input "character" int n; // state number System.out.print("state | i n p u t s y m b o l s \n"); System.out.print(" | Acc LA Tag"); for (c=0; c tbl = fRB.fRuleStatusVals; System.out.print("index | tags \n"); System.out.print("-------------------\n"); while (nextRecord < tbl.size()) { thisRecord = nextRecord; nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1; RBBINode.printInt(thisRecord, 7); for (i=thisRecord+1; i