2 *******************************************************************************
3 * Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved.
4 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.text.ParsePosition;
10 import java.util.HashMap;
12 import com.ibm.icu.impl.Assert;
13 import com.ibm.icu.impl.Utility;
14 import com.ibm.icu.lang.UCharacter;
17 * This class is part of the Rule Based Break Iterator rule compiler.
18 * It scans the rules and builds the parse tree.
19 * There is no public API here.
21 class RBBIRuleScanner {
23 private final static int kStackSize = 100; // The size of the state stack for
24 // rules parsing. Corresponds roughly
25 // to the depth of parentheses nesting
26 // that is allowed in the rules.
28 static class RBBIRuleChar {
35 RBBIRuleBuilder fRB; // The rule builder that we are part of.
37 int fScanIndex; // Index of current character being processed
38 // in the rule input string.
39 int fNextIndex; // Index of the next character, which
40 // is the first character not yet scanned.
41 boolean fQuoteMode; // Scan is in a 'quoted region'
42 int fLineNum; // Line number in input file.
43 int fCharNum; // Char position within the line.
44 int fLastChar; // Previous char, needed to count CR-LF
45 // as a single line, not two.
47 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
49 String fVarName; // $variableName, valid when we've just
53 short fStack[] = new short[kStackSize]; // State stack, holds state pushes
54 int fStackPtr; // and pops as specified in the state
57 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
58 // during the parse of a rule
62 boolean fReverseRule; // True if the rule currently being scanned
63 // is a reverse direction rule (if it
66 boolean fLookAheadRule; // True if the rule includes a '/'
67 // somewhere within it.
69 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
72 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
73 // the sets created while parsing rules.
74 // The key is the string used for creating
77 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
78 // the scanning of RBBI rules. The
79 // indicies for these are assigned by the
80 // perl script that builds the state tables.
83 int fRuleNum; // Counts each rule as it is scanned.
85 int fOptionStart; // Input index of start of a !!option
86 // keyword, while being scanned.
90 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
91 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
92 static private String gRuleSet_digit_char_pattern = "[0-9]";
93 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
94 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
95 static private String kAny = "any";
100 //----------------------------------------------------------------------------------------
104 //----------------------------------------------------------------------------------------
105 RBBIRuleScanner(RBBIRuleBuilder rb) {
110 // Set up the constant Unicode Sets.
111 // Note: These could be made static and shared among
112 // all instances of RBBIRuleScanners.
113 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
114 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
115 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
116 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
117 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
119 fSymbolTable = new RBBISymbolTable(this, rb.fRules);
122 //----------------------------------------------------------------------------------------
124 // doParseAction Do some action during rule parsing.
125 // Called by the parse state machine.
126 // Actions build the parse tree and Unicode Sets,
127 // and maintain the parse stack for nested expressions.
129 //----------------------------------------------------------------------------------------
130 boolean doParseActions(int action) {
133 boolean returnVal = true;
137 case RBBIRuleParseTable.doExprStart:
138 pushNewNode(RBBINode.opStart);
142 case RBBIRuleParseTable.doExprOrOperator: {
143 fixOpStack(RBBINode.precOpCat);
144 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
145 RBBINode orNode = pushNewNode(RBBINode.opOr);
146 orNode.fLeftChild = operandNode;
147 operandNode.fParent = orNode;
151 case RBBIRuleParseTable.doExprCatOperator:
152 // concatenation operator.
153 // For the implicit concatenation of adjacent terms in an expression
155 // not separated by any other operator. Action is invoked between the
156 // actions for the two terms.
158 fixOpStack(RBBINode.precOpCat);
159 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
160 RBBINode catNode = pushNewNode(RBBINode.opCat);
161 catNode.fLeftChild = operandNode;
162 operandNode.fParent = catNode;
166 case RBBIRuleParseTable.doLParen:
168 // The openParen node is a dummy operation type with a low
170 // which has the affect of ensuring that any real binary op that
171 // follows within the parens binds more tightly to the operands than
172 // stuff outside of the parens.
173 pushNewNode(RBBINode.opLParen);
176 case RBBIRuleParseTable.doExprRParen:
177 fixOpStack(RBBINode.precLParen);
180 case RBBIRuleParseTable.doNOP:
183 case RBBIRuleParseTable.doStartAssign:
184 // We've just scanned "$variable = "
185 // The top of the node stack has the $variable ref node.
187 // Save the start position of the RHS text in the StartExpression
189 // that precedes the $variableReference node on the stack.
190 // This will eventually be used when saving the full $variable
193 n = fNodeStack[fNodeStackPtr - 1];
194 n.fFirstPos = fNextIndex; // move past the '='
196 // Push a new start-of-expression node; needed to keep parse of the
197 // RHS expression happy.
198 pushNewNode(RBBINode.opStart);
201 case RBBIRuleParseTable.doEndAssign: {
202 // We have reached the end of an assignement statement.
203 // Current scan char is the ';' that terminates the assignment.
205 // Terminate expression, leaves expression parse tree rooted in TOS
207 fixOpStack(RBBINode.precStart);
209 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
210 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
211 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
213 // Save original text of right side of assignment, excluding the
215 // in the root of the node for the right-hand-side expression.
216 RHSExprNode.fFirstPos = startExprNode.fFirstPos;
217 RHSExprNode.fLastPos = fScanIndex;
218 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
219 // RHSExprNode.fLastPos, RHSExprNode.fText);
220 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
221 RHSExprNode.fLastPos);
223 // Expression parse tree becomes l. child of the $variable reference
225 varRefNode.fLeftChild = RHSExprNode;
226 RHSExprNode.fParent = varRefNode;
228 // Make a symbol table entry for the $variableRef node.
229 fSymbolTable.addEntry(varRefNode.fText, varRefNode);
231 // Clean up the stack.
236 case RBBIRuleParseTable.doEndOfRule: {
237 fixOpStack(RBBINode.precStart); // Terminate expression, leaves
240 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
241 printNodeStack("end of rule");
243 Assert.assrt(fNodeStackPtr == 1);
245 // If this rule includes a look-ahead '/', add a endMark node to the
247 if (fLookAheadRule) {
248 RBBINode thisRule = fNodeStack[fNodeStackPtr];
249 RBBINode endNode = pushNewNode(RBBINode.endMark);
250 RBBINode catNode = pushNewNode(RBBINode.opCat);
252 catNode.fLeftChild = thisRule;
253 catNode.fRightChild = endNode;
254 fNodeStack[fNodeStackPtr] = catNode;
255 endNode.fVal = fRuleNum;
256 endNode.fLookAheadEnd = true;
259 // All rule expressions are ORed together.
260 // The ';' that terminates an expression really just functions as a
262 // a low operator prededence.
264 // Each of the four sets of rules are collected separately.
265 // (forward, reverse, safe_forward, safe_reverse)
266 // OR this rule into the appropriate group of them.
269 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
271 if (fRB.fTreeRoots[destRules] != null) {
272 // This is not the first rule encounted.
273 // OR previous stuff (from *destRules)
274 // with the current rule expression (on the Node Stack)
275 // with the resulting OR expression going to *destRules
277 RBBINode thisRule = fNodeStack[fNodeStackPtr];
278 RBBINode prevRules = fRB.fTreeRoots[destRules];
279 RBBINode orNode = pushNewNode(RBBINode.opOr);
280 orNode.fLeftChild = prevRules;
281 prevRules.fParent = orNode;
282 orNode.fRightChild = thisRule;
283 thisRule.fParent = orNode;
284 fRB.fTreeRoots[destRules] = orNode;
286 // This is the first rule encountered (for this direction).
287 // Just move its parse tree from the stack to *destRules.
288 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
290 fReverseRule = false; // in preparation for the next rule.
291 fLookAheadRule = false;
296 case RBBIRuleParseTable.doRuleError:
297 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
301 case RBBIRuleParseTable.doVariableNameExpectedErr:
302 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
306 // Unary operands + ? *
307 // These all appear after the operand to which they apply.
308 // When we hit one, the operand (may be a whole sub expression)
309 // will be on the top of the stack.
310 // Unary Operator becomes TOS, with the old TOS as its one child.
311 case RBBIRuleParseTable.doUnaryOpPlus: {
312 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
313 RBBINode plusNode = pushNewNode(RBBINode.opPlus);
314 plusNode.fLeftChild = operandNode;
315 operandNode.fParent = plusNode;
319 case RBBIRuleParseTable.doUnaryOpQuestion: {
320 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
321 RBBINode qNode = pushNewNode(RBBINode.opQuestion);
322 qNode.fLeftChild = operandNode;
323 operandNode.fParent = qNode;
327 case RBBIRuleParseTable.doUnaryOpStar: {
328 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
329 RBBINode starNode = pushNewNode(RBBINode.opStar);
330 starNode.fLeftChild = operandNode;
331 operandNode.fParent = starNode;
335 case RBBIRuleParseTable.doRuleChar:
336 // A "Rule Character" is any single character that is a literal part
337 // of the regular expression. Like a, b and c in the expression "(abc*)
339 // These are pretty uncommon in break rules; the terms are more commonly
340 // sets. To keep things uniform, treat these characters like as
341 // sets that just happen to contain only one character.
343 n = pushNewNode(RBBINode.setRef);
344 String s = String.valueOf((char)fC.fChar);
345 findSetFor(s, n, null);
346 n.fFirstPos = fScanIndex;
347 n.fLastPos = fNextIndex;
348 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
352 case RBBIRuleParseTable.doDotAny:
353 // scanned a ".", meaning match any single character.
355 n = pushNewNode(RBBINode.setRef);
356 findSetFor(kAny, n, null);
357 n.fFirstPos = fScanIndex;
358 n.fLastPos = fNextIndex;
359 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
363 case RBBIRuleParseTable.doSlash:
364 // Scanned a '/', which identifies a look-ahead break position in a
366 n = pushNewNode(RBBINode.lookAhead);
368 n.fFirstPos = fScanIndex;
369 n.fLastPos = fNextIndex;
370 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
371 fLookAheadRule = true;
374 case RBBIRuleParseTable.doStartTagValue:
375 // Scanned a '{', the opening delimiter for a tag value within a
377 n = pushNewNode(RBBINode.tag);
379 n.fFirstPos = fScanIndex;
380 n.fLastPos = fNextIndex;
383 case RBBIRuleParseTable.doTagDigit:
384 // Just scanned a decimal digit that's part of a tag value
386 n = fNodeStack[fNodeStackPtr];
387 int v = UCharacter.digit((char) fC.fChar, 10);
388 n.fVal = n.fVal * 10 + v;
392 case RBBIRuleParseTable.doTagValue:
393 n = fNodeStack[fNodeStackPtr];
394 n.fLastPos = fNextIndex;
395 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
398 case RBBIRuleParseTable.doTagExpectedError:
399 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
403 case RBBIRuleParseTable.doOptionStart:
404 // Scanning a !!option. At the start of string.
405 fOptionStart = fScanIndex;
408 case RBBIRuleParseTable.doOptionEnd: {
409 String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
410 if (opt.equals("chain")) {
411 fRB.fChainRules = true;
412 } else if (opt.equals("LBCMNoChain")) {
413 fRB.fLBCMNoChain = true;
414 } else if (opt.equals("forward")) {
415 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
416 } else if (opt.equals("reverse")) {
417 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
418 } else if (opt.equals("safe_forward")) {
419 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
420 } else if (opt.equals("safe_reverse")) {
421 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
422 } else if (opt.equals("lookAheadHardBreak")) {
423 fRB.fLookAheadHardBreak = true;
425 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
430 case RBBIRuleParseTable.doReverseDir:
434 case RBBIRuleParseTable.doStartVariableName:
435 n = pushNewNode(RBBINode.varRef);
436 n.fFirstPos = fScanIndex;
439 case RBBIRuleParseTable.doEndVariableName:
440 n = fNodeStack[fNodeStackPtr];
441 if (n == null || n.fType != RBBINode.varRef) {
442 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
445 n.fLastPos = fScanIndex;
446 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
447 // Look the newly scanned name up in the symbol table
448 // If there's an entry, set the l. child of the var ref to the
449 // replacement expression.
450 // (We also pass through here when scanning assignments, but no harm
452 // than a slight wasted effort that seems hard to avoid. Lookup will
454 n.fLeftChild = fSymbolTable.lookupNode(n.fText);
457 case RBBIRuleParseTable.doCheckVarDef:
458 n = fNodeStack[fNodeStackPtr];
459 if (n.fLeftChild == null) {
460 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
465 case RBBIRuleParseTable.doExprFinished:
468 case RBBIRuleParseTable.doRuleErrorAssignExpr:
469 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
473 case RBBIRuleParseTable.doExit:
477 case RBBIRuleParseTable.doScanUnicodeSet:
482 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
489 //----------------------------------------------------------------------------------------
491 // Error Throw and IllegalArgumentException in response to a rule parse
494 //----------------------------------------------------------------------------------------
496 String s = "Error " + e + " at line " + fLineNum + " column "
498 IllegalArgumentException ex = new IllegalArgumentException(s);
503 //----------------------------------------------------------------------------------------
505 // fixOpStack The parse stack holds partially assembled chunks of the parse
507 // An entry on the stack may be as small as a single setRef node,
508 // or as large as the parse tree
509 // for an entire expression (this will be the one item left on the stack
510 // when the parsing of an RBBI rule completes.
512 // This function is called when a binary operator is encountered.
513 // It looks back up the stack for operators that are not yet associated
514 // with a right operand, and if the precedence of the stacked operator >=
515 // the precedence of the current operator, binds the operand left,
516 // to the previously encountered operator.
518 //----------------------------------------------------------------------------------------
519 void fixOpStack(int p) {
521 // printNodeStack("entering fixOpStack()");
523 n = fNodeStack[fNodeStackPtr - 1]; // an operator node
524 if (n.fPrecedence == 0) {
525 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
526 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
530 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
531 // The most recent operand goes with the current operator,
532 // not with the previously stacked one.
535 // Stack operator is a binary op ( '|' or concatenation)
536 // TOS operand becomes right child of this operator.
537 // Resulting subexpression becomes the TOS operand.
538 n.fRightChild = fNodeStack[fNodeStackPtr];
539 fNodeStack[fNodeStackPtr].fParent = n;
541 // printNodeStack("looping in fixOpStack() ");
544 if (p <= RBBINode.precLParen) {
545 // Scan is at a right paren or end of expression.
546 // The scanned item must match the stack, or else there was an
548 // Discard the left paren (or start expr) node from the stack,
549 // leaving the completed (sub)expression as TOS.
550 if (n.fPrecedence != p) {
551 // Right paren encountered matched start of expression node, or
552 // end of expression matched with a left paren node.
553 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
555 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
557 // Delete the now-discarded LParen or Start node.
560 // printNodeStack("leaving fixOpStack()");
563 //----------------------------------------------------------------------------
565 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
566 // been encountered. The val Node will be of nodetype uset
567 // and contain pointers to the actual UnicodeSets.
568 // The Key is the source string for initializing the set.
570 // The hash table is used to avoid creating duplicate
571 // unnamed (not $var references) UnicodeSets.
573 //----------------------------------------------------------------------------
574 static class RBBISetTableEl {
581 //----------------------------------------------------------------------------------------
583 // findSetFor given a String,
584 // - find the corresponding Unicode Set (uset node)
585 // (create one if necessary)
586 // - Set fLeftChild of the caller's node (should be a setRef node)
588 // Maintain a hash table of uset nodes, so the same one is always used
589 // for the same string.
590 // If a "to adopt" set is provided and we haven't seen this key before,
591 // add the provided set to the hash table.
592 // If the string is one (32 bit) char in length, the set contains
593 // just one element which is the char in question.
594 // If the string is "any", return a set containing all chars.
596 //----------------------------------------------------------------------------------------
597 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
601 // First check whether we've already cached a set for this string.
602 // If so, just use the cached set in the new node.
603 // delete any set provided by the caller, since we own it.
604 el = fSetTable.get(s);
606 node.fLeftChild = el.val;
607 Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
611 // Haven't seen this set before.
612 // If the caller didn't provide us with a prebuilt set,
613 // create a new UnicodeSet now.
614 if (setToAdopt == null) {
615 if (s.equals(kAny)) {
616 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
619 c = UTF16.charAt(s, 0);
620 setToAdopt = new UnicodeSet(c, c);
625 // Make a new uset node to refer to this UnicodeSet
626 // This new uset node becomes the child of the caller's setReference
629 RBBINode usetNode = new RBBINode(RBBINode.uset);
630 usetNode.fInputSet = setToAdopt;
631 usetNode.fParent = node;
632 node.fLeftChild = usetNode;
636 // Add the new uset node to the list of all uset nodes.
638 fRB.fUSetNodes.add(usetNode);
641 // Add the new set to the set hash table.
643 el = new RBBISetTableEl();
646 fSetTable.put(el.key, el);
652 // Assorted Unicode character constants.
653 // Numeric because there is no portable way to enter them as literals.
656 static final int chNEL = 0x85; // NEL newline variant
658 static final int chLS = 0x2028; // Unicode Line Separator
660 //----------------------------------------------------------------------------------------
662 // stripRules Return a rules string without unnecessary
665 //----------------------------------------------------------------------------------------
666 static String stripRules(String rules) {
667 StringBuilder strippedRules = new StringBuilder();
668 int rulesLength = rules.length();
669 for (int idx = 0; idx < rulesLength;) {
670 char ch = rules.charAt(idx++);
672 while (idx < rulesLength
673 && ch != '\r' && ch != '\n' && ch != chNEL) {
674 ch = rules.charAt(idx++);
677 if (!UCharacter.isISOControl(ch)) {
678 strippedRules.append(ch);
681 return strippedRules.toString();
684 //----------------------------------------------------------------------------------------
686 // nextCharLL Low Level Next Char from rule input source.
687 // Get a char from the input character iterator,
688 // keep track of input position for error reporting.
690 //----------------------------------------------------------------------------------------
694 if (fNextIndex >= fRB.fRules.length()) {
697 ch = UTF16.charAt(fRB.fRules, fNextIndex);
698 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
703 ch == '\n' && fLastChar != '\r') {
704 // Character is starting a new line. Bump up the line number, and
705 // reset the column to 0.
709 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
713 // Character is not starting a new line. Except in the case of a
714 // LF following a CR, increment the column position.
723 //---------------------------------------------------------------------------------
725 // nextChar for rules scanning. At this level, we handle stripping
726 // out comments and processing backslash character escapes.
727 // The rest of the rules grammar is handled at the next level up.
729 //---------------------------------------------------------------------------------
730 void nextChar(RBBIRuleChar c) {
732 // Unicode Character constants needed for the processing done by nextChar(),
733 // in hex because literals wont work on EBCDIC machines.
735 fScanIndex = fNextIndex;
736 c.fChar = nextCharLL();
740 // check for '' sequence.
741 // These are recognized in all contexts, whether in quoted text or not.
743 if (c.fChar == '\'') {
744 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
745 c.fChar = nextCharLL(); // get nextChar officially so character counts
746 c.fEscaped = true; // stay correct.
748 // Single quote, by itself.
749 // Toggle quoting mode.
750 // Return either '(' or ')', because quotes cause a grouping of the quoted text.
751 fQuoteMode = !fQuoteMode;
752 if (fQuoteMode == true) {
757 c.fEscaped = false; // The paren that we return is not escaped.
765 // We are not in a 'quoted region' of the source.
767 if (c.fChar == '#') {
768 // Start of a comment. Consume the rest of it.
769 // The new-line char that terminates the comment is always returned.
770 // It will be treated as white-space, and serves to break up anything
771 // that might otherwise incorrectly clump together with a comment in
772 // the middle (a variable name, for example.)
774 c.fChar = nextCharLL();
775 if (c.fChar == -1 || // EOF
790 // check for backslash escaped characters.
791 // Use String.unescapeAt() to handle them.
793 if (c.fChar == '\\') {
795 int[] unescapeIndex = new int[1];
796 unescapeIndex[0] = fNextIndex;
797 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
798 if (unescapeIndex[0] == fNextIndex) {
799 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
802 fCharNum += unescapeIndex[0] - fNextIndex;
803 fNextIndex = unescapeIndex[0];
806 // putc(c.fChar, stdout);
809 //---------------------------------------------------------------------------------
811 // Parse RBBI rules. The state machine for rules parsing is here.
812 // The state tables are hand-written in the file rbbirpt.txt,
813 // and converted to the form used here by a perl
816 //---------------------------------------------------------------------------------
819 RBBIRuleParseTable.RBBIRuleTableElement tableEl;
824 // Main loop for the rule parsing state machine.
825 // Runs once per state transition.
826 // Each time through optionally performs, depending on the state table,
827 // - an advance to the the next input char
828 // - an action to be performed.
829 // - pushing or popping a state to/from the local state return stack.
832 // Quit if state == 0. This is the normal way to exit the state machine.
838 // Find the state table element that matches the input char from the rule, or the
839 // class of the input character. Start with the first table row for this
840 // state, then linearly scan forward until we find a row that matches the
841 // character. The last row for each state always matches all characters, so
842 // the search will stop there, if not before.
844 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
845 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
846 System.out.println("char, line, col = (\'" + (char) fC.fChar
847 + "\', " + fLineNum + ", " + fCharNum + " state = "
848 + tableEl.fStateName);
851 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
852 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
853 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
854 System.out.print(".");
856 if (tableEl.fCharClass < 127 && fC.fEscaped == false
857 && tableEl.fCharClass == fC.fChar) {
858 // Table row specified an individual character, not a set, and
859 // the input character is not escaped, and
860 // the input character matched it.
863 if (tableEl.fCharClass == 255) {
864 // Table row specified default, match anything character class.
867 if (tableEl.fCharClass == 254 && fC.fEscaped) {
868 // Table row specified "escaped" and the char was escaped.
871 if (tableEl.fCharClass == 253 && fC.fEscaped
872 && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
873 // Table row specified "escaped P" and the char is either 'p' or 'P'.
876 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
877 // Table row specified eof and we hit eof on the input.
881 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
882 fC.fEscaped == false && // char is not escaped &&
883 fC.fChar != -1) { // char is not EOF
884 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
885 if (uniset.contains(fC.fChar)) {
886 // Table row specified a character class, or set of characters,
887 // and the current char matches it.
893 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
894 System.out.println("");
897 // We've found the row of the state table that matches the current input
898 // character from the rules string.
899 // Perform any action specified by this row in the state table.
900 if (doParseActions(tableEl.fAction) == false) {
901 // Break out of the state machine loop if the
902 // the action signalled some kind of error, or
903 // the action was to exit, occurs on normal end-of-rules-input.
907 if (tableEl.fPushState != 0) {
909 if (fStackPtr >= kStackSize) {
910 System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
911 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
913 fStack[fStackPtr] = tableEl.fPushState;
916 if (tableEl.fNextChar) {
920 // Get the next state from the table entry, or from the
921 // state stack if the next state was specified as "pop".
922 if (tableEl.fNextState != 255) {
923 state = tableEl.fNextState;
925 state = fStack[fStackPtr];
928 System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
929 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
936 // If there were NO user specified reverse rules, set up the equivalent of ".*;"
938 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
939 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
940 RBBINode operand = pushNewNode(RBBINode.setRef);
941 findSetFor(kAny, operand, null);
942 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
943 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
948 // Parsing of the input RBBI rules is complete.
949 // We now have a parse tree for the rule expressions
950 // and a list of all UnicodeSets that are referenced.
952 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
953 fSymbolTable.rbbiSymtablePrint();
955 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
956 System.out.println("Completed Forward Rules Parse Tree...");
957 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
958 System.out.println("\nCompleted Reverse Rules Parse Tree...");
959 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
960 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
961 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
962 System.out.println(" -- null -- ");
964 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
966 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
967 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
968 System.out.println(" -- null -- ");
970 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
975 //---------------------------------------------------------------------------------
977 // printNodeStack for debugging...
979 //---------------------------------------------------------------------------------
981 void printNodeStack(String title) {
983 System.out.println(title + ". Dumping node stack...\n");
984 for (i = fNodeStackPtr; i > 0; i--) {
985 fNodeStack[i].printTree(true);
990 //---------------------------------------------------------------------------------
992 // pushNewNode create a new RBBINode of the specified type and push it
993 // onto the stack of nodes.
995 //---------------------------------------------------------------------------------
996 RBBINode pushNewNode(int nodeType) {
998 if (fNodeStackPtr >= kStackSize) {
999 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1000 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1002 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1003 return fNodeStack[fNodeStackPtr];
1006 //---------------------------------------------------------------------------------
1008 // scanSet Construct a UnicodeSet from the text at the current scan
1009 // position. Advance the scan position to the first character
1012 // A new RBBI setref node referring to the set is pushed onto the node
1015 // The scan position is normally under the control of the state machine
1016 // that controls rule parsing. UnicodeSets, however, are parsed by
1017 // the UnicodeSet constructor, not by the RBBI rule parser.
1019 //---------------------------------------------------------------------------------
1021 UnicodeSet uset = null;
1023 ParsePosition pos = new ParsePosition(fScanIndex);
1026 startPos = fScanIndex;
1028 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1029 } catch (Exception e) { // TODO: catch fewer exception types.
1030 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1031 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1034 // Verify that the set contains at least one code point.
1036 if (uset.isEmpty()) {
1037 // This set is empty.
1038 // Make it an error, because it almost certainly is not what the user wanted.
1039 // Also, avoids having to think about corner cases in the tree manipulation code
1040 // that occurs later on.
1041 // TODO: this shouldn't be an error; it does happen.
1042 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1045 // Advance the RBBI parse postion over the UnicodeSet pattern.
1046 // Don't just set fScanIndex because the line/char positions maintained
1047 // for error reporting would be thrown off.
1050 if (fNextIndex >= i) {
1058 n = pushNewNode(RBBINode.setRef);
1059 n.fFirstPos = startPos;
1060 n.fLastPos = fNextIndex;
1061 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1062 // findSetFor() serves several purposes here:
1063 // - Adopts storage for the UnicodeSet, will be responsible for deleting.
1064 // - Mantains collection of all sets in use, needed later for establishing
1065 // character categories for run time engine.
1066 // - Eliminates mulitiple instances of the same set.
1067 // - Creates a new uset node if necessary (if this isn't a duplicate.)
1068 findSetFor(n.fText, n, uset);