2 *******************************************************************************
\r
3 * Copyright (C) 2003-2010, International Business Machines Corporation and others. All Rights Reserved.
\r
4 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.text.ParsePosition;
\r
10 import java.util.HashMap;
\r
12 import com.ibm.icu.impl.Assert;
\r
13 import com.ibm.icu.impl.Utility;
\r
14 import com.ibm.icu.lang.UCharacter;
\r
17 * This class is part of the Rule Based Break Iterator rule compiler.
\r
18 * It scans the rules and builds the parse tree.
\r
19 * There is no public API here.
\r
21 class RBBIRuleScanner {
\r
23 private final int kStackSize = 100; // The size of the state stack for
\r
24 // rules parsing. Corresponds roughly
\r
25 // to the depth of parentheses nesting
\r
26 // that is allowed in the rules.
\r
28 static class RBBIRuleChar {
\r
35 RBBIRuleBuilder fRB; // The rule builder that we are part of.
\r
37 int fScanIndex; // Index of current character being processed
\r
38 // in the rule input string.
\r
39 int fNextIndex; // Index of the next character, which
\r
40 // is the first character not yet scanned.
\r
41 boolean fQuoteMode; // Scan is in a 'quoted region'
\r
42 int fLineNum; // Line number in input file.
\r
43 int fCharNum; // Char position within the line.
\r
44 int fLastChar; // Previous char, needed to count CR-LF
\r
45 // as a single line, not two.
\r
47 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
\r
49 String fVarName; // $variableName, valid when we've just
\r
53 short fStack[] = new short[kStackSize]; // State stack, holds state pushes
\r
54 int fStackPtr; // and pops as specified in the state
\r
55 // transition rules.
\r
57 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
\r
58 // during the parse of a rule
\r
62 boolean fReverseRule; // True if the rule currently being scanned
\r
63 // is a reverse direction rule (if it
\r
64 // starts with a '!')
\r
66 boolean fLookAheadRule; // True if the rule includes a '/'
\r
67 // somewhere within it.
\r
69 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
\r
70 // $variable symbols.
\r
72 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
\r
73 // the sets created while parsing rules.
\r
74 // The key is the string used for creating
\r
77 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
\r
78 // the scanning of RBBI rules. The
\r
79 // indicies for these are assigned by the
\r
80 // perl script that builds the state tables.
\r
83 int fRuleNum; // Counts each rule as it is scanned.
\r
85 int fOptionStart; // Input index of start of a !!option
\r
86 // keyword, while being scanned.
\r
90 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
\r
91 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
\r
92 static private String gRuleSet_digit_char_pattern = "[0-9]";
\r
93 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
\r
94 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
\r
95 static private String kAny = "any";
\r
100 //----------------------------------------------------------------------------------------
\r
104 //----------------------------------------------------------------------------------------
\r
105 RBBIRuleScanner(RBBIRuleBuilder rb) {
\r
110 // Set up the constant Unicode Sets.
\r
111 // Note: These could be made static and shared among
\r
112 // all instances of RBBIRuleScanners.
\r
113 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
\r
114 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
\r
115 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
\r
116 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
\r
117 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
\r
119 fSymbolTable = new RBBISymbolTable(this, rb.fRules);
\r
122 //----------------------------------------------------------------------------------------
\r
124 // doParseAction Do some action during rule parsing.
\r
125 // Called by the parse state machine.
\r
126 // Actions build the parse tree and Unicode Sets,
\r
127 // and maintain the parse stack for nested expressions.
\r
129 //----------------------------------------------------------------------------------------
\r
130 boolean doParseActions(int action) {
\r
133 boolean returnVal = true;
\r
137 case RBBIRuleParseTable.doExprStart:
\r
138 pushNewNode(RBBINode.opStart);
\r
142 case RBBIRuleParseTable.doExprOrOperator: {
\r
143 fixOpStack(RBBINode.precOpCat);
\r
144 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
145 RBBINode orNode = pushNewNode(RBBINode.opOr);
\r
146 orNode.fLeftChild = operandNode;
\r
147 operandNode.fParent = orNode;
\r
151 case RBBIRuleParseTable.doExprCatOperator:
\r
152 // concatenation operator.
\r
153 // For the implicit concatenation of adjacent terms in an expression
\r
155 // not separated by any other operator. Action is invoked between the
\r
156 // actions for the two terms.
\r
158 fixOpStack(RBBINode.precOpCat);
\r
159 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
160 RBBINode catNode = pushNewNode(RBBINode.opCat);
\r
161 catNode.fLeftChild = operandNode;
\r
162 operandNode.fParent = catNode;
\r
166 case RBBIRuleParseTable.doLParen:
\r
168 // The openParen node is a dummy operation type with a low
\r
170 // which has the affect of ensuring that any real binary op that
\r
171 // follows within the parens binds more tightly to the operands than
\r
172 // stuff outside of the parens.
\r
173 pushNewNode(RBBINode.opLParen);
\r
176 case RBBIRuleParseTable.doExprRParen:
\r
177 fixOpStack(RBBINode.precLParen);
\r
180 case RBBIRuleParseTable.doNOP:
\r
183 case RBBIRuleParseTable.doStartAssign:
\r
184 // We've just scanned "$variable = "
\r
185 // The top of the node stack has the $variable ref node.
\r
187 // Save the start position of the RHS text in the StartExpression
\r
189 // that precedes the $variableReference node on the stack.
\r
190 // This will eventually be used when saving the full $variable
\r
192 // text as a string.
\r
193 n = fNodeStack[fNodeStackPtr - 1];
\r
194 n.fFirstPos = fNextIndex; // move past the '='
\r
196 // Push a new start-of-expression node; needed to keep parse of the
\r
197 // RHS expression happy.
\r
198 pushNewNode(RBBINode.opStart);
\r
201 case RBBIRuleParseTable.doEndAssign: {
\r
202 // We have reached the end of an assignement statement.
\r
203 // Current scan char is the ';' that terminates the assignment.
\r
205 // Terminate expression, leaves expression parse tree rooted in TOS
\r
207 fixOpStack(RBBINode.precStart);
\r
209 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
\r
210 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
\r
211 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
\r
213 // Save original text of right side of assignment, excluding the
\r
215 // in the root of the node for the right-hand-side expression.
\r
216 RHSExprNode.fFirstPos = startExprNode.fFirstPos;
\r
217 RHSExprNode.fLastPos = fScanIndex;
\r
218 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
\r
219 // RHSExprNode.fLastPos, RHSExprNode.fText);
\r
220 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
\r
221 RHSExprNode.fLastPos);
\r
223 // Expression parse tree becomes l. child of the $variable reference
\r
225 varRefNode.fLeftChild = RHSExprNode;
\r
226 RHSExprNode.fParent = varRefNode;
\r
228 // Make a symbol table entry for the $variableRef node.
\r
229 fSymbolTable.addEntry(varRefNode.fText, varRefNode);
\r
231 // Clean up the stack.
\r
232 fNodeStackPtr -= 3;
\r
236 case RBBIRuleParseTable.doEndOfRule: {
\r
237 fixOpStack(RBBINode.precStart); // Terminate expression, leaves
\r
240 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
\r
241 printNodeStack("end of rule");
\r
243 Assert.assrt(fNodeStackPtr == 1);
\r
245 // If this rule includes a look-ahead '/', add a endMark node to the
\r
246 // expression tree.
\r
247 if (fLookAheadRule) {
\r
248 RBBINode thisRule = fNodeStack[fNodeStackPtr];
\r
249 RBBINode endNode = pushNewNode(RBBINode.endMark);
\r
250 RBBINode catNode = pushNewNode(RBBINode.opCat);
\r
251 fNodeStackPtr -= 2;
\r
252 catNode.fLeftChild = thisRule;
\r
253 catNode.fRightChild = endNode;
\r
254 fNodeStack[fNodeStackPtr] = catNode;
\r
255 endNode.fVal = fRuleNum;
\r
256 endNode.fLookAheadEnd = true;
\r
259 // All rule expressions are ORed together.
\r
260 // The ';' that terminates an expression really just functions as a
\r
262 // a low operator prededence.
\r
264 // Each of the four sets of rules are collected separately.
\r
265 // (forward, reverse, safe_forward, safe_reverse)
\r
266 // OR this rule into the appropriate group of them.
\r
269 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
\r
271 if (fRB.fTreeRoots[destRules] != null) {
\r
272 // This is not the first rule encounted.
\r
273 // OR previous stuff (from *destRules)
\r
274 // with the current rule expression (on the Node Stack)
\r
275 // with the resulting OR expression going to *destRules
\r
277 RBBINode thisRule = fNodeStack[fNodeStackPtr];
\r
278 RBBINode prevRules = fRB.fTreeRoots[destRules];
\r
279 RBBINode orNode = pushNewNode(RBBINode.opOr);
\r
280 orNode.fLeftChild = prevRules;
\r
281 prevRules.fParent = orNode;
\r
282 orNode.fRightChild = thisRule;
\r
283 thisRule.fParent = orNode;
\r
284 fRB.fTreeRoots[destRules] = orNode;
\r
286 // This is the first rule encountered (for this direction).
\r
287 // Just move its parse tree from the stack to *destRules.
\r
288 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
\r
290 fReverseRule = false; // in preparation for the next rule.
\r
291 fLookAheadRule = false;
\r
296 case RBBIRuleParseTable.doRuleError:
\r
297 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
\r
301 case RBBIRuleParseTable.doVariableNameExpectedErr:
\r
302 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
\r
306 // Unary operands + ? *
\r
307 // These all appear after the operand to which they apply.
\r
308 // When we hit one, the operand (may be a whole sub expression)
\r
309 // will be on the top of the stack.
\r
310 // Unary Operator becomes TOS, with the old TOS as its one child.
\r
311 case RBBIRuleParseTable.doUnaryOpPlus: {
\r
312 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
313 RBBINode plusNode = pushNewNode(RBBINode.opPlus);
\r
314 plusNode.fLeftChild = operandNode;
\r
315 operandNode.fParent = plusNode;
\r
319 case RBBIRuleParseTable.doUnaryOpQuestion: {
\r
320 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
321 RBBINode qNode = pushNewNode(RBBINode.opQuestion);
\r
322 qNode.fLeftChild = operandNode;
\r
323 operandNode.fParent = qNode;
\r
327 case RBBIRuleParseTable.doUnaryOpStar: {
\r
328 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
329 RBBINode starNode = pushNewNode(RBBINode.opStar);
\r
330 starNode.fLeftChild = operandNode;
\r
331 operandNode.fParent = starNode;
\r
335 case RBBIRuleParseTable.doRuleChar:
\r
336 // A "Rule Character" is any single character that is a literal part
\r
337 // of the regular expression. Like a, b and c in the expression "(abc*)
\r
339 // These are pretty uncommon in break rules; the terms are more commonly
\r
340 // sets. To keep things uniform, treat these characters like as
\r
341 // sets that just happen to contain only one character.
\r
343 n = pushNewNode(RBBINode.setRef);
\r
344 String s = String.valueOf((char)fC.fChar);
\r
345 findSetFor(s, n, null);
\r
346 n.fFirstPos = fScanIndex;
\r
347 n.fLastPos = fNextIndex;
\r
348 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
352 case RBBIRuleParseTable.doDotAny:
\r
353 // scanned a ".", meaning match any single character.
\r
355 n = pushNewNode(RBBINode.setRef);
\r
356 findSetFor(kAny, n, null);
\r
357 n.fFirstPos = fScanIndex;
\r
358 n.fLastPos = fNextIndex;
\r
359 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
363 case RBBIRuleParseTable.doSlash:
\r
364 // Scanned a '/', which identifies a look-ahead break position in a
\r
366 n = pushNewNode(RBBINode.lookAhead);
\r
368 n.fFirstPos = fScanIndex;
\r
369 n.fLastPos = fNextIndex;
\r
370 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
371 fLookAheadRule = true;
\r
374 case RBBIRuleParseTable.doStartTagValue:
\r
375 // Scanned a '{', the opening delimiter for a tag value within a
\r
377 n = pushNewNode(RBBINode.tag);
\r
379 n.fFirstPos = fScanIndex;
\r
380 n.fLastPos = fNextIndex;
\r
383 case RBBIRuleParseTable.doTagDigit:
\r
384 // Just scanned a decimal digit that's part of a tag value
\r
386 n = fNodeStack[fNodeStackPtr];
\r
387 int v = UCharacter.digit((char) fC.fChar, 10);
\r
388 n.fVal = n.fVal * 10 + v;
\r
392 case RBBIRuleParseTable.doTagValue:
\r
393 n = fNodeStack[fNodeStackPtr];
\r
394 n.fLastPos = fNextIndex;
\r
395 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
398 case RBBIRuleParseTable.doTagExpectedError:
\r
399 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
\r
403 case RBBIRuleParseTable.doOptionStart:
\r
404 // Scanning a !!option. At the start of string.
\r
405 fOptionStart = fScanIndex;
\r
408 case RBBIRuleParseTable.doOptionEnd: {
\r
409 String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
\r
410 if (opt.equals("chain")) {
\r
411 fRB.fChainRules = true;
\r
412 } else if (opt.equals("LBCMNoChain")) {
\r
413 fRB.fLBCMNoChain = true;
\r
414 } else if (opt.equals("forward")) {
\r
415 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
\r
416 } else if (opt.equals("reverse")) {
\r
417 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
\r
418 } else if (opt.equals("safe_forward")) {
\r
419 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
\r
420 } else if (opt.equals("safe_reverse")) {
\r
421 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
\r
422 } else if (opt.equals("lookAheadHardBreak")) {
\r
423 fRB.fLookAheadHardBreak = true;
\r
425 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
\r
430 case RBBIRuleParseTable.doReverseDir:
\r
431 fReverseRule = true;
\r
434 case RBBIRuleParseTable.doStartVariableName:
\r
435 n = pushNewNode(RBBINode.varRef);
\r
436 n.fFirstPos = fScanIndex;
\r
439 case RBBIRuleParseTable.doEndVariableName:
\r
440 n = fNodeStack[fNodeStackPtr];
\r
441 if (n == null || n.fType != RBBINode.varRef) {
\r
442 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
445 n.fLastPos = fScanIndex;
\r
446 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
\r
447 // Look the newly scanned name up in the symbol table
\r
448 // If there's an entry, set the l. child of the var ref to the
\r
449 // replacement expression.
\r
450 // (We also pass through here when scanning assignments, but no harm
\r
452 // than a slight wasted effort that seems hard to avoid. Lookup will
\r
454 n.fLeftChild = fSymbolTable.lookupNode(n.fText);
\r
457 case RBBIRuleParseTable.doCheckVarDef:
\r
458 n = fNodeStack[fNodeStackPtr];
\r
459 if (n.fLeftChild == null) {
\r
460 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
\r
465 case RBBIRuleParseTable.doExprFinished:
\r
468 case RBBIRuleParseTable.doRuleErrorAssignExpr:
\r
469 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
\r
473 case RBBIRuleParseTable.doExit:
\r
477 case RBBIRuleParseTable.doScanUnicodeSet:
\r
482 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
489 //----------------------------------------------------------------------------------------
\r
491 // Error Throw and IllegalArgumentException in response to a rule parse
\r
494 //----------------------------------------------------------------------------------------
\r
495 void error(int e) {
\r
496 String s = "Error " + e + " at line " + fLineNum + " column "
\r
498 IllegalArgumentException ex = new IllegalArgumentException(s);
\r
503 //----------------------------------------------------------------------------------------
\r
505 // fixOpStack The parse stack holds partially assembled chunks of the parse
\r
507 // An entry on the stack may be as small as a single setRef node,
\r
508 // or as large as the parse tree
\r
509 // for an entire expression (this will be the one item left on the stack
\r
510 // when the parsing of an RBBI rule completes.
\r
512 // This function is called when a binary operator is encountered.
\r
513 // It looks back up the stack for operators that are not yet associated
\r
514 // with a right operand, and if the precedence of the stacked operator >=
\r
515 // the precedence of the current operator, binds the operand left,
\r
516 // to the previously encountered operator.
\r
518 //----------------------------------------------------------------------------------------
\r
519 void fixOpStack(int p) {
\r
521 // printNodeStack("entering fixOpStack()");
\r
523 n = fNodeStack[fNodeStackPtr - 1]; // an operator node
\r
524 if (n.fPrecedence == 0) {
\r
525 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
\r
526 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
530 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
\r
531 // The most recent operand goes with the current operator,
\r
532 // not with the previously stacked one.
\r
535 // Stack operator is a binary op ( '|' or concatenation)
\r
536 // TOS operand becomes right child of this operator.
\r
537 // Resulting subexpression becomes the TOS operand.
\r
538 n.fRightChild = fNodeStack[fNodeStackPtr];
\r
539 fNodeStack[fNodeStackPtr].fParent = n;
\r
541 // printNodeStack("looping in fixOpStack() ");
\r
544 if (p <= RBBINode.precLParen) {
\r
545 // Scan is at a right paren or end of expression.
\r
546 // The scanned item must match the stack, or else there was an
\r
548 // Discard the left paren (or start expr) node from the stack,
\r
549 // leaving the completed (sub)expression as TOS.
\r
550 if (n.fPrecedence != p) {
\r
551 // Right paren encountered matched start of expression node, or
\r
552 // end of expression matched with a left paren node.
\r
553 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
\r
555 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
\r
557 // Delete the now-discarded LParen or Start node.
\r
560 // printNodeStack("leaving fixOpStack()");
\r
563 //----------------------------------------------------------------------------
\r
565 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
\r
566 // been encountered. The val Node will be of nodetype uset
\r
567 // and contain pointers to the actual UnicodeSets.
\r
568 // The Key is the source string for initializing the set.
\r
570 // The hash table is used to avoid creating duplicate
\r
571 // unnamed (not $var references) UnicodeSets.
\r
573 //----------------------------------------------------------------------------
\r
574 static class RBBISetTableEl {
\r
581 //----------------------------------------------------------------------------------------
\r
583 // findSetFor given a String,
\r
584 // - find the corresponding Unicode Set (uset node)
\r
585 // (create one if necessary)
\r
586 // - Set fLeftChild of the caller's node (should be a setRef node)
\r
587 // to the uset node
\r
588 // Maintain a hash table of uset nodes, so the same one is always used
\r
589 // for the same string.
\r
590 // If a "to adopt" set is provided and we haven't seen this key before,
\r
591 // add the provided set to the hash table.
\r
592 // If the string is one (32 bit) char in length, the set contains
\r
593 // just one element which is the char in question.
\r
594 // If the string is "any", return a set containing all chars.
\r
596 //----------------------------------------------------------------------------------------
\r
597 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
\r
601 // First check whether we've already cached a set for this string.
\r
602 // If so, just use the cached set in the new node.
\r
603 // delete any set provided by the caller, since we own it.
\r
604 el = fSetTable.get(s);
\r
606 node.fLeftChild = el.val;
\r
607 Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
\r
611 // Haven't seen this set before.
\r
612 // If the caller didn't provide us with a prebuilt set,
\r
613 // create a new UnicodeSet now.
\r
614 if (setToAdopt == null) {
\r
615 if (s.equals(kAny)) {
\r
616 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
\r
619 c = UTF16.charAt(s, 0);
\r
620 setToAdopt = new UnicodeSet(c, c);
\r
625 // Make a new uset node to refer to this UnicodeSet
\r
626 // This new uset node becomes the child of the caller's setReference
\r
629 RBBINode usetNode = new RBBINode(RBBINode.uset);
\r
630 usetNode.fInputSet = setToAdopt;
\r
631 usetNode.fParent = node;
\r
632 node.fLeftChild = usetNode;
\r
633 usetNode.fText = s;
\r
636 // Add the new uset node to the list of all uset nodes.
\r
638 fRB.fUSetNodes.add(usetNode);
\r
641 // Add the new set to the set hash table.
\r
643 el = new RBBISetTableEl();
\r
646 fSetTable.put(el.key, el);
\r
652 // Assorted Unicode character constants.
\r
653 // Numeric because there is no portable way to enter them as literals.
\r
656 static final int chNEL = 0x85; // NEL newline variant
\r
658 static final int chLS = 0x2028; // Unicode Line Separator
\r
660 //----------------------------------------------------------------------------------------
\r
662 // stripRules Return a rules string without unnecessary
\r
665 //----------------------------------------------------------------------------------------
\r
666 static String stripRules(String rules) {
\r
667 StringBuilder strippedRules = new StringBuilder();
\r
668 int rulesLength = rules.length();
\r
669 for (int idx = 0; idx < rulesLength;) {
\r
670 char ch = rules.charAt(idx++);
\r
672 while (idx < rulesLength
\r
673 && ch != '\r' && ch != '\n' && ch != chNEL) {
\r
674 ch = rules.charAt(idx++);
\r
677 if (!UCharacter.isISOControl(ch)) {
\r
678 strippedRules.append(ch);
\r
681 return strippedRules.toString();
\r
684 //----------------------------------------------------------------------------------------
\r
686 // nextCharLL Low Level Next Char from rule input source.
\r
687 // Get a char from the input character iterator,
\r
688 // keep track of input position for error reporting.
\r
690 //----------------------------------------------------------------------------------------
\r
694 if (fNextIndex >= fRB.fRules.length()) {
\r
697 ch = UTF16.charAt(fRB.fRules, fNextIndex);
\r
698 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
\r
703 ch == '\n' && fLastChar != '\r') {
\r
704 // Character is starting a new line. Bump up the line number, and
\r
705 // reset the column to 0.
\r
709 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
\r
710 fQuoteMode = false;
\r
713 // Character is not starting a new line. Except in the case of a
\r
714 // LF following a CR, increment the column position.
\r
723 //---------------------------------------------------------------------------------
\r
725 // nextChar for rules scanning. At this level, we handle stripping
\r
726 // out comments and processing backslash character escapes.
\r
727 // The rest of the rules grammar is handled at the next level up.
\r
729 //---------------------------------------------------------------------------------
\r
730 void nextChar(RBBIRuleChar c) {
\r
732 // Unicode Character constants needed for the processing done by nextChar(),
\r
733 // in hex because literals wont work on EBCDIC machines.
\r
735 fScanIndex = fNextIndex;
\r
736 c.fChar = nextCharLL();
\r
737 c.fEscaped = false;
\r
740 // check for '' sequence.
\r
741 // These are recognized in all contexts, whether in quoted text or not.
\r
743 if (c.fChar == '\'') {
\r
744 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
\r
745 c.fChar = nextCharLL(); // get nextChar officially so character counts
\r
746 c.fEscaped = true; // stay correct.
\r
748 // Single quote, by itself.
\r
749 // Toggle quoting mode.
\r
750 // Return either '(' or ')', because quotes cause a grouping of the quoted text.
\r
751 fQuoteMode = !fQuoteMode;
\r
752 if (fQuoteMode == true) {
\r
757 c.fEscaped = false; // The paren that we return is not escaped.
\r
765 // We are not in a 'quoted region' of the source.
\r
767 if (c.fChar == '#') {
\r
768 // Start of a comment. Consume the rest of it.
\r
769 // The new-line char that terminates the comment is always returned.
\r
770 // It will be treated as white-space, and serves to break up anything
\r
771 // that might otherwise incorrectly clump together with a comment in
\r
772 // the middle (a variable name, for example.)
\r
774 c.fChar = nextCharLL();
\r
775 if (c.fChar == -1 || // EOF
\r
778 c.fChar == chNEL ||
\r
785 if (c.fChar == -1) {
\r
790 // check for backslash escaped characters.
\r
791 // Use String.unescapeAt() to handle them.
\r
793 if (c.fChar == '\\') {
\r
795 int[] unescapeIndex = new int[1];
\r
796 unescapeIndex[0] = fNextIndex;
\r
797 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
\r
798 if (unescapeIndex[0] == fNextIndex) {
\r
799 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
\r
802 fCharNum += unescapeIndex[0] - fNextIndex;
\r
803 fNextIndex = unescapeIndex[0];
\r
806 // putc(c.fChar, stdout);
\r
809 //---------------------------------------------------------------------------------
\r
811 // Parse RBBI rules. The state machine for rules parsing is here.
\r
812 // The state tables are hand-written in the file rbbirpt.txt,
\r
813 // and converted to the form used here by a perl
\r
814 // script rbbicst.pl
\r
816 //---------------------------------------------------------------------------------
\r
819 RBBIRuleParseTable.RBBIRuleTableElement tableEl;
\r
824 // Main loop for the rule parsing state machine.
\r
825 // Runs once per state transition.
\r
826 // Each time through optionally performs, depending on the state table,
\r
827 // - an advance to the the next input char
\r
828 // - an action to be performed.
\r
829 // - pushing or popping a state to/from the local state return stack.
\r
832 // Quit if state == 0. This is the normal way to exit the state machine.
\r
838 // Find the state table element that matches the input char from the rule, or the
\r
839 // class of the input character. Start with the first table row for this
\r
840 // state, then linearly scan forward until we find a row that matches the
\r
841 // character. The last row for each state always matches all characters, so
\r
842 // the search will stop there, if not before.
\r
844 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
\r
845 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
\r
846 System.out.println("char, line, col = (\'" + (char) fC.fChar
\r
847 + "\', " + fLineNum + ", " + fCharNum + " state = "
\r
848 + tableEl.fStateName);
\r
851 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
\r
852 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
\r
853 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
\r
854 System.out.print(".");
\r
856 if (tableEl.fCharClass < 127 && fC.fEscaped == false
\r
857 && tableEl.fCharClass == fC.fChar) {
\r
858 // Table row specified an individual character, not a set, and
\r
859 // the input character is not escaped, and
\r
860 // the input character matched it.
\r
863 if (tableEl.fCharClass == 255) {
\r
864 // Table row specified default, match anything character class.
\r
867 if (tableEl.fCharClass == 254 && fC.fEscaped) {
\r
868 // Table row specified "escaped" and the char was escaped.
\r
871 if (tableEl.fCharClass == 253 && fC.fEscaped
\r
872 && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
\r
873 // Table row specified "escaped P" and the char is either 'p' or 'P'.
\r
876 if (tableEl.fCharClass == 252 && fC.fChar == -1) {
\r
877 // Table row specified eof and we hit eof on the input.
\r
881 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
\r
882 fC.fEscaped == false && // char is not escaped &&
\r
883 fC.fChar != -1) { // char is not EOF
\r
884 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
\r
885 if (uniset.contains(fC.fChar)) {
\r
886 // Table row specified a character class, or set of characters,
\r
887 // and the current char matches it.
\r
893 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
\r
894 System.out.println("");
\r
897 // We've found the row of the state table that matches the current input
\r
898 // character from the rules string.
\r
899 // Perform any action specified by this row in the state table.
\r
900 if (doParseActions(tableEl.fAction) == false) {
\r
901 // Break out of the state machine loop if the
\r
902 // the action signalled some kind of error, or
\r
903 // the action was to exit, occurs on normal end-of-rules-input.
\r
907 if (tableEl.fPushState != 0) {
\r
909 if (fStackPtr >= kStackSize) {
\r
910 System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
\r
911 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
913 fStack[fStackPtr] = tableEl.fPushState;
\r
916 if (tableEl.fNextChar) {
\r
920 // Get the next state from the table entry, or from the
\r
921 // state stack if the next state was specified as "pop".
\r
922 if (tableEl.fNextState != 255) {
\r
923 state = tableEl.fNextState;
\r
925 state = fStack[fStackPtr];
\r
927 if (fStackPtr < 0) {
\r
928 System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
\r
929 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
936 // If there were NO user specified reverse rules, set up the equivalent of ".*;"
\r
938 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
\r
939 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
\r
940 RBBINode operand = pushNewNode(RBBINode.setRef);
\r
941 findSetFor(kAny, operand, null);
\r
942 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
\r
943 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
\r
944 fNodeStackPtr -= 2;
\r
948 // Parsing of the input RBBI rules is complete.
\r
949 // We now have a parse tree for the rule expressions
\r
950 // and a list of all UnicodeSets that are referenced.
\r
952 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
\r
953 fSymbolTable.rbbiSymtablePrint();
\r
955 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
\r
956 System.out.println("Completed Forward Rules Parse Tree...");
\r
957 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
\r
958 System.out.println("\nCompleted Reverse Rules Parse Tree...");
\r
959 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
\r
960 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
\r
961 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
\r
962 System.out.println(" -- null -- ");
\r
964 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
\r
966 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
\r
967 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
\r
968 System.out.println(" -- null -- ");
\r
970 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
\r
975 //---------------------------------------------------------------------------------
\r
977 // printNodeStack for debugging...
\r
979 //---------------------------------------------------------------------------------
\r
981 void printNodeStack(String title) {
\r
983 System.out.println(title + ". Dumping node stack...\n");
\r
984 for (i = fNodeStackPtr; i > 0; i--) {
\r
985 fNodeStack[i].printTree(true);
\r
990 //---------------------------------------------------------------------------------
\r
992 // pushNewNode create a new RBBINode of the specified type and push it
\r
993 // onto the stack of nodes.
\r
995 //---------------------------------------------------------------------------------
\r
996 RBBINode pushNewNode(int nodeType) {
\r
998 if (fNodeStackPtr >= kStackSize) {
\r
999 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
\r
1000 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
1002 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
\r
1003 return fNodeStack[fNodeStackPtr];
\r
1006 //---------------------------------------------------------------------------------
\r
1008 // scanSet Construct a UnicodeSet from the text at the current scan
\r
1009 // position. Advance the scan position to the first character
\r
1012 // A new RBBI setref node referring to the set is pushed onto the node
\r
1015 // The scan position is normally under the control of the state machine
\r
1016 // that controls rule parsing. UnicodeSets, however, are parsed by
\r
1017 // the UnicodeSet constructor, not by the RBBI rule parser.
\r
1019 //---------------------------------------------------------------------------------
\r
1021 UnicodeSet uset = null;
\r
1023 ParsePosition pos = new ParsePosition(fScanIndex);
\r
1026 startPos = fScanIndex;
\r
1028 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
\r
1029 } catch (Exception e) { // TODO: catch fewer exception types.
\r
1030 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
\r
1031 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
\r
1034 // Verify that the set contains at least one code point.
\r
1036 if (uset.isEmpty()) {
\r
1037 // This set is empty.
\r
1038 // Make it an error, because it almost certainly is not what the user wanted.
\r
1039 // Also, avoids having to think about corner cases in the tree manipulation code
\r
1040 // that occurs later on.
\r
1041 // TODO: this shouldn't be an error; it does happen.
\r
1042 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
\r
1045 // Advance the RBBI parse postion over the UnicodeSet pattern.
\r
1046 // Don't just set fScanIndex because the line/char positions maintained
\r
1047 // for error reporting would be thrown off.
\r
1048 i = pos.getIndex();
\r
1050 if (fNextIndex >= i) {
\r
1058 n = pushNewNode(RBBINode.setRef);
\r
1059 n.fFirstPos = startPos;
\r
1060 n.fLastPos = fNextIndex;
\r
1061 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
1062 // findSetFor() serves several purposes here:
\r
1063 // - Adopts storage for the UnicodeSet, will be responsible for deleting.
\r
1064 // - Mantains collection of all sets in use, needed later for establishing
\r
1065 // character categories for run time engine.
\r
1066 // - Eliminates mulitiple instances of the same set.
\r
1067 // - Creates a new uset node if necessary (if this isn't a duplicate.)
\r
1068 findSetFor(n.fText, n, uset);
\r