2 *******************************************************************************
\r
3 * Copyright (C) 2003-2008, International Business Machines Corporation and others. All Rights Reserved.
\r
4 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.util.HashMap;
\r
10 import java.lang.IllegalArgumentException;
\r
11 import java.text.ParsePosition;
\r
13 import com.ibm.icu.lang.UCharacter;
\r
14 import com.ibm.icu.impl.Assert;
\r
15 import com.ibm.icu.impl.Utility;
\r
18 * This class is part of the Rule Based Break Iterator rule compiler.
\r
19 * It scans the rules and builds the parse tree.
\r
20 * There is no public API here.
\r
23 class RBBIRuleScanner {
\r
25 private final int kStackSize = 100; // The size of the state stack for
\r
26 // rules parsing. Corresponds roughly
\r
27 // to the depth of parentheses nesting
\r
28 // that is allowed in the rules.
\r
30 static class RBBIRuleChar {
\r
37 RBBIRuleBuilder fRB; // The rule builder that we are part of.
\r
39 int fScanIndex; // Index of current character being processed
\r
40 // in the rule input string.
\r
41 int fNextIndex; // Index of the next character, which
\r
42 // is the first character not yet scanned.
\r
43 boolean fQuoteMode; // Scan is in a 'quoted region'
\r
44 int fLineNum; // Line number in input file.
\r
45 int fCharNum; // Char position within the line.
\r
46 int fLastChar; // Previous char, needed to count CR-LF
\r
47 // as a single line, not two.
\r
49 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
\r
51 String fVarName; // $variableName, valid when we've just
\r
55 short fStack[] = new short[kStackSize]; // State stack, holds state pushes
\r
56 int fStackPtr; // and pops as specified in the state
\r
57 // transition rules.
\r
59 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
\r
60 // during the parse of a rule
\r
64 boolean fReverseRule; // True if the rule currently being scanned
\r
65 // is a reverse direction rule (if it
\r
66 // starts with a '!')
\r
68 boolean fLookAheadRule; // True if the rule includes a '/'
\r
69 // somewhere within it.
\r
71 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
\r
72 // $variable symbols.
\r
74 HashMap fSetTable = new HashMap(); // UnicocodeSet hash table, holds indexes to
\r
75 // the sets created while parsing rules.
\r
76 // The key is the string used for creating
\r
79 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
\r
80 // the scanning of RBBI rules. The
\r
81 // indicies for these are assigned by the
\r
82 // perl script that builds the state tables.
\r
85 int fRuleNum; // Counts each rule as it is scanned.
\r
87 int fOptionStart; // Input index of start of a !!option
\r
88 // keyword, while being scanned.
\r
92 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
\r
93 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
\r
94 static private String gRuleSet_digit_char_pattern = "[0-9]";
\r
95 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
\r
96 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
\r
97 static private String kAny = "any";
\r
102 //----------------------------------------------------------------------------------------
\r
106 //----------------------------------------------------------------------------------------
\r
107 RBBIRuleScanner(RBBIRuleBuilder rb) {
\r
112 // Set up the constant Unicode Sets.
\r
113 // Note: These could be made static and shared among
\r
114 // all instances of RBBIRuleScanners.
\r
115 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern);
\r
116 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern);
\r
117 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern);
\r
118 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
\r
119 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern);
\r
121 fSymbolTable = new RBBISymbolTable(this, rb.fRules);
\r
124 //----------------------------------------------------------------------------------------
\r
126 // doParseAction Do some action during rule parsing.
\r
127 // Called by the parse state machine.
\r
128 // Actions build the parse tree and Unicode Sets,
\r
129 // and maintain the parse stack for nested expressions.
\r
131 //----------------------------------------------------------------------------------------
\r
132 boolean doParseActions(int action) {
\r
135 boolean returnVal = true;
\r
139 case RBBIRuleParseTable.doExprStart:
\r
140 pushNewNode(RBBINode.opStart);
\r
144 case RBBIRuleParseTable.doExprOrOperator: {
\r
145 fixOpStack(RBBINode.precOpCat);
\r
146 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
147 RBBINode orNode = pushNewNode(RBBINode.opOr);
\r
148 orNode.fLeftChild = operandNode;
\r
149 operandNode.fParent = orNode;
\r
153 case RBBIRuleParseTable.doExprCatOperator:
\r
154 // concatenation operator.
\r
155 // For the implicit concatenation of adjacent terms in an expression
\r
157 // not separated by any other operator. Action is invoked between the
\r
158 // actions for the two terms.
\r
160 fixOpStack(RBBINode.precOpCat);
\r
161 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
162 RBBINode catNode = pushNewNode(RBBINode.opCat);
\r
163 catNode.fLeftChild = operandNode;
\r
164 operandNode.fParent = catNode;
\r
168 case RBBIRuleParseTable.doLParen:
\r
170 // The openParen node is a dummy operation type with a low
\r
172 // which has the affect of ensuring that any real binary op that
\r
173 // follows within the parens binds more tightly to the operands than
\r
174 // stuff outside of the parens.
\r
175 pushNewNode(RBBINode.opLParen);
\r
178 case RBBIRuleParseTable.doExprRParen:
\r
179 fixOpStack(RBBINode.precLParen);
\r
182 case RBBIRuleParseTable.doNOP:
\r
185 case RBBIRuleParseTable.doStartAssign:
\r
186 // We've just scanned "$variable = "
\r
187 // The top of the node stack has the $variable ref node.
\r
189 // Save the start position of the RHS text in the StartExpression
\r
191 // that precedes the $variableReference node on the stack.
\r
192 // This will eventually be used when saving the full $variable
\r
194 // text as a string.
\r
195 n = fNodeStack[fNodeStackPtr - 1];
\r
196 n.fFirstPos = fNextIndex; // move past the '='
\r
198 // Push a new start-of-expression node; needed to keep parse of the
\r
199 // RHS expression happy.
\r
200 pushNewNode(RBBINode.opStart);
\r
203 case RBBIRuleParseTable.doEndAssign: {
\r
204 // We have reached the end of an assignement statement.
\r
205 // Current scan char is the ';' that terminates the assignment.
\r
207 // Terminate expression, leaves expression parse tree rooted in TOS
\r
209 fixOpStack(RBBINode.precStart);
\r
211 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
\r
212 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
\r
213 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
\r
215 // Save original text of right side of assignment, excluding the
\r
217 // in the root of the node for the right-hand-side expression.
\r
218 RHSExprNode.fFirstPos = startExprNode.fFirstPos;
\r
219 RHSExprNode.fLastPos = fScanIndex;
\r
220 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
\r
221 // RHSExprNode.fLastPos, RHSExprNode.fText);
\r
222 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos,
\r
223 RHSExprNode.fLastPos);
\r
225 // Expression parse tree becomes l. child of the $variable reference
\r
227 varRefNode.fLeftChild = RHSExprNode;
\r
228 RHSExprNode.fParent = varRefNode;
\r
230 // Make a symbol table entry for the $variableRef node.
\r
231 fSymbolTable.addEntry(varRefNode.fText, varRefNode);
\r
233 // Clean up the stack.
\r
234 fNodeStackPtr -= 3;
\r
238 case RBBIRuleParseTable.doEndOfRule: {
\r
239 fixOpStack(RBBINode.precStart); // Terminate expression, leaves
\r
242 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
\r
243 printNodeStack("end of rule");
\r
245 Assert.assrt(fNodeStackPtr == 1);
\r
247 // If this rule includes a look-ahead '/', add a endMark node to the
\r
248 // expression tree.
\r
249 if (fLookAheadRule) {
\r
250 RBBINode thisRule = fNodeStack[fNodeStackPtr];
\r
251 RBBINode endNode = pushNewNode(RBBINode.endMark);
\r
252 RBBINode catNode = pushNewNode(RBBINode.opCat);
\r
253 fNodeStackPtr -= 2;
\r
254 catNode.fLeftChild = thisRule;
\r
255 catNode.fRightChild = endNode;
\r
256 fNodeStack[fNodeStackPtr] = catNode;
\r
257 endNode.fVal = fRuleNum;
\r
258 endNode.fLookAheadEnd = true;
\r
261 // All rule expressions are ORed together.
\r
262 // The ';' that terminates an expression really just functions as a
\r
264 // a low operator prededence.
\r
266 // Each of the four sets of rules are collected separately.
\r
267 // (forward, reverse, safe_forward, safe_reverse)
\r
268 // OR this rule into the appropriate group of them.
\r
271 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
\r
273 if (fRB.fTreeRoots[destRules] != null) {
\r
274 // This is not the first rule encounted.
\r
275 // OR previous stuff (from *destRules)
\r
276 // with the current rule expression (on the Node Stack)
\r
277 // with the resulting OR expression going to *destRules
\r
279 RBBINode thisRule = fNodeStack[fNodeStackPtr];
\r
280 RBBINode prevRules = fRB.fTreeRoots[destRules];
\r
281 RBBINode orNode = pushNewNode(RBBINode.opOr);
\r
282 orNode.fLeftChild = prevRules;
\r
283 prevRules.fParent = orNode;
\r
284 orNode.fRightChild = thisRule;
\r
285 thisRule.fParent = orNode;
\r
286 fRB.fTreeRoots[destRules] = orNode;
\r
288 // This is the first rule encountered (for this direction).
\r
289 // Just move its parse tree from the stack to *destRules.
\r
290 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
\r
292 fReverseRule = false; // in preparation for the next rule.
\r
293 fLookAheadRule = false;
\r
298 case RBBIRuleParseTable.doRuleError:
\r
299 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
\r
303 case RBBIRuleParseTable.doVariableNameExpectedErr:
\r
304 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
\r
308 // Unary operands + ? *
\r
309 // These all appear after the operand to which they apply.
\r
310 // When we hit one, the operand (may be a whole sub expression)
\r
311 // will be on the top of the stack.
\r
312 // Unary Operator becomes TOS, with the old TOS as its one child.
\r
313 case RBBIRuleParseTable.doUnaryOpPlus: {
\r
314 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
315 RBBINode plusNode = pushNewNode(RBBINode.opPlus);
\r
316 plusNode.fLeftChild = operandNode;
\r
317 operandNode.fParent = plusNode;
\r
321 case RBBIRuleParseTable.doUnaryOpQuestion: {
\r
322 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
323 RBBINode qNode = pushNewNode(RBBINode.opQuestion);
\r
324 qNode.fLeftChild = operandNode;
\r
325 operandNode.fParent = qNode;
\r
329 case RBBIRuleParseTable.doUnaryOpStar: {
\r
330 RBBINode operandNode = fNodeStack[fNodeStackPtr--];
\r
331 RBBINode starNode = pushNewNode(RBBINode.opStar);
\r
332 starNode.fLeftChild = operandNode;
\r
333 operandNode.fParent = starNode;
\r
337 case RBBIRuleParseTable.doRuleChar:
\r
338 // A "Rule Character" is any single character that is a literal part
\r
339 // of the regular expression. Like a, b and c in the expression "(abc*)
\r
341 // These are pretty uncommon in break rules; the terms are more commonly
\r
342 // sets. To keep things uniform, treat these characters like as
\r
343 // sets that just happen to contain only one character.
\r
345 n = pushNewNode(RBBINode.setRef);
\r
346 String s = (new StringBuffer().append((char) fC.fChar)).toString();
\r
347 findSetFor(s, n, null);
\r
348 n.fFirstPos = fScanIndex;
\r
349 n.fLastPos = fNextIndex;
\r
350 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
354 case RBBIRuleParseTable.doDotAny:
\r
355 // scanned a ".", meaning match any single character.
\r
357 n = pushNewNode(RBBINode.setRef);
\r
358 findSetFor(kAny, n, null);
\r
359 n.fFirstPos = fScanIndex;
\r
360 n.fLastPos = fNextIndex;
\r
361 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
365 case RBBIRuleParseTable.doSlash:
\r
366 // Scanned a '/', which identifies a look-ahead break position in a
\r
368 n = pushNewNode(RBBINode.lookAhead);
\r
370 n.fFirstPos = fScanIndex;
\r
371 n.fLastPos = fNextIndex;
\r
372 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
373 fLookAheadRule = true;
\r
376 case RBBIRuleParseTable.doStartTagValue:
\r
377 // Scanned a '{', the opening delimiter for a tag value within a
\r
379 n = pushNewNode(RBBINode.tag);
\r
381 n.fFirstPos = fScanIndex;
\r
382 n.fLastPos = fNextIndex;
\r
385 case RBBIRuleParseTable.doTagDigit:
\r
386 // Just scanned a decimal digit that's part of a tag value
\r
388 n = fNodeStack[fNodeStackPtr];
\r
389 int v = UCharacter.digit((char) fC.fChar, 10);
\r
390 n.fVal = n.fVal * 10 + v;
\r
394 case RBBIRuleParseTable.doTagValue:
\r
395 n = fNodeStack[fNodeStackPtr];
\r
396 n.fLastPos = fNextIndex;
\r
397 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
400 case RBBIRuleParseTable.doTagExpectedError:
\r
401 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
\r
405 case RBBIRuleParseTable.doOptionStart:
\r
406 // Scanning a !!option. At the start of string.
\r
407 fOptionStart = fScanIndex;
\r
410 case RBBIRuleParseTable.doOptionEnd: {
\r
411 String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
\r
412 if (opt.equals("chain")) {
\r
413 fRB.fChainRules = true;
\r
414 } else if (opt.equals("LBCMNoChain")) {
\r
415 fRB.fLBCMNoChain = true;
\r
416 } else if (opt.equals("forward")) {
\r
417 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
\r
418 } else if (opt.equals("reverse")) {
\r
419 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
\r
420 } else if (opt.equals("safe_forward")) {
\r
421 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
\r
422 } else if (opt.equals("safe_reverse")) {
\r
423 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
\r
424 } else if (opt.equals("lookAheadHardBreak")) {
\r
425 fRB.fLookAheadHardBreak = true;
\r
427 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
\r
432 case RBBIRuleParseTable.doReverseDir:
\r
433 fReverseRule = true;
\r
436 case RBBIRuleParseTable.doStartVariableName:
\r
437 n = pushNewNode(RBBINode.varRef);
\r
438 n.fFirstPos = fScanIndex;
\r
441 case RBBIRuleParseTable.doEndVariableName:
\r
442 n = fNodeStack[fNodeStackPtr];
\r
443 if (n == null || n.fType != RBBINode.varRef) {
\r
444 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
447 n.fLastPos = fScanIndex;
\r
448 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
\r
449 // Look the newly scanned name up in the symbol table
\r
450 // If there's an entry, set the l. child of the var ref to the
\r
451 // replacement expression.
\r
452 // (We also pass through here when scanning assignments, but no harm
\r
454 // than a slight wasted effort that seems hard to avoid. Lookup will
\r
456 n.fLeftChild = fSymbolTable.lookupNode(n.fText);
\r
459 case RBBIRuleParseTable.doCheckVarDef:
\r
460 n = fNodeStack[fNodeStackPtr];
\r
461 if (n.fLeftChild == null) {
\r
462 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
\r
467 case RBBIRuleParseTable.doExprFinished:
\r
470 case RBBIRuleParseTable.doRuleErrorAssignExpr:
\r
471 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
\r
475 case RBBIRuleParseTable.doExit:
\r
479 case RBBIRuleParseTable.doScanUnicodeSet:
\r
484 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
491 //----------------------------------------------------------------------------------------
\r
493 // Error Throw and IllegalArgumentException in response to a rule parse
\r
496 //----------------------------------------------------------------------------------------
\r
497 void error(int e) {
\r
498 String s = "Error " + e + " at line " + fLineNum + " column "
\r
500 IllegalArgumentException ex = new IllegalArgumentException(s);
\r
505 //----------------------------------------------------------------------------------------
\r
507 // fixOpStack The parse stack holds partially assembled chunks of the parse
\r
509 // An entry on the stack may be as small as a single setRef node,
\r
510 // or as large as the parse tree
\r
511 // for an entire expression (this will be the one item left on the stack
\r
512 // when the parsing of an RBBI rule completes.
\r
514 // This function is called when a binary operator is encountered.
\r
515 // It looks back up the stack for operators that are not yet associated
\r
516 // with a right operand, and if the precedence of the stacked operator >=
\r
517 // the precedence of the current operator, binds the operand left,
\r
518 // to the previously encountered operator.
\r
520 //----------------------------------------------------------------------------------------
\r
521 void fixOpStack(int p) {
\r
523 // printNodeStack("entering fixOpStack()");
\r
525 n = fNodeStack[fNodeStackPtr - 1]; // an operator node
\r
526 if (n.fPrecedence == 0) {
\r
527 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
\r
528 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
532 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
\r
533 // The most recent operand goes with the current operator,
\r
534 // not with the previously stacked one.
\r
537 // Stack operator is a binary op ( '|' or concatenation)
\r
538 // TOS operand becomes right child of this operator.
\r
539 // Resulting subexpression becomes the TOS operand.
\r
540 n.fRightChild = fNodeStack[fNodeStackPtr];
\r
541 fNodeStack[fNodeStackPtr].fParent = n;
\r
543 // printNodeStack("looping in fixOpStack() ");
\r
546 if (p <= RBBINode.precLParen) {
\r
547 // Scan is at a right paren or end of expression.
\r
548 // The scanned item must match the stack, or else there was an
\r
550 // Discard the left paren (or start expr) node from the stack,
\r
551 // leaving the completed (sub)expression as TOS.
\r
552 if (n.fPrecedence != p) {
\r
553 // Right paren encountered matched start of expression node, or
\r
554 // end of expression matched with a left paren node.
\r
555 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
\r
557 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
\r
559 // Delete the now-discarded LParen or Start node.
\r
562 // printNodeStack("leaving fixOpStack()");
\r
565 //----------------------------------------------------------------------------
\r
567 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
\r
568 // been encountered. The val Node will be of nodetype uset
\r
569 // and contain pointers to the actual UnicodeSets.
\r
570 // The Key is the source string for initializing the set.
\r
572 // The hash table is used to avoid creating duplicate
\r
573 // unnamed (not $var references) UnicodeSets.
\r
575 //----------------------------------------------------------------------------
\r
576 static class RBBISetTableEl {
\r
583 //----------------------------------------------------------------------------------------
\r
585 // findSetFor given a String,
\r
586 // - find the corresponding Unicode Set (uset node)
\r
587 // (create one if necessary)
\r
588 // - Set fLeftChild of the caller's node (should be a setRef node)
\r
589 // to the uset node
\r
590 // Maintain a hash table of uset nodes, so the same one is always used
\r
591 // for the same string.
\r
592 // If a "to adopt" set is provided and we haven't seen this key before,
\r
593 // add the provided set to the hash table.
\r
594 // If the string is one (32 bit) char in length, the set contains
\r
595 // just one element which is the char in question.
\r
596 // If the string is "any", return a set containing all chars.
\r
598 //----------------------------------------------------------------------------------------
\r
599 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
\r
603 // First check whether we've already cached a set for this string.
\r
604 // If so, just use the cached set in the new node.
\r
605 // delete any set provided by the caller, since we own it.
\r
606 el = (RBBISetTableEl) fSetTable.get(s);
\r
608 node.fLeftChild = el.val;
\r
609 Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
\r
613 // Haven't seen this set before.
\r
614 // If the caller didn't provide us with a prebuilt set,
\r
615 // create a new UnicodeSet now.
\r
616 if (setToAdopt == null) {
\r
617 if (s.equals(kAny)) {
\r
618 setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
\r
621 c = UTF16.charAt(s, 0);
\r
622 setToAdopt = new UnicodeSet(c, c);
\r
627 // Make a new uset node to refer to this UnicodeSet
\r
628 // This new uset node becomes the child of the caller's setReference
\r
631 RBBINode usetNode = new RBBINode(RBBINode.uset);
\r
632 usetNode.fInputSet = setToAdopt;
\r
633 usetNode.fParent = node;
\r
634 node.fLeftChild = usetNode;
\r
635 usetNode.fText = s;
\r
638 // Add the new uset node to the list of all uset nodes.
\r
640 fRB.fUSetNodes.add(usetNode);
\r
643 // Add the new set to the set hash table.
\r
645 el = new RBBISetTableEl();
\r
648 fSetTable.put(el.key, el);
\r
654 // Assorted Unicode character constants.
\r
655 // Numeric because there is no portable way to enter them as literals.
\r
658 static final int chNEL = 0x85; // NEL newline variant
\r
660 static final int chLS = 0x2028; // Unicode Line Separator
\r
662 //----------------------------------------------------------------------------------------
\r
664 // stripRules Return a rules string without unnecessary
\r
667 //----------------------------------------------------------------------------------------
\r
668 static String stripRules(String rules) {
\r
669 StringBuffer strippedRules = new StringBuffer();
\r
670 int rulesLength = rules.length();
\r
671 for (int idx = 0; idx < rulesLength;) {
\r
672 char ch = rules.charAt(idx++);
\r
674 while (idx < rulesLength
\r
675 && ch != '\r' && ch != '\n' && ch != chNEL) {
\r
676 ch = rules.charAt(idx++);
\r
679 if (!UCharacter.isISOControl(ch)) {
\r
680 strippedRules.append(ch);
\r
683 return strippedRules.toString();
\r
686 //----------------------------------------------------------------------------------------
\r
688 // nextCharLL Low Level Next Char from rule input source.
\r
689 // Get a char from the input character iterator,
\r
690 // keep track of input position for error reporting.
\r
692 //----------------------------------------------------------------------------------------
\r
696 if (fNextIndex >= fRB.fRules.length()) {
\r
699 ch = UTF16.charAt(fRB.fRules, fNextIndex);
\r
700 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
\r
705 ch == '\n' && fLastChar != '\r') {
\r
706 // Character is starting a new line. Bump up the line number, and
\r
707 // reset the column to 0.
\r
711 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
\r
712 fQuoteMode = false;
\r
715 // Character is not starting a new line. Except in the case of a
\r
716 // LF following a CR, increment the column position.
\r
725 //---------------------------------------------------------------------------------
\r
727 // nextChar for rules scanning. At this level, we handle stripping
\r
728 // out comments and processing backslash character escapes.
\r
729 // The rest of the rules grammar is handled at the next level up.
\r
731 //---------------------------------------------------------------------------------
\r
732 void nextChar(RBBIRuleChar c) {
\r
734 // Unicode Character constants needed for the processing done by nextChar(),
\r
735 // in hex because literals wont work on EBCDIC machines.
\r
737 fScanIndex = fNextIndex;
\r
738 c.fChar = nextCharLL();
\r
739 c.fEscaped = false;
\r
742 // check for '' sequence.
\r
743 // These are recognized in all contexts, whether in quoted text or not.
\r
745 if (c.fChar == '\'') {
\r
746 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
\r
747 c.fChar = nextCharLL(); // get nextChar officially so character counts
\r
748 c.fEscaped = true; // stay correct.
\r
750 // Single quote, by itself.
\r
751 // Toggle quoting mode.
\r
752 // Return either '(' or ')', because quotes cause a grouping of the quoted text.
\r
753 fQuoteMode = !fQuoteMode;
\r
754 if (fQuoteMode == true) {
\r
759 c.fEscaped = false; // The paren that we return is not escaped.
\r
767 // We are not in a 'quoted region' of the source.
\r
769 if (c.fChar == '#') {
\r
770 // Start of a comment. Consume the rest of it.
\r
771 // The new-line char that terminates the comment is always returned.
\r
772 // It will be treated as white-space, and serves to break up anything
\r
773 // that might otherwise incorrectly clump together with a comment in
\r
774 // the middle (a variable name, for example.)
\r
776 c.fChar = nextCharLL();
\r
777 if (c.fChar == (int) -1 || // EOF
\r
780 c.fChar == chNEL ||
\r
787 if (c.fChar == (int) -1) {
\r
792 // check for backslash escaped characters.
\r
793 // Use String.unescapeAt() to handle them.
\r
795 if (c.fChar == '\\') {
\r
797 int[] unescapeIndex = new int[1];
\r
798 unescapeIndex[0] = fNextIndex;
\r
799 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
\r
800 if (unescapeIndex[0] == fNextIndex) {
\r
801 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
\r
804 fCharNum += unescapeIndex[0] - fNextIndex;
\r
805 fNextIndex = unescapeIndex[0];
\r
808 // putc(c.fChar, stdout);
\r
811 //---------------------------------------------------------------------------------
\r
813 // Parse RBBI rules. The state machine for rules parsing is here.
\r
814 // The state tables are hand-written in the file rbbirpt.txt,
\r
815 // and converted to the form used here by a perl
\r
816 // script rbbicst.pl
\r
818 //---------------------------------------------------------------------------------
\r
821 RBBIRuleParseTable.RBBIRuleTableElement tableEl;
\r
826 // Main loop for the rule parsing state machine.
\r
827 // Runs once per state transition.
\r
828 // Each time through optionally performs, depending on the state table,
\r
829 // - an advance to the the next input char
\r
830 // - an action to be performed.
\r
831 // - pushing or popping a state to/from the local state return stack.
\r
834 // Quit if state == 0. This is the normal way to exit the state machine.
\r
840 // Find the state table element that matches the input char from the rule, or the
\r
841 // class of the input character. Start with the first table row for this
\r
842 // state, then linearly scan forward until we find a row that matches the
\r
843 // character. The last row for each state always matches all characters, so
\r
844 // the search will stop there, if not before.
\r
846 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
\r
847 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
\r
848 System.out.println("char, line, col = (\'" + (char) fC.fChar
\r
849 + "\', " + fLineNum + ", " + fCharNum + " state = "
\r
850 + tableEl.fStateName);
\r
853 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
\r
854 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
\r
855 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
\r
856 System.out.print(".");
\r
858 if (tableEl.fCharClass < 127 && fC.fEscaped == false
\r
859 && tableEl.fCharClass == fC.fChar) {
\r
860 // Table row specified an individual character, not a set, and
\r
861 // the input character is not escaped, and
\r
862 // the input character matched it.
\r
865 if (tableEl.fCharClass == 255) {
\r
866 // Table row specified default, match anything character class.
\r
869 if (tableEl.fCharClass == 254 && fC.fEscaped) {
\r
870 // Table row specified "escaped" and the char was escaped.
\r
873 if (tableEl.fCharClass == 253 && fC.fEscaped
\r
874 && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
\r
875 // Table row specified "escaped P" and the char is either 'p' or 'P'.
\r
878 if (tableEl.fCharClass == 252 && fC.fChar == (int) -1) {
\r
879 // Table row specified eof and we hit eof on the input.
\r
883 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
\r
884 fC.fEscaped == false && // char is not escaped &&
\r
885 fC.fChar != (int) -1) { // char is not EOF
\r
886 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
\r
887 if (uniset.contains(fC.fChar)) {
\r
888 // Table row specified a character class, or set of characters,
\r
889 // and the current char matches it.
\r
895 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
\r
896 System.out.println("");
\r
899 // We've found the row of the state table that matches the current input
\r
900 // character from the rules string.
\r
901 // Perform any action specified by this row in the state table.
\r
902 if (doParseActions(tableEl.fAction) == false) {
\r
903 // Break out of the state machine loop if the
\r
904 // the action signalled some kind of error, or
\r
905 // the action was to exit, occurs on normal end-of-rules-input.
\r
909 if (tableEl.fPushState != 0) {
\r
911 if (fStackPtr >= kStackSize) {
\r
912 System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
\r
913 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
915 fStack[fStackPtr] = tableEl.fPushState;
\r
918 if (tableEl.fNextChar) {
\r
922 // Get the next state from the table entry, or from the
\r
923 // state stack if the next state was specified as "pop".
\r
924 if (tableEl.fNextState != 255) {
\r
925 state = tableEl.fNextState;
\r
927 state = fStack[fStackPtr];
\r
929 if (fStackPtr < 0) {
\r
930 System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
\r
931 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
938 // If there were NO user specified reverse rules, set up the equivalent of ".*;"
\r
940 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
\r
941 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
\r
942 RBBINode operand = pushNewNode(RBBINode.setRef);
\r
943 findSetFor(kAny, operand, null);
\r
944 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
\r
945 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
\r
946 fNodeStackPtr -= 2;
\r
950 // Parsing of the input RBBI rules is complete.
\r
951 // We now have a parse tree for the rule expressions
\r
952 // and a list of all UnicodeSets that are referenced.
\r
954 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
\r
955 fSymbolTable.rbbiSymtablePrint();
\r
957 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
\r
958 System.out.println("Completed Forward Rules Parse Tree...");
\r
959 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
\r
960 System.out.println("\nCompleted Reverse Rules Parse Tree...");
\r
961 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
\r
962 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
\r
963 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
\r
964 System.out.println(" -- null -- ");
\r
966 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
\r
968 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
\r
969 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
\r
970 System.out.println(" -- null -- ");
\r
972 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
\r
977 //---------------------------------------------------------------------------------
\r
979 // printNodeStack for debugging...
\r
981 //---------------------------------------------------------------------------------
\r
983 void printNodeStack(String title) {
\r
985 System.out.println(title + ". Dumping node stack...\n");
\r
986 for (i = fNodeStackPtr; i > 0; i--) {
\r
987 fNodeStack[i].printTree(true);
\r
992 //---------------------------------------------------------------------------------
\r
994 // pushNewNode create a new RBBINode of the specified type and push it
\r
995 // onto the stack of nodes.
\r
997 //---------------------------------------------------------------------------------
\r
998 RBBINode pushNewNode(int nodeType) {
\r
1000 if (fNodeStackPtr >= kStackSize) {
\r
1001 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
\r
1002 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
\r
1004 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
\r
1005 return fNodeStack[fNodeStackPtr];
\r
1008 //---------------------------------------------------------------------------------
\r
1010 // scanSet Construct a UnicodeSet from the text at the current scan
\r
1011 // position. Advance the scan position to the first character
\r
1014 // A new RBBI setref node referring to the set is pushed onto the node
\r
1017 // The scan position is normally under the control of the state machine
\r
1018 // that controls rule parsing. UnicodeSets, however, are parsed by
\r
1019 // the UnicodeSet constructor, not by the RBBI rule parser.
\r
1021 //---------------------------------------------------------------------------------
\r
1023 UnicodeSet uset = null;
\r
1025 ParsePosition pos = new ParsePosition(fScanIndex);
\r
1028 startPos = fScanIndex;
\r
1030 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
\r
1031 } catch (Exception e) { // TODO: catch fewer exception types.
\r
1032 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
\r
1033 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
\r
1036 // Verify that the set contains at least one code point.
\r
1038 if (uset.isEmpty()) {
\r
1039 // This set is empty.
\r
1040 // Make it an error, because it almost certainly is not what the user wanted.
\r
1041 // Also, avoids having to think about corner cases in the tree manipulation code
\r
1042 // that occurs later on.
\r
1043 // TODO: this shouldn't be an error; it does happen.
\r
1044 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
\r
1047 // Advance the RBBI parse postion over the UnicodeSet pattern.
\r
1048 // Don't just set fScanIndex because the line/char positions maintained
\r
1049 // for error reporting would be thrown off.
\r
1050 i = pos.getIndex();
\r
1052 if (fNextIndex >= i) {
\r
1060 n = pushNewNode(RBBINode.setRef);
\r
1061 n.fFirstPos = startPos;
\r
1062 n.fLastPos = fNextIndex;
\r
1063 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
\r
1064 // findSetFor() serves several purposes here:
\r
1065 // - Adopts storage for the UnicodeSet, will be responsible for deleting.
\r
1066 // - Mantains collection of all sets in use, needed later for establishing
\r
1067 // character categories for run time engine.
\r
1068 // - Eliminates mulitiple instances of the same set.
\r
1069 // - Creates a new uset node if necessary (if this isn't a duplicate.)
\r
1070 findSetFor(n.fText, n, uset);
\r