3 **********************************************************************
4 * Copyright (c) 2001-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 **********************************************************************
8 package com.ibm.icu.text;
10 import com.ibm.icu.impl.IllegalIcuArgumentException;
11 import com.ibm.icu.impl.Utility;
13 import java.util.ArrayList;
14 import java.util.List;
15 import java.util.Vector;
16 import java.util.Hashtable;
17 import java.text.ParsePosition;
18 import com.ibm.icu.lang.*;
19 import com.ibm.icu.impl.UCharacterProperty;
21 class TransliteratorParser {
23 //----------------------------------------------------------------------
25 //----------------------------------------------------------------------
29 * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
30 * of rules in the rule set
32 public Vector dataVector;
36 * A Vector of Strings containing all of the ID blocks in the rule set
38 public Vector idBlockVector;
41 * The current data object for which we are parsing rules
43 private RuleBasedTransliterator.Data curData;
46 * PUBLIC data member containing the parsed compound filter, if any.
48 public UnicodeSet compoundFilter;
51 private int direction;
54 * Temporary symbol table used during parsing.
56 private ParseData parseData;
59 * Temporary vector of set variables. When parsing is complete, this
60 * is copied into the array data.variables. As with data.variables,
61 * element 0 corresponds to character data.variablesBase.
63 private Vector variablesVector;
66 * Temporary table of variable names. When parsing is complete, this is
67 * copied into data.variableNames.
69 private Hashtable variableNames;
72 * String of standins for segments. Used during the parsing of a single
73 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
74 * to StringMatcher object segmentObjects.elementAt(0), etc.
76 private StringBuffer segmentStandins;
79 * Vector of StringMatcher objects for segments. Used during the
80 * parsing of a single rule.
81 * segmentStandins.charAt(0) is the standin for "$1" and corresponds
82 * to StringMatcher object segmentObjects.elementAt(0), etc.
84 private Vector segmentObjects;
87 * The next available stand-in for variables. This starts at some point in
88 * the private use area (discovered dynamically) and increments up toward
89 * <code>variableLimit</code>. At any point during parsing, available
90 * variables are <code>variableNext..variableLimit-1</code>.
92 private char variableNext;
95 * The last available stand-in for variables. This is discovered
96 * dynamically. At any point during parsing, available variables are
97 * <code>variableNext..variableLimit-1</code>. During variable definition
98 * we use the special value variableLimit-1 as a placeholder.
100 private char variableLimit;
103 * When we encounter an undefined variable, we do not immediately signal
104 * an error, in case we are defining this variable, e.g., "$a = [a-z];".
105 * Instead, we save the name of the undefined variable, and substitute
106 * in the placeholder char variableLimit - 1, and decrement
109 private String undefinedVariableName;
112 * The stand-in character for the 'dot' set, represented by '.' in
113 * patterns. This is allocated the first time it is needed, and
116 private int dotStandIn = -1;
118 //----------------------------------------------------------------------
120 //----------------------------------------------------------------------
122 // Indicator for ID blocks
123 private static final String ID_TOKEN = "::";
124 private static final int ID_TOKEN_LEN = 2;
127 (reserved for future expansion)
128 // markers for beginning and end of rule groups
129 private static final String BEGIN_TOKEN = "BEGIN";
130 private static final String END_TOKEN = "END";
134 private static final char VARIABLE_DEF_OP = '=';
135 private static final char FORWARD_RULE_OP = '>';
136 private static final char REVERSE_RULE_OP = '<';
137 private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
139 private static final String OPERATORS = "=><\u2190\u2192\u2194";
140 private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
142 // Other special characters
143 private static final char QUOTE = '\'';
144 private static final char ESCAPE = '\\';
145 private static final char END_OF_RULE = ';';
146 private static final char RULE_COMMENT_CHAR = '#';
148 private static final char CONTEXT_ANTE = '{'; // ante{key
149 private static final char CONTEXT_POST = '}'; // key}post
150 private static final char CURSOR_POS = '|';
151 private static final char CURSOR_OFFSET = '@';
152 private static final char ANCHOR_START = '^';
154 private static final char KLEENE_STAR = '*';
155 private static final char ONE_OR_MORE = '+';
156 private static final char ZERO_OR_ONE = '?';
158 private static final char DOT = '.';
159 private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";
161 // By definition, the ANCHOR_END special character is a
162 // trailing SymbolTable.SYMBOL_REF character.
163 // private static final char ANCHOR_END = '$';
165 // Segments of the input string are delimited by "(" and ")". In the
166 // output string these segments are referenced as "$1", "$2", etc.
167 private static final char SEGMENT_OPEN = '(';
168 private static final char SEGMENT_CLOSE = ')';
170 // A function is denoted &Source-Target/Variant(text)
171 private static final char FUNCTION = '&';
173 // Aliases for some of the syntax characters. These are provided so
174 // transliteration rules can be expressed in XML without clashing with
175 // XML syntax characters '<', '>', and '&'.
176 private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
177 private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
178 private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
179 private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)
181 // Special characters disallowed at the top level
182 private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
184 // Special characters disallowed within a segment
185 private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
187 // Special characters disallowed within a function argument
188 private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
190 //----------------------------------------------------------------------
192 //----------------------------------------------------------------------
195 * This class implements the SymbolTable interface. It is used
196 * during parsing to give UnicodeSet access to variables that
197 * have been defined so far. Note that it uses variablesVector,
198 * _not_ data.variables.
// Symbol table handed to set and reference parsing while rules are still
// being read. Names are resolved through variableNames; stand-in characters
// are resolved through variablesVector, indexed by
// (ch - curData.variablesBase).
// NOTE(review): several short lines of this class (loop headers, fallback
// returns, closing braces) are not visible in this listing; the comments
// below describe only what the visible lines establish.
200 private class ParseData implements SymbolTable {
203 * Implement SymbolTable API.
// Returns the char[] stored under name in variableNames; Hashtable.get
// yields null when there is no entry for that name.
205 public char[] lookup(String name) {
206 return (char[]) variableNames.get(name);
210 * Implement SymbolTable API.
// Converts a stand-in code point to an index into variablesVector and, when
// the index is in range, returns that element cast to UnicodeMatcher.
// NOTE(review): the out-of-range result is on a line not visible here.
212 public UnicodeMatcher lookupMatcher(int ch) {
213 // Note that we cannot use data.lookup() because the
214 // set array has not been constructed yet.
215 int i = ch - curData.variablesBase;
216 if (i >= 0 && i < variablesVector.size()) {
217 return (UnicodeMatcher) variablesVector.elementAt(i);
223 * Implement SymbolTable API. Parse out a symbol reference
// Scans text starting at pos.getIndex(). A character stops the scan when it
// is not a Unicode identifier part, or when it is the first character and is
// not a Unicode identifier start. The result is text.substring(start, i).
// NOTE(review): the loop header, the branch taken when i == start, and any
// update of pos are on lines not visible here.
226 public String parseReference(String text, ParsePosition pos, int limit) {
227 int start = pos.getIndex();
230 char c = text.charAt(i);
231 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||
232 !UCharacter.isUnicodeIdentifierPart(c)) {
237 if (i == start) { // No valid name chars
241 return text.substring(start, i);
245 * Return true if the given character is a matcher standin or a plain
246 * character (non standin).
// For an in-range stand-in, reports whether the stored object is a
// UnicodeMatcher.
// NOTE(review): the result for characters outside the stand-in range is on
// a line not visible here; presumably true, per the description above.
248 public boolean isMatcher(int ch) {
249 // Note that we cannot use data.lookup() because the
250 // set array has not been constructed yet.
251 int i = ch - curData.variablesBase;
252 if (i >= 0 && i < variablesVector.size()) {
253 return variablesVector.elementAt(i) instanceof UnicodeMatcher;
259 * Return true if the given character is a replacer standin or a plain
260 * character (non standin).
// For an in-range stand-in, reports whether the stored object is a
// UnicodeReplacer.
// NOTE(review): the result for characters outside the stand-in range is on
// a line not visible here; presumably true, per the description above.
262 public boolean isReplacer(int ch) {
263 // Note that we cannot use data.lookup() because the
264 // set array has not been constructed yet.
265 int i = ch - curData.variablesBase;
266 if (i >= 0 && i < variablesVector.size()) {
267 return variablesVector.elementAt(i) instanceof UnicodeReplacer;
273 //----------------------------------------------------------------------
274 // classes RuleBody, RuleArray, and RuleReader
275 //----------------------------------------------------------------------
278 * A private abstract class representing the interface to rule
279 * source code that is broken up into lines. Handles the
280 * folding of lines terminated by a backslash. This folding
281 * is limited; it does not account for comments, quotes, or
282 * escapes, so its use should be limited.
// Line-oriented view of rule source. Subclasses supply raw lines through
// handleNextLine() and rewinding through reset(); this class joins lines
// that end in a backslash.
// NOTE(review): the signature of the folding method, its null checks, and
// the statements that append to the buffer are on lines not visible in this
// listing.
284 private static abstract class RuleBody {
287 * Retrieve the next line of the source, or return null if
288 * none. Folds lines terminated by a backslash into the
289 * next line, without regard for comments, quotes, or
293 String s = handleNextLine();
296 s.charAt(s.length() - 1) == '\\') {
// Folding loop: the trailing backslash is deleted from the buffer, another
// raw line is fetched, and the loop repeats while that fetched line is
// non-empty and itself ends in a backslash.
298 StringBuffer b = new StringBuffer(s);
300 b.deleteCharAt(b.length()-1);
301 s = handleNextLine();
306 } while (s.length() > 0 &&
307 s.charAt(s.length() - 1) == '\\');
315 * Reset to the first line of the source.
317 abstract void reset();
320 * Subclass method to return the next line of the source.
322 abstract String handleNextLine();
326 * RuleBody subclass for a String[] array.
// RuleBody backed by a String[]; the field i is the index of the next
// element to hand out.
// NOTE(review): the field declarations and the body of reset() are on lines
// not visible in this listing.
328 private static class RuleArray extends RuleBody {
// Keeps a reference to the caller's array (no copy) and starts at index 0.
331 public RuleArray(String[] array) { this.array = array; i = 0; }
// Returns the element at i and advances i, or null once i reaches
// array.length.
332 public String handleNextLine() {
333 return (i < array.length) ? array[i++] : null;
335 public void reset() {
341 * RuleBody subclass for a ResourceReader.
343 /* private static class RuleReader extends RuleBody {
344 ResourceReader reader;
345 public RuleReader(ResourceReader reader) { this.reader = reader; }
346 public String handleNextLine() {
348 return reader.readLine();
349 } catch (java.io.IOException e) {}
352 public void reset() {
357 //----------------------------------------------------------------------
359 //----------------------------------------------------------------------
362 * A class representing one side of a rule. This class knows how to
363 * parse half of a rule. It is tightly coupled to the method
364 * TransliteratorParser.parseRule().
// Parse state for one side of a rule: positions of the cursor and of the
// context markers within the parsed text, cursor offset bookkeeping, anchor
// flags, and the number to assign to the next segment.
// NOTE(review): the declaration of the text field, which parse() and
// removeContext() assign, is on a line not visible in this listing.
366 private static class RuleHalf {
// All three positions use -1 to mean that the marker has not been seen.
370 public int cursor = -1; // position of cursor in text
371 public int ante = -1; // position of ante context marker '{' in text
372 public int post = -1; // position of post context marker '}' in text
374 // Record the offset to the cursor either to the left or to the
375 // right of the key. This is indicated by characters on the output
376 // side that allow the cursor to be positioned arbitrarily within
377 // the matching text. For example, abc{def} > | @@@ xyz; changes
378 // def to xyz and moves the cursor to before abc. Offset characters
379 // must be at the start or end, and they cannot move the cursor past
380 // the ante- or postcontext text. Placeholders are only valid in
381 // output text. The length of the ante and post context is
382 // determined at runtime, because of supplementals and quantifiers.
383 public int cursorOffset = 0; // only nonzero on output side
385 // Position of first CURSOR_OFFSET on _right_. This will be -1
386 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
387 private int cursorOffsetPos = 0;
// Anchor flags; removeContext() resets both to false.
389 public boolean anchorStart = false;
390 public boolean anchorEnd = false;
393 * The segment number from 1..n of the next '(' we see
394 * during parsing; 1-based.
396 private int nextSegmentNumber = 1;
399 * Parse one side of a rule, stopping at either the limit,
400 * the END_OF_RULE character, or an operator.
401 * @return the index after the terminating character, or
402 * if limit was reached, limit
// Parses one complete rule half. The work is delegated to parseSection with
// the top-level illegal set (ILLEGAL_TOP) and isSegment == false; the
// accumulated buffer then becomes this half's text.
// NOTE(review): start is declared, and the final position is returned, on
// lines not visible in this listing.
404 public int parse(String rule, int pos, int limit,
405 TransliteratorParser parser) {
407 StringBuffer buf = new StringBuffer();
408 pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
409 text = buf.toString();
// A positive cursor offset is accepted only when the cursor position equals
// the recorded cursorOffsetPos; anything else is reported as a misplaced
// cursor.
411 if (cursorOffset > 0 && cursor != cursorOffsetPos) {
412 syntaxError("Misplaced " + CURSOR_POS, rule, start);
419 * Parse a section of one side of a rule, stopping at either
420 * the limit, the END_OF_RULE character, an operator, or a
421 * segment close character. This method parses both a
422 * top-level rule half and a segment within such a rule half.
423 * It calls itself recursively to parse segments and nested
425 * @param buf buffer into which to accumulate the rule pattern
426 * characters, either literal characters from the rule or
427 * standins for UnicodeMatcher objects including segments.
428 * @param illegal the set of special characters that is illegal during
430 * @param isSegment if true, then we've already seen a '(' and
431 * pos on entry points right after it. Accumulate everything
432 * up to the closing ')', put it in a segment matcher object,
433 * generate a standin for it, and add the standin to buf. As
434 * a side effect, update the segments vector with a reference
435 * to the segment matcher. This works recursively for nested
436 * segments. If isSegment is false, just accumulate
437 * characters into buf.
438 * @return the index after the terminating character, or
439 * if limit was reached, limit
// Core scanner for one rule half and, through recursion, for the inside of
// a segment or a function argument. It reads rule one 16-bit unit at a time
// from pos up to limit and appends literal characters or stand-in
// characters to buf. The visible handling covers set patterns
// (parser.parseSet), backslash escapes (Utility.unescapeAt), quoted runs,
// the start anchor, nested segments (StringMatcher), function calls
// (FunctionReplacer), variable and segment references, the dot stand-in,
// quantifiers (Quantifier), context markers, and cursor markers.
// NOTE(review): the trailing parameters (buf, illegal, isSegment), most
// case labels, and many short statements such as break, continue, return
// and closing braces are on lines not visible in this listing, so the exact
// branch structure should be confirmed against the complete file.
441 private int parseSection(String rule, int pos, int limit,
442 TransliteratorParser parser,
447 ParsePosition pp = null;
448 int quoteStart = -1; // Most recent 'single quoted string'
450 int varStart = -1; // Most recent $variableReference
452 int[] iref = new int[1];
// bufStart remembers where this section begins in buf; the quantifier check
// further down compares against it to detect a quantifier that directly
// follows the segment opener.
453 int bufStart = buf.length();
456 while (pos < limit) {
457 // Since all syntax characters are in the BMP, fetching
458 // 16-bit code units suffices here.
459 char c = rule.charAt(pos++);
460 if (UCharacterProperty.isRuleWhiteSpace(c)) {
463 // HALF_ENDERS is all chars that end a rule half: "<>=;"
464 if (HALF_ENDERS.indexOf(c) >= 0) {
466 syntaxError("Unclosed segment", rule, start);
471 // Text after a presumed end anchor is a syntax err
472 syntaxError("Malformed variable reference", rule, start);
474 if (UnicodeSet.resemblesPattern(rule, pos-1)) {
476 pp = new ParsePosition(0);
478 pp.setIndex(pos-1); // Backup to opening '['
479 buf.append(parser.parseSet(rule, pp));
486 syntaxError("Trailing backslash", rule, start);
489 int escaped = Utility.unescapeAt(rule, iref);
492 syntaxError("Malformed escape", rule, start);
494 parser.checkVariableRange(escaped, rule, start);
495 UTF16.append(buf, escaped);
498 // Handle quoted matter
500 int iq = rule.indexOf(QUOTE, pos);
502 buf.append(c); // Parse [''] outside quotes as [']
505 /* This loop picks up a run of quoted text of the
506 * form 'aaaa' each time through. If this run
507 * hasn't really ended ('aaaa''bbbb') then it keeps
508 * looping, each time adding on a new run. When it
509 * reaches the final quote it breaks.
511 quoteStart = buf.length();
514 syntaxError("Unterminated quote", rule, start);
516 buf.append(rule.substring(pos, iq));
518 if (pos < limit && rule.charAt(pos) == QUOTE) {
519 // Parse [''] inside quotes as [']
520 iq = rule.indexOf(QUOTE, pos+1);
526 quoteLimit = buf.length();
528 for (iq=quoteStart; iq<quoteLimit; ++iq) {
529 parser.checkVariableRange(buf.charAt(iq), rule, start);
535 parser.checkVariableRange(c, rule, start);
537 if (illegal.contains(c)) {
538 syntaxError("Illegal character '" + c + '\'', rule, start);
543 //------------------------------------------------------
544 // Elements allowed within and out of segments
545 //------------------------------------------------------
547 if (buf.length() == 0 && !anchorStart) {
550 syntaxError("Misplaced anchor start",
556 // bufSegStart is the offset in buf to the first
557 // character of the segment we are parsing.
558 int bufSegStart = buf.length();
560 // Record segment number now, since nextSegmentNumber
561 // will be incremented during the call to parseSection
562 // if there are nested segments.
563 int segmentNumber = nextSegmentNumber++; // 1-based
566 pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
568 // After parsing a segment, the relevant characters are
569 // in buf, starting at offset bufSegStart. Extract them
570 // into a string matcher, and replace them with a
571 // standin for that matcher.
573 new StringMatcher(buf.substring(bufSegStart),
574 segmentNumber, parser.curData);
576 // Record and associate object and segment number
577 parser.setSegmentObject(segmentNumber, m);
578 buf.setLength(bufSegStart);
579 buf.append(parser.getSegmentStandin(segmentNumber));
586 TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
587 // The next character MUST be a segment open
588 if (single == null ||
589 !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
590 syntaxError("Invalid function", rule, start);
593 Transliterator t = single.getInstance();
595 syntaxError("Invalid function ID", rule, start);
598 // bufSegStart is the offset in buf to the first
599 // character of the segment we are parsing.
600 int bufSegStart = buf.length();
603 pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
605 // After parsing a segment, the relevant characters are
606 // in buf, starting at offset bufSegStart.
608 new FunctionReplacer(t,
609 new StringReplacer(buf.substring(bufSegStart), parser.curData));
611 // Replace the buffer contents with a stand-in
612 buf.setLength(bufSegStart);
613 buf.append(parser.generateStandInFor(r));
616 case SymbolTable.SYMBOL_REF:
617 // Handle variable references and segment references "$1" .. "$9"
619 // A variable reference must be followed immediately
620 // by a Unicode identifier start and zero or more
621 // Unicode identifier part characters, or by a digit
622 // 1..9 if it is a segment reference.
624 // A variable ref character at the end acts as
625 // an anchor to the context limit, as in perl.
629 // Parse "$1" "$2" .. "$9" .. (no upper limit)
630 c = rule.charAt(pos);
631 int r = UCharacter.digit(c, 10);
632 if (r >= 1 && r <= 9) {
634 r = Utility.parseNumber(rule, iref, 10);
636 syntaxError("Undefined segment reference",
640 buf.append(parser.getSegmentStandin(r));
642 if (pp == null) { // Lazy create
643 pp = new ParsePosition(0);
646 String name = parser.parseData.
647 parseReference(rule, pp, limit);
649 // This means the '$' was not followed by a
650 // valid name. Try to interpret it as an
651 // end anchor then. If this also doesn't work
652 // (if we see a following character) then signal
658 // If this is a variable definition statement,
659 // then the LHS variable will be undefined. In
660 // that case appendVariableDef() will append the
661 // special placeholder char variableLimit-1.
662 varStart = buf.length();
663 parser.appendVariableDef(name, buf);
664 varLimit = buf.length();
669 buf.append(parser.getDotStandIn());
674 // Quantifiers. We handle single characters, quoted strings,
675 // variable references, and segments.
677 // 'foo'+ matches foofoofoo
678 // $v+ matches xyxyxy if $v == xy
679 // (seg)+ matches segsegseg
681 if (isSegment && buf.length() == bufStart) {
682 // The */+ immediately follows '('
683 syntaxError("Misplaced quantifier", rule, start);
688 // The */+ follows an isolated character or quote
689 // or variable reference
690 if (buf.length() == quoteLimit) {
691 // The */+ follows a 'quoted string'
694 } else if (buf.length() == varLimit) {
695 // The */+ follows a $variableReference
699 // The */+ follows a single character, possibly
701 qstart = buf.length() - 1;
707 m = new StringMatcher(buf.toString(), qstart, qlimit,
709 } catch (RuntimeException e) {
// The failure is rethrown as an IllegalIcuArgumentException whose message
// quotes at most 50 characters of the rule on each side of pos, with an
// ellipsis where the text was truncated.
710 final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
711 final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
712 throw (RuntimeException)
713 new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
715 //#if defined(FOUNDATION10) || defined(J2SE13)
722 int max = Quantifier.MAX;
732 // do nothing -- min, max already set
// The quantified span of buf, from qstart onward, is replaced by a single
// stand-in for the new Quantifier.
734 m = new Quantifier(m, min, max);
735 buf.setLength(qstart);
736 buf.append(parser.generateStandInFor(m));
740 //------------------------------------------------------
741 // Elements allowed ONLY WITHIN segments
742 //------------------------------------------------------
744 // assert(isSegment);
745 // We're done parsing a segment.
748 //------------------------------------------------------
749 // Elements allowed ONLY OUTSIDE segments
750 //------------------------------------------------------
753 syntaxError("Multiple ante contexts", rule, start);
759 syntaxError("Multiple post contexts", rule, start);
765 syntaxError("Multiple cursors", rule, start);
767 cursor = buf.length();
770 if (cursorOffset < 0) {
771 if (buf.length() > 0) {
772 syntaxError("Misplaced " + c, rule, start);
775 } else if (cursorOffset > 0) {
776 if (buf.length() != cursorOffsetPos || cursor >= 0) {
777 syntaxError("Misplaced " + c, rule, start);
781 if (cursor == 0 && buf.length() == 0) {
783 } else if (cursor < 0) {
784 cursorOffsetPos = buf.length();
787 syntaxError("Misplaced " + c, rule, start);
792 //------------------------------------------------------
793 // Non-special characters
794 //------------------------------------------------------
796 // Disallow unquoted characters other than [0-9A-Za-z]
797 // in the printable ASCII range. These characters are
798 // reserved for possible future use.
799 if (c >= 0x0021 && c <= 0x007E &&
800 !((c >= '0' && c <= '9') ||
801 (c >= 'A' && c <= 'Z') ||
802 (c >= 'a' && c <= 'z'))) {
803 syntaxError("Unquoted " + c, rule, start);
// Reduces text to the span between the ante and post context markers; a
// marker that was never seen (negative position) defaults to the start or
// the end of text respectively. Both anchor flags are then cleared.
// NOTE(review): any reset of ante and post themselves is on a line not
// visible in this listing.
815 void removeContext() {
816 text = text.substring(ante < 0 ? 0 : ante,
817 post < 0 ? text.length() : post);
819 anchorStart = anchorEnd = false;
823 * Return true if this half looks like valid output, that is, does not
824 * contain quantifiers or other special input-only elements.
// Walks text one code point at a time (UTF16.charAt plus
// UTF16.getCharCount, so supplementary characters advance by two units) and
// tests each one with parser.parseData.isReplacer.
// NOTE(review): the return statements are on lines not visible in this
// listing; presumably false on the first failing code point and true
// otherwise.
826 public boolean isValidOutput(TransliteratorParser parser) {
827 for (int i=0; i<text.length(); ) {
828 int c = UTF16.charAt(text, i);
829 i += UTF16.getCharCount(c);
830 if (!parser.parseData.isReplacer(c)) {
838 * Return true if this half looks like valid input, that is, does not
839 * contain functions or other special output-only elements.
// Walks text one code point at a time (UTF16.charAt plus
// UTF16.getCharCount, so supplementary characters advance by two units) and
// tests each one with parser.parseData.isMatcher.
// NOTE(review): the return statements are on lines not visible in this
// listing; presumably false on the first failing code point and true
// otherwise.
841 public boolean isValidInput(TransliteratorParser parser) {
842 for (int i=0; i<text.length(); ) {
843 int c = UTF16.charAt(text, i);
844 i += UTF16.getCharCount(c);
845 if (!parser.parseData.isMatcher(c)) {
853 //----------------------------------------------------------------------
855 //----------------------------------------------------------------------
// Public no-argument constructor.
// NOTE(review): no body statements are visible in this listing; the parse
// state fields are assigned at the start of parseRules.
860 public TransliteratorParser() {
864 * Parse a set of rules. After the parse completes, examine the public
865 * data members for results.
// Convenience entry point: wraps the whole rule text in a one-element
// RuleArray and passes it to parseRules together with dir.
867 public void parse(String rules, int dir) {
868 parseRules(new RuleArray(new String[] { rules }), dir);
872 * Parse a set of rules. After the parse completes, examine the public
873 * data members for results.
875 /* public void parse(ResourceReader rules, int direction) {
876 parseRules(new RuleReader(rules), direction);
879 //----------------------------------------------------------------------
881 //----------------------------------------------------------------------
884 * Parse an array of zero or more rules. The strings in the array are
885 * treated as if they were concatenated together, with rule terminators
886 * inserted between array elements if not present already.
888 * Any previous rules are discarded. Typically this method is called exactly
889 * once, during construction.
891 * The member this.data will be set to null if there are no rules.
893 * @exception IllegalIcuArgumentException if there is a syntax error in the
// Parses every line supplied by ruleArray and fills the result members
// dataVector, idBlockVector and compoundFilter.
// Outline of the visible logic:
//  - parse state is reset (fresh vectors, name table and ParseData);
//  - each line is scanned, skipping rule white space and comment text;
//  - a statement starting with the ID token is tried as a single ID and
//    then as a global filter; other statements go to parsePragma or
//    parseRule;
//  - an IllegalArgumentException raised by a statement is collected in
//    errors and scanning resumes after the end of that rule;
//  - afterwards the pending ID block or rule data is stored, the shared
//    variables are copied into every Data object, the compound filter
//    position is validated, rule sets are frozen, and collected errors are
//    chained and thrown.
// Results are appended when direction is FORWARD; the visible
// insertElementAt(x, 0) calls are presumably the reverse-direction branch.
// NOTE(review): many short lines, including try and else keywords, counter
// updates and closing braces, are not visible in this listing.
896 void parseRules(RuleBody ruleArray, int dir) {
897 boolean parsingIDs = true;
900 dataVector = new Vector();
901 idBlockVector = new Vector();
904 compoundFilter = null;
905 variablesVector = new Vector();
906 variableNames = new Hashtable();
907 parseData = new ParseData();
909 List errors = new ArrayList();
914 StringBuffer idBlockResult = new StringBuffer();
916 // The compound filter offset is an index into idBlockResult.
917 // If it is 0, then the compound filter occurred at the start,
918 // and it is the offset to the _start_ of the compound filter
919 // pattern. Otherwise it is the offset to the _limit_ of the
920 // compound filter pattern within idBlockResult.
921 this.compoundFilter = null;
922 int compoundFilterOffset = -1;
926 String rule = ruleArray.nextLine();
931 int limit = rule.length();
932 while (pos < limit) {
933 char c = rule.charAt(pos++);
934 if (UCharacterProperty.isRuleWhiteSpace(c)) {
937 // Skip lines starting with the comment character
938 if (c == RULE_COMMENT_CHAR) {
// indexOf returns -1 when no newline remains, so the sum below is 0 in that
// case.
939 pos = rule.indexOf("\n", pos) + 1;
941 break; // No "\n" found; rest of rule is a comment
943 continue; // Either fall out or restart with next line
947 if (c == END_OF_RULE)
950 // Often a rule file contains multiple errors. It's
951 // convenient to the rule author if these are all reported
952 // at once. We keep parsing rules even after a failure, up
953 // to a specified limit, and report all errors at once.
957 // We've found the start of a rule or ID. c is its first
958 // character, and pos points past c.
960 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
962 if ((pos + ID_TOKEN_LEN + 1) <= limit &&
963 rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
// NOTE(review): in the white-space skip below, c is tested before
// pos < limit, and rule.charAt(pos) is evaluated again inside the loop
// (presumably after pos is advanced on a line not visible here). If white
// space runs to the end of the rule this looks like it could index past the
// end of the string; confirm against the complete file.
965 c = rule.charAt(pos);
966 while (UCharacterProperty.isRuleWhiteSpace(c) && pos < limit) {
968 c = rule.charAt(pos);
970 int[] p = new int[] { pos };
973 if (curData != null) {
974 if (direction == Transliterator.FORWARD)
975 dataVector.add(curData);
977 dataVector.insertElementAt(curData, 0);
983 TransliteratorIDParser.SingleID id =
984 TransliteratorIDParser.parseSingleID(
// Success requires that the ID parser consumed something (p[0] moved) and
// that the next character is the rule terminator.
986 if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
987 // Successful ::ID parse.
989 if (direction == Transliterator.FORWARD) {
990 idBlockResult.append(id.canonID).append(END_OF_RULE);
992 idBlockResult.insert(0, id.canonID + END_OF_RULE);
996 // Couldn't parse an ID. Try to parse a global filter
997 int[] withParens = new int[] { -1 };
998 UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
999 if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
1000 if ((direction == Transliterator.FORWARD) ==
1001 (withParens[0] == 0)) {
1002 if (compoundFilter != null) {
1003 // Multiple compound filters
1004 syntaxError("Multiple global filters", rule, pos);
1007 compoundFilterOffset = ruleCount;
1011 // Can be parsed as neither an ID nor a global filter
1012 syntaxError("Invalid ::ID", rule, pos);
1019 if (direction == Transliterator.FORWARD)
1020 idBlockVector.add(idBlockResult.toString());
1022 idBlockVector.insertElementAt(idBlockResult.toString(), 0);
1023 idBlockResult.delete(0, idBlockResult.length());
1025 curData = new RuleBasedTransliterator.Data();
1027 // By default, rules use part of the private use area
1028 // E000..F8FF for variables and other stand-ins. Currently
1029 // the range F000..F8FF is typically sufficient. The 'use
1030 // variable range' pragma allows rule sets to modify this.
1031 setVariableRange(0xF000, 0xF8FF);
1034 if (resemblesPragma(rule, pos, limit)) {
1035 int ppp = parsePragma(rule, pos, limit);
1037 syntaxError("Unrecognized pragma", rule, pos);
1042 pos = parseRule(rule, pos, limit);
1045 } catch (IllegalArgumentException e) {
// When errorCount reaches 30 a single notice is recorded in place of
// further individual messages.
1046 if (errorCount == 30) {
1047 errors.add(new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched")
1048 //#if defined(FOUNDATION10) || defined(J2SE13)
1055 e.fillInStackTrace();
// Resynchronize at the statement after the one that failed.
1058 pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1062 if (parsingIDs && idBlockResult.length() > 0) {
1063 if (direction == Transliterator.FORWARD)
1064 idBlockVector.add(idBlockResult.toString());
1066 idBlockVector.insertElementAt(idBlockResult.toString(), 0);
1068 else if (!parsingIDs && curData != null) {
1069 if (direction == Transliterator.FORWARD)
1070 dataVector.add(curData);
1072 dataVector.insertElementAt(curData, 0);
1075 // Convert the set vector to an array
// Every Data object receives its own array copy of the shared variables
// vector and its own copy of the variable name table.
1076 for (int i = 0; i < dataVector.size(); i++) {
1077 RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);
1078 data.variables = new Object[variablesVector.size()];
1079 variablesVector.copyInto(data.variables);
1080 data.variableNames = new Hashtable();
1081 data.variableNames.putAll(variableNames);
1083 variablesVector = null;
1085 // Do more syntax checking and index the rules
// A compound filter is accepted only when its recorded offset is 1 for
// FORWARD, or equals ruleCount for REVERSE.
1087 if (compoundFilter != null) {
1088 if ((direction == Transliterator.FORWARD &&
1089 compoundFilterOffset != 1) ||
1090 (direction == Transliterator.REVERSE &&
1091 compoundFilterOffset != ruleCount)) {
1092 throw new IllegalIcuArgumentException("Compound filters misplaced");
1096 for (int i = 0; i < dataVector.size(); i++) {
1097 RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);
1098 data.ruleSet.freeze();
// A lone empty ID block carries no information and is dropped.
1101 if (idBlockVector.size() == 1 && ((String)idBlockVector.get(0)).length() == 0)
1102 idBlockVector.remove(0);
1104 } catch (IllegalArgumentException e) {
1105 e.fillInStackTrace();
1109 if (errors.size() != 0) {
1110 //#if defined(FOUNDATION10) || defined(J2SE13)
// Walking from the last error to the first, each error is attached as the
// cause at the end of the preceding error's cause chain, so throwing the
// first error carries all of them.
// NOTE(review): the cast of getCause() to RuntimeException assumes no
// collected error has a checked or non-runtime cause; confirm.
1112 for (int i = errors.size()-1; i > 0; --i) {
1113 RuntimeException previous = (RuntimeException) errors.get(i-1);
1114 while (previous.getCause() != null) {
1115 previous = (RuntimeException) previous.getCause(); // chain specially
1117 previous.initCause((RuntimeException) errors.get(i));
1120 throw (RuntimeException) errors.get(0);
1121 // if initCause not supported: throw new IllegalArgumentException(errors.toString());
1126 * MAIN PARSER. Parse the next rule in the given rule string, starting
1127 * at pos. Return the index after the last character parsed. Do not
1128 * parse characters at or after limit.
1130 * Important: The character at pos must be a non-whitespace character
1131 * that is not the comment character.
1133 * This method handles quoting, escaping, and whitespace removal. It
1134 * parses the end-of-rule character. It recognizes context and cursor
1135 * indicators. Once it does a lexical breakdown of the rule at pos, it
1136 * creates a rule object and adds it to our rule list.
1138 * This method is tightly coupled to the inner class RuleHalf.
1140 private int parseRule(String rule, int pos, int limit) {
1141 // Locate the left side, operator, and right side
1145 // Set up segments data
1146 segmentStandins = new StringBuffer();
1147 segmentObjects = new Vector();
1149 RuleHalf left = new RuleHalf();
1150 RuleHalf right = new RuleHalf();
1152 undefinedVariableName = null;
1153 pos = left.parse(rule, pos, limit, this);
1156 OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1157 syntaxError("No operator pos=" + pos, rule, start);
1161 // Found an operator char. Check for forward-reverse operator.
1162 if (operator == REVERSE_RULE_OP &&
1163 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1165 operator = FWDREV_RULE_OP;
1168 // Translate alternate op characters.
1170 case ALT_FORWARD_RULE_OP:
1171 operator = FORWARD_RULE_OP;
1173 case ALT_REVERSE_RULE_OP:
1174 operator = REVERSE_RULE_OP;
1176 case ALT_FWDREV_RULE_OP:
1177 operator = FWDREV_RULE_OP;
1181 pos = right.parse(rule, pos, limit, this);
1184 if (rule.charAt(--pos) == END_OF_RULE) {
1187 // RuleHalf parser must have terminated at an operator
1188 syntaxError("Unquoted operator", rule, start);
1192 if (operator == VARIABLE_DEF_OP) {
1193 // LHS is the name. RHS is a single character, either a literal
1194 // or a set (already parsed). If RHS is longer than one
1195 // character, it is either a multi-character string, or multiple
1196 // sets, or a mixture of chars and sets -- syntax error.
1198 // We expect to see a single undefined variable (the one being
1200 if (undefinedVariableName == null) {
1201 syntaxError("Missing '$' or duplicate definition", rule, start);
1203 if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
1204 syntaxError("Malformed LHS", rule, start);
1206 if (left.anchorStart || left.anchorEnd ||
1207 right.anchorStart || right.anchorEnd) {
1208 syntaxError("Malformed variable def", rule, start);
1210 // We allow anything on the right, including an empty string.
1211 int n = right.text.length();
1212 char[] value = new char[n];
1213 right.text.getChars(0, n, value, 0);
1214 variableNames.put(undefinedVariableName, value);
1220 // If this is not a variable definition rule, we shouldn't have
1221 // any undefined variable names.
1222 if (undefinedVariableName != null) {
1223 syntaxError("Undefined variable $" + undefinedVariableName,
1228 if (segmentStandins.length() > segmentObjects.size()) {
1229 syntaxError("Undefined segment reference", rule, start);
1231 for (int i=0; i<segmentStandins.length(); ++i) {
1232 if (segmentStandins.charAt(i) == 0) {
1233 syntaxError("Internal error", rule, start); // will never happen
1236 for (int i=0; i<segmentObjects.size(); ++i) {
1237 if (segmentObjects.elementAt(i) == null) {
1238 syntaxError("Internal error", rule, start); // will never happen
1242 // If the direction we want doesn't match the rule
1243 // direction, do nothing.
1244 if (operator != FWDREV_RULE_OP &&
1245 ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1249 // Transform the rule into a forward rule by swapping the
1250 // sides if necessary.
1251 if (direction == Transliterator.REVERSE) {
1252 RuleHalf temp = left;
1257 // Remove non-applicable elements in forward-reverse
1258 // rules. Bidirectional rules ignore elements that do not
1260 if (operator == FWDREV_RULE_OP) {
1261 right.removeContext();
1263 left.cursorOffset = 0;
1266 // Normalize context
1267 if (left.ante < 0) {
1270 if (left.post < 0) {
1271 left.post = left.text.length();
1274 // Context is only allowed on the input side. Cursors are only
1275 // allowed on the output side. Segment delimiters can only appear
1276 // on the left, and references on the right. Cursor offset
1277 // cannot appear without an explicit cursor. Cursor offset
1278 // cannot place the cursor outside the limits of the context.
1279 // Anchors are only allowed on the input side.
1280 if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
1281 (right.cursorOffset != 0 && right.cursor < 0) ||
1282 // - The following two checks were used to ensure that the
1283 // - the cursor offset stayed within the ante- or postcontext.
1284 // - However, with the addition of quantifiers, we have to
1285 // - allow arbitrary cursor offsets and do runtime checking.
1286 //(right.cursorOffset > (left.text.length() - left.post)) ||
1287 //(-right.cursorOffset > left.ante) ||
1288 right.anchorStart || right.anchorEnd ||
1289 !left.isValidInput(this) || !right.isValidOutput(this) ||
1290 left.ante > left.post) {
1291 syntaxError("Malformed rule", rule, start);
1294 // Flatten segment objects vector to an array
1295 UnicodeMatcher[] segmentsArray = null;
1296 if (segmentObjects.size() > 0) {
1297 segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1298 segmentObjects.toArray(segmentsArray);
1301 curData.ruleSet.addRule(new TransliterationRule(
1302 left.text, left.ante, left.post,
1303 right.text, right.cursor, right.cursorOffset,
1305 left.anchorStart, left.anchorEnd,
1312 * Set the variable range to [start, end] (inclusive).
1314 private void setVariableRange(int start, int end) {
1315 if (start > end || start < 0 || end > 0xFFFF) {
1316 throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
1319 curData.variablesBase = (char) start; // first private use
1321 if (dataVector.size() == 0) {
1322 variableNext = (char) start;
1323 variableLimit = (char) (end + 1);
1328 * Assert that the given character is NOT within the variable range.
1329 * If it is, signal an error. This is neccesary to ensure that the
1330 * variable range does not overlap characters used in a rule.
1332 private void checkVariableRange(int ch, String rule, int start) {
1333 if (ch >= curData.variablesBase && ch < variableLimit) {
1334 syntaxError("Variable range character in rule", rule, start);
1338 // (The following method is part of an unimplemented feature.
1339 // Remove this clover pragma after the feature is implemented.
1340 // 2003-06-11 ICU 2.6 Alan)
1343 * Set the maximum backup to 'backup', in response to a pragma
1346 private void pragmaMaximumBackup(int backup) {
1348 throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
1352 // (The following method is part of an unimplemented feature.
1353 // Remove this clover pragma after the feature is implemented.
1354 // 2003-06-11 ICU 2.6 Alan)
1357 * Begin normalizing all rules using the given mode, in response
1358 * to a pragma statement.
1360 private void pragmaNormalizeRules(Normalizer.Mode mode) {
1362 throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
1367 * Return true if the given rule looks like a pragma.
1368 * @param pos offset to the first non-whitespace character
1370 * @param limit pointer past the last character of the rule.
1372 static boolean resemblesPragma(String rule, int pos, int limit) {
1373 // Must start with /use\s/i
1374 return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1378 * Parse a pragma. This method assumes resemblesPragma() has
1379 * already returned true.
1380 * @param pos offset to the first non-whitespace character
1382 * @param limit pointer past the last character of the rule.
1383 * @return the position index after the final ';' of the pragma,
1386 private int parsePragma(String rule, int pos, int limit) {
1387 int[] array = new int[2];
1389 // resemblesPragma() has already returned true, so we
1390 // know that pos points to /use\s/i; we can skip 4 characters
1394 // Here are the pragmas we recognize:
1395 // use variable range 0xE000 0xEFFF;
1396 // use maximum backup 16;
1398 int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
1400 setVariableRange(array[0], array[1]);
1404 p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
1406 pragmaMaximumBackup(array[0]);
1410 p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
1412 pragmaNormalizeRules(Normalizer.NFD);
1416 p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
1418 pragmaNormalizeRules(Normalizer.NFC);
1422 // Syntax error: unable to parse pragma
1427 * Throw an exception indicating a syntax error. Search the rule string
1428 * for the probable end of the rule. Of course, if the error is that
1429 * the end of rule marker is missing, then the rule end will not be found.
1430 * In any case the rule start will be correctly reported.
1431 * @param msg error description
1432 * @param rule pattern string
1433 * @param start position of first character of current rule
1435 static final void syntaxError(String msg, String rule, int start) {
1436 int end = ruleEnd(rule, start, rule.length());
1437 throw new IllegalIcuArgumentException(msg + " in \"" +
1438 Utility.escape(rule.substring(start, end)) + '"');
1441 static final int ruleEnd(String rule, int start, int limit) {
1442 int end = Utility.quotedIndexOf(rule, start, limit, ";");
1450 * Parse a UnicodeSet out, store it, and return the stand-in character
1451 * used to represent it.
1453 private final char parseSet(String rule, ParsePosition pos) {
1454 UnicodeSet set = new UnicodeSet(rule, pos, parseData);
1455 if (variableNext >= variableLimit) {
1456 throw new RuntimeException("Private use variables exhausted");
1459 return generateStandInFor(set);
1463 * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
1466 char generateStandInFor(Object obj) {
1467 // assert(obj != null);
1469 // Look up previous stand-in, if any. This is a short list
1470 // (typical n is 0, 1, or 2); linear search is optimal.
1471 for (int i=0; i<variablesVector.size(); ++i) {
1472 if (variablesVector.elementAt(i) == obj) { // [sic] pointer comparison
1473 return (char) (curData.variablesBase + i);
1477 if (variableNext >= variableLimit) {
1478 throw new RuntimeException("Variable range exhausted");
1480 variablesVector.addElement(obj);
1481 return variableNext++;
1485 * Return the standin for segment seg (1-based).
1487 public char getSegmentStandin(int seg) {
1488 if (segmentStandins.length() < seg) {
1489 segmentStandins.setLength(seg);
1491 char c = segmentStandins.charAt(seg-1);
1493 if (variableNext >= variableLimit) {
1494 throw new RuntimeException("Variable range exhausted");
1497 // Set a placeholder in the master variables vector that will be
1498 // filled in later by setSegmentObject(). We know that we will get
1499 // called first because setSegmentObject() will call us.
1500 variablesVector.addElement(null);
1501 segmentStandins.setCharAt(seg-1, c);
1507 * Set the object for segment seg (1-based).
1509 public void setSegmentObject(int seg, StringMatcher obj) {
1510 // Since we call parseSection() recursively, nested
1511 // segments will result in segment i+1 getting parsed
1512 // and stored before segment i; be careful with the
1513 // vector handling here.
1514 if (segmentObjects.size() < seg) {
1515 segmentObjects.setSize(seg);
1517 int index = getSegmentStandin(seg) - curData.variablesBase;
1518 if (segmentObjects.elementAt(seg-1) != null ||
1519 variablesVector.elementAt(index) != null) {
1520 throw new RuntimeException(); // should never happen
1522 segmentObjects.setElementAt(obj, seg-1);
1523 variablesVector.setElementAt(obj, index);
1527 * Return the stand-in for the dot set. It is allocated the first
1528 * time and reused thereafter.
1530 char getDotStandIn() {
1531 if (dotStandIn == -1) {
1532 dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1534 return (char) dotStandIn;
1538 * Append the value of the given variable name to the given
1540 * @exception IllegalIcuArgumentException if the name is unknown.
1542 private void appendVariableDef(String name, StringBuffer buf) {
1543 char[] ch = (char[]) variableNames.get(name);
1545 // We allow one undefined variable so that variable definition
1546 // statements work. For the first undefined variable we return
1547 // the special placeholder variableLimit-1, and save the variable
1549 if (undefinedVariableName == null) {
1550 undefinedVariableName = name;
1551 if (variableNext >= variableLimit) {
1552 throw new RuntimeException("Private use variables exhausted");
1554 buf.append((char) --variableLimit);
1556 throw new IllegalIcuArgumentException("Undefined variable $"