-//##header\r
-/*\r
-**********************************************************************\r
-* Copyright (c) 2001-2009, International Business Machines\r
-* Corporation and others. All Rights Reserved.\r
-**********************************************************************\r
-*/\r
-package com.ibm.icu.text;\r
-\r
-import com.ibm.icu.impl.IllegalIcuArgumentException;\r
-import com.ibm.icu.impl.Utility;\r
-\r
-import java.util.ArrayList;\r
-import java.util.List;\r
-import java.util.Vector;\r
-import java.util.Hashtable;\r
-import java.text.ParsePosition;\r
-import com.ibm.icu.lang.*;\r
-import com.ibm.icu.impl.UCharacterProperty;\r
-\r
-class TransliteratorParser {\r
-\r
- //----------------------------------------------------------------------\r
- // Data members\r
- //----------------------------------------------------------------------\r
-\r
/**
 * PUBLIC data member.
 * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
 * of rules in the rule set. It is recreated at the start of every
 * parseRules() call.
 */
public Vector dataVector;

/**
 * PUBLIC data member.
 * A Vector of Strings containing all of the ID blocks in the rule set.
 * It is recreated at the start of every parseRules() call.
 */
public Vector idBlockVector;

/**
 * The current data object for which we are parsing rules. This is null
 * while "::ID" blocks are being parsed and no rule group is open.
 */
private RuleBasedTransliterator.Data curData;

/**
 * PUBLIC data member containing the parsed compound filter, if any.
 */
public UnicodeSet compoundFilter;


/**
 * The direction passed to parseRules(). It is compared against
 * Transliterator.FORWARD to decide whether parsed pieces are appended
 * or inserted at the front of the result vectors.
 */
private int direction;

/**
 * Temporary symbol table used during parsing.
 */
private ParseData parseData;

/**
 * Temporary vector of set variables.  When parsing is complete, this
 * is copied into the array data.variables.  As with data.variables,
 * element 0 corresponds to character data.variablesBase.
 */
private Vector variablesVector;

/**
 * Temporary table of variable names.  When parsing is complete, this is
 * copied into data.variableNames. Values are read back as char[] by
 * ParseData.lookup().
 */
private Hashtable variableNames;

/**
 * String of standins for segments.  Used during the parsing of a single
 * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
 * to StringMatcher object segmentObjects.elementAt(0), etc.
 */
private StringBuffer segmentStandins;

/**
 * Vector of StringMatcher objects for segments.  Used during the
 * parsing of a single rule.
 * segmentStandins.charAt(0) is the standin for "$1" and corresponds
 * to StringMatcher object segmentObjects.elementAt(0), etc.
 */
private Vector segmentObjects;

/**
 * The next available stand-in for variables.  This starts at some point in
 * the private use area (discovered dynamically) and increments up toward
 * <code>variableLimit</code>.  At any point during parsing, available
 * variables are <code>variableNext..variableLimit-1</code>.
 */
private char variableNext;

/**
 * The last available stand-in for variables.  This is discovered
 * dynamically.  At any point during parsing, available variables are
 * <code>variableNext..variableLimit-1</code>.  During variable definition
 * we use the special value variableLimit-1 as a placeholder.
 */
private char variableLimit;

/**
 * When we encounter an undefined variable, we do not immediately signal
 * an error, in case we are defining this variable, e.g., "$a = [a-z];".
 * Instead, we save the name of the undefined variable, and substitute
 * in the placeholder char variableLimit - 1, and decrement
 * variableLimit.
 */
private String undefinedVariableName;

/**
 * The stand-in character for the 'dot' set, represented by '.' in
 * patterns.  This is allocated the first time it is needed, and
 * reused thereafter. The value -1 means "not allocated yet".
 */
private int dotStandIn = -1;
-\r
- //----------------------------------------------------------------------\r
- // Constants\r
- //----------------------------------------------------------------------\r
-\r
- // Indicator for ID blocks\r
- private static final String ID_TOKEN = "::";\r
- private static final int ID_TOKEN_LEN = 2;\r
-\r
-/*\r
-(reserved for future expansion)\r
- // markers for beginning and end of rule groups\r
- private static final String BEGIN_TOKEN = "BEGIN";\r
- private static final String END_TOKEN = "END";\r
-*/\r
-\r
- // Operators\r
- private static final char VARIABLE_DEF_OP = '=';\r
- private static final char FORWARD_RULE_OP = '>';\r
- private static final char REVERSE_RULE_OP = '<';\r
- private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op\r
-\r
- private static final String OPERATORS = "=><\u2190\u2192\u2194";\r
- private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";\r
-\r
- // Other special characters\r
- private static final char QUOTE = '\'';\r
- private static final char ESCAPE = '\\';\r
- private static final char END_OF_RULE = ';';\r
- private static final char RULE_COMMENT_CHAR = '#';\r
-\r
- private static final char CONTEXT_ANTE = '{'; // ante{key\r
- private static final char CONTEXT_POST = '}'; // key}post\r
- private static final char CURSOR_POS = '|';\r
- private static final char CURSOR_OFFSET = '@';\r
- private static final char ANCHOR_START = '^';\r
-\r
- private static final char KLEENE_STAR = '*';\r
- private static final char ONE_OR_MORE = '+';\r
- private static final char ZERO_OR_ONE = '?';\r
-\r
- private static final char DOT = '.';\r
- private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";\r
-\r
- // By definition, the ANCHOR_END special character is a\r
- // trailing SymbolTable.SYMBOL_REF character.\r
- // private static final char ANCHOR_END = '$';\r
-\r
- // Segments of the input string are delimited by "(" and ")". In the\r
- // output string these segments are referenced as "$1", "$2", etc.\r
- private static final char SEGMENT_OPEN = '(';\r
- private static final char SEGMENT_CLOSE = ')';\r
-\r
- // A function is denoted &Source-Target/Variant(text)\r
- private static final char FUNCTION = '&';\r
-\r
- // Aliases for some of the syntax characters. These are provided so\r
- // transliteration rules can be expressed in XML without clashing with\r
- // XML syntax characters '<', '>', and '&'.\r
- private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow\r
- private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow\r
- private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow\r
- private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)\r
- \r
- // Special characters disallowed at the top level\r
- private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");\r
-\r
- // Special characters disallowed within a segment\r
- private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");\r
-\r
- // Special characters disallowed within a function argument\r
- private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");\r
-\r
- //----------------------------------------------------------------------\r
- // class ParseData\r
- //----------------------------------------------------------------------\r
-\r
- /**\r
- * This class implements the SymbolTable interface. It is used\r
- * during parsing to give UnicodeSet access to variables that\r
- * have been defined so far. Note that it uses variablesVector,\r
- * _not_ data.variables.\r
- */\r
- private class ParseData implements SymbolTable {\r
-\r
- /**\r
- * Implement SymbolTable API.\r
- */\r
- public char[] lookup(String name) {\r
- return (char[]) variableNames.get(name);\r
- }\r
-\r
- /**\r
- * Implement SymbolTable API.\r
- */\r
- public UnicodeMatcher lookupMatcher(int ch) {\r
- // Note that we cannot use data.lookup() because the\r
- // set array has not been constructed yet.\r
- int i = ch - curData.variablesBase;\r
- if (i >= 0 && i < variablesVector.size()) {\r
- return (UnicodeMatcher) variablesVector.elementAt(i);\r
- }\r
- return null;\r
- }\r
-\r
- /**\r
- * Implement SymbolTable API. Parse out a symbol reference\r
- * name.\r
- */\r
- public String parseReference(String text, ParsePosition pos, int limit) {\r
- int start = pos.getIndex();\r
- int i = start;\r
- while (i < limit) {\r
- char c = text.charAt(i);\r
- if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||\r
- !UCharacter.isUnicodeIdentifierPart(c)) {\r
- break;\r
- }\r
- ++i;\r
- }\r
- if (i == start) { // No valid name chars\r
- return null;\r
- }\r
- pos.setIndex(i);\r
- return text.substring(start, i);\r
- }\r
-\r
- /**\r
- * Return true if the given character is a matcher standin or a plain\r
- * character (non standin).\r
- */\r
- public boolean isMatcher(int ch) {\r
- // Note that we cannot use data.lookup() because the\r
- // set array has not been constructed yet.\r
- int i = ch - curData.variablesBase;\r
- if (i >= 0 && i < variablesVector.size()) {\r
- return variablesVector.elementAt(i) instanceof UnicodeMatcher;\r
- }\r
- return true;\r
- }\r
-\r
- /**\r
- * Return true if the given character is a replacer standin or a plain\r
- * character (non standin).\r
- */\r
- public boolean isReplacer(int ch) {\r
- // Note that we cannot use data.lookup() because the\r
- // set array has not been constructed yet.\r
- int i = ch - curData.variablesBase;\r
- if (i >= 0 && i < variablesVector.size()) {\r
- return variablesVector.elementAt(i) instanceof UnicodeReplacer;\r
- }\r
- return true;\r
- }\r
- }\r
-\r
- //----------------------------------------------------------------------\r
- // classes RuleBody, RuleArray, and RuleReader\r
- //----------------------------------------------------------------------\r
-\r
- /**\r
- * A private abstract class representing the interface to rule\r
- * source code that is broken up into lines. Handles the\r
- * folding of lines terminated by a backslash. This folding\r
- * is limited; it does not account for comments, quotes, or\r
- * escapes, so its use to be limited.\r
- */\r
- private static abstract class RuleBody {\r
-\r
- /**\r
- * Retrieve the next line of the source, or return null if\r
- * none. Folds lines terminated by a backslash into the\r
- * next line, without regard for comments, quotes, or\r
- * escapes.\r
- */\r
- String nextLine() {\r
- String s = handleNextLine();\r
- if (s != null &&\r
- s.length() > 0 &&\r
- s.charAt(s.length() - 1) == '\\') {\r
-\r
- StringBuffer b = new StringBuffer(s);\r
- do {\r
- b.deleteCharAt(b.length()-1);\r
- s = handleNextLine();\r
- if (s == null) {\r
- break;\r
- }\r
- b.append(s);\r
- } while (s.length() > 0 &&\r
- s.charAt(s.length() - 1) == '\\');\r
-\r
- s = b.toString();\r
- }\r
- return s;\r
- }\r
-\r
- /**\r
- * Reset to the first line of the source.\r
- */\r
- abstract void reset();\r
-\r
- /**\r
- * Subclass method to return the next line of the source.\r
- */\r
- abstract String handleNextLine();\r
- }\r
-\r
- /**\r
- * RuleBody subclass for a String[] array.\r
- */\r
- private static class RuleArray extends RuleBody {\r
- String[] array;\r
- int i;\r
- public RuleArray(String[] array) { this.array = array; i = 0; }\r
- public String handleNextLine() {\r
- return (i < array.length) ? array[i++] : null;\r
- }\r
- public void reset() {\r
- i = 0;\r
- }\r
- }\r
-\r
- /*\r
- * RuleBody subclass for a ResourceReader.\r
- */\r
-/* private static class RuleReader extends RuleBody {\r
- ResourceReader reader;\r
- public RuleReader(ResourceReader reader) { this.reader = reader; }\r
- public String handleNextLine() {\r
- try {\r
- return reader.readLine();\r
- } catch (java.io.IOException e) {}\r
- return null;\r
- }\r
- public void reset() {\r
- reader.reset();\r
- }\r
- }*/\r
-\r
- //----------------------------------------------------------------------\r
- // class RuleHalf\r
- //----------------------------------------------------------------------\r
-\r
- /**\r
- * A class representing one side of a rule. This class knows how to\r
- * parse half of a rule. It is tightly coupled to the method\r
- * TransliteratorParser.parseRule().\r
- */\r
- private static class RuleHalf {\r
-\r
// The parsed pattern for this half; assigned by parse() from the buffer
// filled in by parseSection().
public String text;

public int cursor = -1; // position of cursor in text
public int ante = -1;   // position of ante context marker '{' in text
public int post = -1;   // position of post context marker '}' in text

// Record the offset to the cursor either to the left or to the
// right of the key.  This is indicated by characters on the output
// side that allow the cursor to be positioned arbitrarily within
// the matching text.  For example, abc{def} > | @@@ xyz; changes
// def to xyz and moves the cursor to before abc.  Offset characters
// must be at the start or end, and they cannot move the cursor past
// the ante- or postcontext text.  Placeholders are only valid in
// output text.  The length of the ante and post context is
// determined at runtime, because of supplementals and quantifiers.
// As maintained by parseSection(), the value is -1 for |@, -2 for |@@,
// etc., and 1 for @|, 2 for @@|, etc.
public int cursorOffset = 0; // only nonzero on output side

// Length of the pattern buffer at the moment the first CURSOR_OFFSET
// was seen before any cursor (the "@|" form).  parse() requires the
// cursor to sit at exactly this position when cursorOffset is positive.
private int cursorOffsetPos = 0;

public boolean anchorStart = false;
public boolean anchorEnd   = false;

/**
 * The segment number from 1..n of the next '(' we see
 * during parsing; 1-based.
 */
private int nextSegmentNumber = 1;
-\r
- /**\r
- * Parse one side of a rule, stopping at either the limit,\r
- * the END_OF_RULE character, or an operator.\r
- * @return the index after the terminating character, or\r
- * if limit was reached, limit\r
- */\r
- public int parse(String rule, int pos, int limit,\r
- TransliteratorParser parser) {\r
- int start = pos;\r
- StringBuffer buf = new StringBuffer();\r
- pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);\r
- text = buf.toString();\r
-\r
- if (cursorOffset > 0 && cursor != cursorOffsetPos) {\r
- syntaxError("Misplaced " + CURSOR_POS, rule, start);\r
- }\r
-\r
- return pos;\r
- }\r
-\r
- /**\r
- * Parse a section of one side of a rule, stopping at either\r
- * the limit, the END_OF_RULE character, an operator, or a\r
- * segment close character. This method parses both a\r
- * top-level rule half and a segment within such a rule half.\r
- * It calls itself recursively to parse segments and nested\r
- * segments.\r
- * @param buf buffer into which to accumulate the rule pattern\r
- * characters, either literal characters from the rule or\r
- * standins for UnicodeMatcher objects including segments.\r
- * @param illegal the set of special characters that is illegal during\r
- * this parse.\r
- * @param isSegment if true, then we've already seen a '(' and\r
- * pos on entry points right after it. Accumulate everything\r
- * up to the closing ')', put it in a segment matcher object,\r
- * generate a standin for it, and add the standin to buf. As\r
- * a side effect, update the segments vector with a reference\r
- * to the segment matcher. This works recursively for nested\r
- * segments. If isSegment is false, just accumulate\r
- * characters into buf.\r
- * @return the index after the terminating character, or\r
- * if limit was reached, limit\r
- */\r
- private int parseSection(String rule, int pos, int limit,\r
- TransliteratorParser parser,\r
- StringBuffer buf,\r
- UnicodeSet illegal,\r
- boolean isSegment) {\r
- int start = pos;\r
- ParsePosition pp = null;\r
- int quoteStart = -1; // Most recent 'single quoted string'\r
- int quoteLimit = -1;\r
- int varStart = -1; // Most recent $variableReference\r
- int varLimit = -1;\r
- int[] iref = new int[1];\r
- int bufStart = buf.length();\r
-\r
- main:\r
- while (pos < limit) {\r
- // Since all syntax characters are in the BMP, fetching\r
- // 16-bit code units suffices here.\r
- char c = rule.charAt(pos++);\r
- if (UCharacterProperty.isRuleWhiteSpace(c)) {\r
- continue;\r
- }\r
- // HALF_ENDERS is all chars that end a rule half: "<>=;"\r
- if (HALF_ENDERS.indexOf(c) >= 0) {\r
- if (isSegment) {\r
- syntaxError("Unclosed segment", rule, start);\r
- }\r
- break main;\r
- }\r
- if (anchorEnd) {\r
- // Text after a presumed end anchor is a syntax err\r
- syntaxError("Malformed variable reference", rule, start);\r
- }\r
- if (UnicodeSet.resemblesPattern(rule, pos-1)) {\r
- if (pp == null) {\r
- pp = new ParsePosition(0);\r
- }\r
- pp.setIndex(pos-1); // Backup to opening '['\r
- buf.append(parser.parseSet(rule, pp));\r
- pos = pp.getIndex(); \r
- continue;\r
- }\r
- // Handle escapes\r
- if (c == ESCAPE) {\r
- if (pos == limit) {\r
- syntaxError("Trailing backslash", rule, start);\r
- }\r
- iref[0] = pos;\r
- int escaped = Utility.unescapeAt(rule, iref);\r
- pos = iref[0];\r
- if (escaped == -1) {\r
- syntaxError("Malformed escape", rule, start);\r
- }\r
- parser.checkVariableRange(escaped, rule, start);\r
- UTF16.append(buf, escaped);\r
- continue;\r
- }\r
- // Handle quoted matter\r
- if (c == QUOTE) {\r
- int iq = rule.indexOf(QUOTE, pos);\r
- if (iq == pos) {\r
- buf.append(c); // Parse [''] outside quotes as [']\r
- ++pos;\r
- } else {\r
- /* This loop picks up a run of quoted text of the\r
- * form 'aaaa' each time through. If this run\r
- * hasn't really ended ('aaaa''bbbb') then it keeps\r
- * looping, each time adding on a new run. When it\r
- * reaches the final quote it breaks.\r
- */\r
- quoteStart = buf.length();\r
- for (;;) {\r
- if (iq < 0) {\r
- syntaxError("Unterminated quote", rule, start);\r
- }\r
- buf.append(rule.substring(pos, iq));\r
- pos = iq+1;\r
- if (pos < limit && rule.charAt(pos) == QUOTE) {\r
- // Parse [''] inside quotes as [']\r
- iq = rule.indexOf(QUOTE, pos+1);\r
- // Continue looping\r
- } else {\r
- break;\r
- }\r
- }\r
- quoteLimit = buf.length();\r
- \r
- for (iq=quoteStart; iq<quoteLimit; ++iq) {\r
- parser.checkVariableRange(buf.charAt(iq), rule, start);\r
- }\r
- }\r
- continue;\r
- }\r
-\r
- parser.checkVariableRange(c, rule, start);\r
-\r
- if (illegal.contains(c)) {\r
- syntaxError("Illegal character '" + c + '\'', rule, start);\r
- }\r
-\r
- switch (c) {\r
- \r
- //------------------------------------------------------\r
- // Elements allowed within and out of segments\r
- //------------------------------------------------------\r
- case ANCHOR_START:\r
- if (buf.length() == 0 && !anchorStart) {\r
- anchorStart = true;\r
- } else {\r
- syntaxError("Misplaced anchor start",\r
- rule, start);\r
- }\r
- break;\r
- case SEGMENT_OPEN:\r
- {\r
- // bufSegStart is the offset in buf to the first\r
- // character of the segment we are parsing.\r
- int bufSegStart = buf.length();\r
-\r
- // Record segment number now, since nextSegmentNumber\r
- // will be incremented during the call to parseSection\r
- // if there are nested segments.\r
- int segmentNumber = nextSegmentNumber++; // 1-based\r
-\r
- // Parse the segment\r
- pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);\r
-\r
- // After parsing a segment, the relevant characters are\r
- // in buf, starting at offset bufSegStart. Extract them\r
- // into a string matcher, and replace them with a\r
- // standin for that matcher.\r
- StringMatcher m =\r
- new StringMatcher(buf.substring(bufSegStart),\r
- segmentNumber, parser.curData);\r
-\r
- // Record and associate object and segment number\r
- parser.setSegmentObject(segmentNumber, m);\r
- buf.setLength(bufSegStart);\r
- buf.append(parser.getSegmentStandin(segmentNumber));\r
- }\r
- break;\r
- case FUNCTION:\r
- case ALT_FUNCTION:\r
- {\r
- iref[0] = pos;\r
- TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);\r
- // The next character MUST be a segment open\r
- if (single == null ||\r
- !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {\r
- syntaxError("Invalid function", rule, start);\r
- }\r
-\r
- Transliterator t = single.getInstance();\r
- if (t == null) {\r
- syntaxError("Invalid function ID", rule, start);\r
- }\r
-\r
- // bufSegStart is the offset in buf to the first\r
- // character of the segment we are parsing.\r
- int bufSegStart = buf.length();\r
-\r
- // Parse the segment\r
- pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);\r
-\r
- // After parsing a segment, the relevant characters are\r
- // in buf, starting at offset bufSegStart.\r
- FunctionReplacer r =\r
- new FunctionReplacer(t,\r
- new StringReplacer(buf.substring(bufSegStart), parser.curData));\r
-\r
- // Replace the buffer contents with a stand-in\r
- buf.setLength(bufSegStart);\r
- buf.append(parser.generateStandInFor(r));\r
- }\r
- break;\r
- case SymbolTable.SYMBOL_REF:\r
- // Handle variable references and segment references "$1" .. "$9"\r
- {\r
- // A variable reference must be followed immediately\r
- // by a Unicode identifier start and zero or more\r
- // Unicode identifier part characters, or by a digit\r
- // 1..9 if it is a segment reference.\r
- if (pos == limit) {\r
- // A variable ref character at the end acts as\r
- // an anchor to the context limit, as in perl.\r
- anchorEnd = true;\r
- break;\r
- }\r
- // Parse "$1" "$2" .. "$9" .. (no upper limit)\r
- c = rule.charAt(pos);\r
- int r = UCharacter.digit(c, 10);\r
- if (r >= 1 && r <= 9) {\r
- iref[0] = pos;\r
- r = Utility.parseNumber(rule, iref, 10);\r
- if (r < 0) {\r
- syntaxError("Undefined segment reference",\r
- rule, start);\r
- }\r
- pos = iref[0];\r
- buf.append(parser.getSegmentStandin(r));\r
- } else {\r
- if (pp == null) { // Lazy create\r
- pp = new ParsePosition(0);\r
- }\r
- pp.setIndex(pos);\r
- String name = parser.parseData.\r
- parseReference(rule, pp, limit);\r
- if (name == null) {\r
- // This means the '$' was not followed by a\r
- // valid name. Try to interpret it as an\r
- // end anchor then. If this also doesn't work\r
- // (if we see a following character) then signal\r
- // an error.\r
- anchorEnd = true;\r
- break;\r
- }\r
- pos = pp.getIndex();\r
- // If this is a variable definition statement,\r
- // then the LHS variable will be undefined. In\r
- // that case appendVariableDef() will append the\r
- // special placeholder char variableLimit-1.\r
- varStart = buf.length();\r
- parser.appendVariableDef(name, buf);\r
- varLimit = buf.length();\r
- }\r
- }\r
- break;\r
- case DOT:\r
- buf.append(parser.getDotStandIn());\r
- break;\r
- case KLEENE_STAR:\r
- case ONE_OR_MORE:\r
- case ZERO_OR_ONE:\r
- // Quantifiers. We handle single characters, quoted strings,\r
- // variable references, and segments.\r
- // a+ matches aaa\r
- // 'foo'+ matches foofoofoo\r
- // $v+ matches xyxyxy if $v == xy\r
- // (seg)+ matches segsegseg\r
- {\r
- if (isSegment && buf.length() == bufStart) {\r
- // The */+ immediately follows '('\r
- syntaxError("Misplaced quantifier", rule, start);\r
- break;\r
- } \r
- \r
- int qstart, qlimit;\r
- // The */+ follows an isolated character or quote\r
- // or variable reference\r
- if (buf.length() == quoteLimit) {\r
- // The */+ follows a 'quoted string'\r
- qstart = quoteStart;\r
- qlimit = quoteLimit;\r
- } else if (buf.length() == varLimit) {\r
- // The */+ follows a $variableReference\r
- qstart = varStart;\r
- qlimit = varLimit;\r
- } else {\r
- // The */+ follows a single character, possibly\r
- // a segment standin\r
- qstart = buf.length() - 1;\r
- qlimit = qstart + 1;\r
- }\r
-\r
- UnicodeMatcher m;\r
- try {\r
- m = new StringMatcher(buf.toString(), qstart, qlimit,\r
- 0, parser.curData);\r
- } catch (RuntimeException e) {\r
- final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);\r
- final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";\r
- throw (RuntimeException)\r
- new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"\r
- + postContext)\r
-//#if defined(FOUNDATION10) || defined(J2SE13)\r
-//#else\r
- .initCause(e)\r
-//#endif\r
- ;\r
- }\r
- int min = 0;\r
- int max = Quantifier.MAX;\r
- switch (c) {\r
- case ONE_OR_MORE:\r
- min = 1;\r
- break;\r
- case ZERO_OR_ONE:\r
- min = 0;\r
- max = 1;\r
- break;\r
- // case KLEENE_STAR:\r
- // do nothing -- min, max already set\r
- }\r
- m = new Quantifier(m, min, max);\r
- buf.setLength(qstart);\r
- buf.append(parser.generateStandInFor(m));\r
- }\r
- break;\r
-\r
- //------------------------------------------------------\r
- // Elements allowed ONLY WITHIN segments\r
- //------------------------------------------------------\r
- case SEGMENT_CLOSE:\r
- // assert(isSegment);\r
- // We're done parsing a segment.\r
- break main;\r
-\r
- //------------------------------------------------------\r
- // Elements allowed ONLY OUTSIDE segments\r
- //------------------------------------------------------\r
- case CONTEXT_ANTE:\r
- if (ante >= 0) {\r
- syntaxError("Multiple ante contexts", rule, start);\r
- }\r
- ante = buf.length();\r
- break;\r
- case CONTEXT_POST:\r
- if (post >= 0) {\r
- syntaxError("Multiple post contexts", rule, start);\r
- }\r
- post = buf.length();\r
- break;\r
- case CURSOR_POS:\r
- if (cursor >= 0) {\r
- syntaxError("Multiple cursors", rule, start);\r
- }\r
- cursor = buf.length();\r
- break;\r
- case CURSOR_OFFSET:\r
- if (cursorOffset < 0) {\r
- if (buf.length() > 0) {\r
- syntaxError("Misplaced " + c, rule, start);\r
- }\r
- --cursorOffset;\r
- } else if (cursorOffset > 0) {\r
- if (buf.length() != cursorOffsetPos || cursor >= 0) {\r
- syntaxError("Misplaced " + c, rule, start);\r
- }\r
- ++cursorOffset;\r
- } else {\r
- if (cursor == 0 && buf.length() == 0) {\r
- cursorOffset = -1;\r
- } else if (cursor < 0) {\r
- cursorOffsetPos = buf.length();\r
- cursorOffset = 1;\r
- } else {\r
- syntaxError("Misplaced " + c, rule, start);\r
- }\r
- }\r
- break;\r
-\r
- //------------------------------------------------------\r
- // Non-special characters\r
- //------------------------------------------------------\r
- default:\r
- // Disallow unquoted characters other than [0-9A-Za-z]\r
- // in the printable ASCII range. These characters are\r
- // reserved for possible future use.\r
- if (c >= 0x0021 && c <= 0x007E &&\r
- !((c >= '0' && c <= '9') ||\r
- (c >= 'A' && c <= 'Z') ||\r
- (c >= 'a' && c <= 'z'))) {\r
- syntaxError("Unquoted " + c, rule, start);\r
- }\r
- buf.append(c);\r
- break;\r
- }\r
- }\r
- return pos;\r
- }\r
-\r
- /**\r
- * Remove context.\r
- */\r
- void removeContext() {\r
- text = text.substring(ante < 0 ? 0 : ante,\r
- post < 0 ? text.length() : post);\r
- ante = post = -1;\r
- anchorStart = anchorEnd = false;\r
- }\r
-\r
- /**\r
- * Return true if this half looks like valid output, that is, does not\r
- * contain quantifiers or other special input-only elements.\r
- */\r
- public boolean isValidOutput(TransliteratorParser parser) {\r
- for (int i=0; i<text.length(); ) {\r
- int c = UTF16.charAt(text, i);\r
- i += UTF16.getCharCount(c);\r
- if (!parser.parseData.isReplacer(c)) {\r
- return false;\r
- }\r
- }\r
- return true;\r
- }\r
-\r
- /**\r
- * Return true if this half looks like valid input, that is, does not\r
- * contain functions or other special output-only elements.\r
- */\r
- public boolean isValidInput(TransliteratorParser parser) {\r
- for (int i=0; i<text.length(); ) {\r
- int c = UTF16.charAt(text, i);\r
- i += UTF16.getCharCount(c);\r
- if (!parser.parseData.isMatcher(c)) {\r
- return false;\r
- }\r
- }\r
- return true;\r
- }\r
- }\r
-\r
- //----------------------------------------------------------------------\r
- // PUBLIC methods\r
- //----------------------------------------------------------------------\r
-\r
- /**\r
- * Constructor.\r
- */\r
- public TransliteratorParser() {\r
- }\r
-\r
- /**\r
- * Parse a set of rules. After the parse completes, examine the public\r
- * data members for results.\r
- */\r
- public void parse(String rules, int dir) {\r
- parseRules(new RuleArray(new String[] { rules }), dir);\r
- }\r
- \r
- /*\r
- * Parse a set of rules. After the parse completes, examine the public\r
- * data members for results.\r
- */\r
-/* public void parse(ResourceReader rules, int direction) {\r
- parseRules(new RuleReader(rules), direction);\r
- }*/\r
-\r
- //----------------------------------------------------------------------\r
- // PRIVATE methods\r
- //----------------------------------------------------------------------\r
-\r
- /**\r
- * Parse an array of zero or more rules. The strings in the array are\r
- * treated as if they were concatenated together, with rule terminators\r
- * inserted between array elements if not present already.\r
- *\r
- * Any previous rules are discarded. Typically this method is called exactly\r
- * once, during construction.\r
- *\r
- * The member this.data will be set to null if there are no rules.\r
- *\r
- * @exception IllegalIcuArgumentException if there is a syntax error in the\r
- * rules\r
- */\r
- void parseRules(RuleBody ruleArray, int dir) {\r
- boolean parsingIDs = true;\r
- int ruleCount = 0;\r
-\r
- dataVector = new Vector();\r
- idBlockVector = new Vector();\r
- curData = null;\r
- direction = dir;\r
- compoundFilter = null;\r
- variablesVector = new Vector();\r
- variableNames = new Hashtable();\r
- parseData = new ParseData();\r
-\r
- List errors = new ArrayList();\r
- int errorCount = 0;\r
-\r
- ruleArray.reset();\r
-\r
- StringBuffer idBlockResult = new StringBuffer();\r
-\r
- // The compound filter offset is an index into idBlockResult.\r
- // If it is 0, then the compound filter occurred at the start,\r
- // and it is the offset to the _start_ of the compound filter\r
- // pattern. Otherwise it is the offset to the _limit_ of the\r
- // compound filter pattern within idBlockResult.\r
- this.compoundFilter = null;\r
- int compoundFilterOffset = -1;\r
-\r
- main:\r
- for (;;) {\r
- String rule = ruleArray.nextLine();\r
- if (rule == null) {\r
- break;\r
- }\r
- int pos = 0;\r
- int limit = rule.length();\r
- while (pos < limit) {\r
- char c = rule.charAt(pos++);\r
- if (UCharacterProperty.isRuleWhiteSpace(c)) {\r
- continue;\r
- }\r
- // Skip lines starting with the comment character\r
- if (c == RULE_COMMENT_CHAR) {\r
- pos = rule.indexOf("\n", pos) + 1;\r
- if (pos == 0) {\r
- break; // No "\n" found; rest of rule is a commnet\r
- }\r
- continue; // Either fall out or restart with next line\r
- }\r
-\r
- // skip empty rules\r
- if (c == END_OF_RULE)\r
- continue;\r
-\r
- // Often a rule file contains multiple errors. It's\r
- // convenient to the rule author if these are all reported\r
- // at once. We keep parsing rules even after a failure, up\r
- // to a specified limit, and report all errors at once.\r
- try {\r
- ++ruleCount;\r
-\r
- // We've found the start of a rule or ID. c is its first\r
- // character, and pos points past c.\r
- --pos;\r
- // Look for an ID token. Must have at least ID_TOKEN_LEN + 1\r
- // chars left.\r
- if ((pos + ID_TOKEN_LEN + 1) <= limit &&\r
- rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {\r
- pos += ID_TOKEN_LEN;\r
- c = rule.charAt(pos);\r
- while (UCharacterProperty.isRuleWhiteSpace(c) && pos < limit) {\r
- ++pos;\r
- c = rule.charAt(pos);\r
- }\r
- int[] p = new int[] { pos };\r
-\r
- if (!parsingIDs) {\r
- if (curData != null) {\r
- if (direction == Transliterator.FORWARD)\r
- dataVector.add(curData);\r
- else\r
- dataVector.insertElementAt(curData, 0);\r
- curData = null;\r
- }\r
- parsingIDs = true;\r
- }\r
-\r
- TransliteratorIDParser.SingleID id =\r
- TransliteratorIDParser.parseSingleID(\r
- rule, p, direction);\r
- if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {\r
- // Successful ::ID parse.\r
-\r
- if (direction == Transliterator.FORWARD) {\r
- idBlockResult.append(id.canonID).append(END_OF_RULE);\r
- } else {\r
- idBlockResult.insert(0, id.canonID + END_OF_RULE);\r
- }\r
-\r
- } else {\r
- // Couldn't parse an ID. Try to parse a global filter\r
- int[] withParens = new int[] { -1 };\r
- UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);\r
- if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {\r
- if ((direction == Transliterator.FORWARD) ==\r
- (withParens[0] == 0)) {\r
- if (compoundFilter != null) {\r
- // Multiple compound filters\r
- syntaxError("Multiple global filters", rule, pos);\r
- }\r
- compoundFilter = f;\r
- compoundFilterOffset = ruleCount;\r
- }\r
- } else {\r
- // Invalid ::id\r
- // Can be parsed as neither an ID nor a global filter\r
- syntaxError("Invalid ::ID", rule, pos);\r
- }\r
- }\r
-\r
- pos = p[0];\r
- } else {\r
- if (parsingIDs) {\r
- if (direction == Transliterator.FORWARD)\r
- idBlockVector.add(idBlockResult.toString());\r
- else\r
- idBlockVector.insertElementAt(idBlockResult.toString(), 0);\r
- idBlockResult.delete(0, idBlockResult.length());\r
- parsingIDs = false;\r
- curData = new RuleBasedTransliterator.Data();\r
-\r
- // By default, rules use part of the private use area\r
- // E000..F8FF for variables and other stand-ins. Currently\r
- // the range F000..F8FF is typically sufficient. The 'use\r
- // variable range' pragma allows rule sets to modify this.\r
- setVariableRange(0xF000, 0xF8FF);\r
- }\r
-\r
- if (resemblesPragma(rule, pos, limit)) {\r
- int ppp = parsePragma(rule, pos, limit);\r
- if (ppp < 0) {\r
- syntaxError("Unrecognized pragma", rule, pos);\r
- }\r
- pos = ppp;\r
- // Parse a rule\r
- } else {\r
- pos = parseRule(rule, pos, limit);\r
- }\r
- }\r
- } catch (IllegalArgumentException e) {\r
- if (errorCount == 30) {\r
- errors.add(new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched")\r
-//#if defined(FOUNDATION10) || defined(J2SE13)\r
-//#else\r
- .initCause(e)\r
-//#endif\r
- );\r
- break main;\r
- }\r
- e.fillInStackTrace();\r
- errors.add(e);\r
- ++errorCount;\r
- pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'\r
- }\r
- }\r
- }\r
- if (parsingIDs && idBlockResult.length() > 0) {\r
- if (direction == Transliterator.FORWARD)\r
- idBlockVector.add(idBlockResult.toString());\r
- else\r
- idBlockVector.insertElementAt(idBlockResult.toString(), 0);\r
- }\r
- else if (!parsingIDs && curData != null) {\r
- if (direction == Transliterator.FORWARD)\r
- dataVector.add(curData);\r
- else\r
- dataVector.insertElementAt(curData, 0);\r
- }\r
-\r
- // Convert the set vector to an array\r
- for (int i = 0; i < dataVector.size(); i++) {\r
- RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);\r
- data.variables = new Object[variablesVector.size()];\r
- variablesVector.copyInto(data.variables);\r
- data.variableNames = new Hashtable();\r
- data.variableNames.putAll(variableNames);\r
- }\r
- variablesVector = null;\r
-\r
- // Do more syntax checking and index the rules\r
- try {\r
- if (compoundFilter != null) {\r
- if ((direction == Transliterator.FORWARD &&\r
- compoundFilterOffset != 1) ||\r
- (direction == Transliterator.REVERSE &&\r
- compoundFilterOffset != ruleCount)) {\r
- throw new IllegalIcuArgumentException("Compound filters misplaced");\r
- }\r
- }\r
-\r
- for (int i = 0; i < dataVector.size(); i++) {\r
- RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);\r
- data.ruleSet.freeze();\r
- }\r
-\r
- if (idBlockVector.size() == 1 && ((String)idBlockVector.get(0)).length() == 0)\r
- idBlockVector.remove(0);\r
-\r
- } catch (IllegalArgumentException e) {\r
- e.fillInStackTrace();\r
- errors.add(e);\r
- }\r
-\r
- if (errors.size() != 0) {\r
-//#if defined(FOUNDATION10) || defined(J2SE13)\r
-//#else\r
- for (int i = errors.size()-1; i > 0; --i) {\r
- RuntimeException previous = (RuntimeException) errors.get(i-1);\r
- while (previous.getCause() != null) {\r
- previous = (RuntimeException) previous.getCause(); // chain specially\r
- }\r
- previous.initCause((RuntimeException) errors.get(i));\r
- }\r
-//#endif\r
- throw (RuntimeException) errors.get(0);\r
- // if initCause not supported: throw new IllegalArgumentException(errors.toString());\r
- }\r
- }\r
-\r
- /**\r
- * MAIN PARSER. Parse the next rule in the given rule string, starting\r
- * at pos. Return the index after the last character parsed. Do not\r
- * parse characters at or after limit.\r
- *\r
- * Important: The character at pos must be a non-whitespace character\r
- * that is not the comment character.\r
- *\r
- * This method handles quoting, escaping, and whitespace removal. It\r
- * parses the end-of-rule character. It recognizes context and cursor\r
- * indicators. Once it does a lexical breakdown of the rule at pos, it\r
- * creates a rule object and adds it to our rule list.\r
- *\r
- * This method is tightly coupled to the inner class RuleHalf.\r
- */\r
- private int parseRule(String rule, int pos, int limit) {\r
- // Locate the left side, operator, and right side\r
- int start = pos;\r
- char operator = 0;\r
-\r
- // Set up segments data\r
- segmentStandins = new StringBuffer();\r
- segmentObjects = new Vector();\r
-\r
- RuleHalf left = new RuleHalf();\r
- RuleHalf right = new RuleHalf();\r
-\r
- undefinedVariableName = null;\r
- pos = left.parse(rule, pos, limit, this);\r
-\r
- if (pos == limit ||\r
- OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {\r
- syntaxError("No operator pos=" + pos, rule, start);\r
- }\r
- ++pos;\r
-\r
- // Found an operator char. Check for forward-reverse operator.\r
- if (operator == REVERSE_RULE_OP &&\r
- (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {\r
- ++pos;\r
- operator = FWDREV_RULE_OP;\r
- }\r
-\r
- // Translate alternate op characters.\r
- switch (operator) {\r
- case ALT_FORWARD_RULE_OP:\r
- operator = FORWARD_RULE_OP;\r
- break;\r
- case ALT_REVERSE_RULE_OP:\r
- operator = REVERSE_RULE_OP;\r
- break;\r
- case ALT_FWDREV_RULE_OP:\r
- operator = FWDREV_RULE_OP;\r
- break;\r
- }\r
-\r
- pos = right.parse(rule, pos, limit, this);\r
-\r
- if (pos < limit) {\r
- if (rule.charAt(--pos) == END_OF_RULE) {\r
- ++pos;\r
- } else {\r
- // RuleHalf parser must have terminated at an operator\r
- syntaxError("Unquoted operator", rule, start);\r
- }\r
- }\r
-\r
- if (operator == VARIABLE_DEF_OP) {\r
- // LHS is the name. RHS is a single character, either a literal\r
- // or a set (already parsed). If RHS is longer than one\r
- // character, it is either a multi-character string, or multiple\r
- // sets, or a mixture of chars and sets -- syntax error.\r
-\r
- // We expect to see a single undefined variable (the one being\r
- // defined).\r
- if (undefinedVariableName == null) {\r
- syntaxError("Missing '$' or duplicate definition", rule, start);\r
- }\r
- if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {\r
- syntaxError("Malformed LHS", rule, start);\r
- }\r
- if (left.anchorStart || left.anchorEnd ||\r
- right.anchorStart || right.anchorEnd) {\r
- syntaxError("Malformed variable def", rule, start);\r
- }\r
- // We allow anything on the right, including an empty string.\r
- int n = right.text.length();\r
- char[] value = new char[n];\r
- right.text.getChars(0, n, value, 0);\r
- variableNames.put(undefinedVariableName, value);\r
-\r
- ++variableLimit;\r
- return pos;\r
- }\r
-\r
- // If this is not a variable definition rule, we shouldn't have\r
- // any undefined variable names.\r
- if (undefinedVariableName != null) {\r
- syntaxError("Undefined variable $" + undefinedVariableName,\r
- rule, start);\r
- }\r
-\r
- // Verify segments\r
- if (segmentStandins.length() > segmentObjects.size()) {\r
- syntaxError("Undefined segment reference", rule, start);\r
- }\r
- for (int i=0; i<segmentStandins.length(); ++i) {\r
- if (segmentStandins.charAt(i) == 0) {\r
- syntaxError("Internal error", rule, start); // will never happen\r
- }\r
- }\r
- for (int i=0; i<segmentObjects.size(); ++i) {\r
- if (segmentObjects.elementAt(i) == null) {\r
- syntaxError("Internal error", rule, start); // will never happen\r
- }\r
- }\r
-\r
- // If the direction we want doesn't match the rule\r
- // direction, do nothing.\r
- if (operator != FWDREV_RULE_OP &&\r
- ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {\r
- return pos;\r
- }\r
-\r
- // Transform the rule into a forward rule by swapping the\r
- // sides if necessary.\r
- if (direction == Transliterator.REVERSE) {\r
- RuleHalf temp = left;\r
- left = right;\r
- right = temp;\r
- }\r
-\r
- // Remove non-applicable elements in forward-reverse\r
- // rules. Bidirectional rules ignore elements that do not\r
- // apply.\r
- if (operator == FWDREV_RULE_OP) {\r
- right.removeContext();\r
- left.cursor = -1;\r
- left.cursorOffset = 0;\r
- }\r
-\r
- // Normalize context\r
- if (left.ante < 0) {\r
- left.ante = 0;\r
- }\r
- if (left.post < 0) {\r
- left.post = left.text.length();\r
- }\r
-\r
- // Context is only allowed on the input side. Cursors are only\r
- // allowed on the output side. Segment delimiters can only appear\r
- // on the left, and references on the right. Cursor offset\r
- // cannot appear without an explicit cursor. Cursor offset\r
- // cannot place the cursor outside the limits of the context.\r
- // Anchors are only allowed on the input side.\r
- if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||\r
- (right.cursorOffset != 0 && right.cursor < 0) ||\r
- // - The following two checks were used to ensure that the\r
- // - the cursor offset stayed within the ante- or postcontext.\r
- // - However, with the addition of quantifiers, we have to\r
- // - allow arbitrary cursor offsets and do runtime checking.\r
- //(right.cursorOffset > (left.text.length() - left.post)) ||\r
- //(-right.cursorOffset > left.ante) ||\r
- right.anchorStart || right.anchorEnd ||\r
- !left.isValidInput(this) || !right.isValidOutput(this) ||\r
- left.ante > left.post) {\r
- syntaxError("Malformed rule", rule, start);\r
- }\r
-\r
- // Flatten segment objects vector to an array\r
- UnicodeMatcher[] segmentsArray = null;\r
- if (segmentObjects.size() > 0) {\r
- segmentsArray = new UnicodeMatcher[segmentObjects.size()];\r
- segmentObjects.toArray(segmentsArray);\r
- }\r
-\r
- curData.ruleSet.addRule(new TransliterationRule(\r
- left.text, left.ante, left.post,\r
- right.text, right.cursor, right.cursorOffset,\r
- segmentsArray,\r
- left.anchorStart, left.anchorEnd,\r
- curData));\r
-\r
- return pos;\r
- }\r
-\r
- /**\r
- * Set the variable range to [start, end] (inclusive).\r
- */\r
- private void setVariableRange(int start, int end) {\r
- if (start > end || start < 0 || end > 0xFFFF) {\r
- throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);\r
- }\r
- \r
- curData.variablesBase = (char) start; // first private use\r
-\r
- if (dataVector.size() == 0) {\r
- variableNext = (char) start;\r
- variableLimit = (char) (end + 1);\r
- }\r
- }\r
-\r
- /**\r
- * Assert that the given character is NOT within the variable range.\r
- * If it is, signal an error. This is neccesary to ensure that the\r
- * variable range does not overlap characters used in a rule.\r
- */\r
- private void checkVariableRange(int ch, String rule, int start) {\r
- if (ch >= curData.variablesBase && ch < variableLimit) {\r
- syntaxError("Variable range character in rule", rule, start);\r
- }\r
- }\r
-\r
- // (The following method is part of an unimplemented feature.\r
- // Remove this clover pragma after the feature is implemented.\r
- // 2003-06-11 ICU 2.6 Alan)\r
- ///CLOVER:OFF\r
- /**\r
- * Set the maximum backup to 'backup', in response to a pragma\r
- * statement.\r
- */\r
- private void pragmaMaximumBackup(int backup) {\r
- //TODO Finish\r
- throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");\r
- }\r
- ///CLOVER:ON\r
-\r
- // (The following method is part of an unimplemented feature.\r
- // Remove this clover pragma after the feature is implemented.\r
- // 2003-06-11 ICU 2.6 Alan)\r
- ///CLOVER:OFF\r
- /**\r
- * Begin normalizing all rules using the given mode, in response\r
- * to a pragma statement.\r
- */\r
- private void pragmaNormalizeRules(Normalizer.Mode mode) {\r
- //TODO Finish\r
- throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");\r
- }\r
- ///CLOVER:ON\r
-\r
- /**\r
- * Return true if the given rule looks like a pragma.\r
- * @param pos offset to the first non-whitespace character\r
- * of the rule.\r
- * @param limit pointer past the last character of the rule.\r
- */\r
- static boolean resemblesPragma(String rule, int pos, int limit) {\r
- // Must start with /use\s/i\r
- return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;\r
- }\r
-\r
- /**\r
- * Parse a pragma. This method assumes resemblesPragma() has\r
- * already returned true.\r
- * @param pos offset to the first non-whitespace character\r
- * of the rule.\r
- * @param limit pointer past the last character of the rule.\r
- * @return the position index after the final ';' of the pragma,\r
- * or -1 on failure.\r
- */\r
- private int parsePragma(String rule, int pos, int limit) {\r
- int[] array = new int[2];\r
-\r
- // resemblesPragma() has already returned true, so we\r
- // know that pos points to /use\s/i; we can skip 4 characters\r
- // immediately\r
- pos += 4;\r
- \r
- // Here are the pragmas we recognize:\r
- // use variable range 0xE000 0xEFFF;\r
- // use maximum backup 16;\r
- // use nfd rules;\r
- int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);\r
- if (p >= 0) {\r
- setVariableRange(array[0], array[1]);\r
- return p;\r
- }\r
-\r
- p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);\r
- if (p >= 0) {\r
- pragmaMaximumBackup(array[0]);\r
- return p;\r
- }\r
-\r
- p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);\r
- if (p >= 0) {\r
- pragmaNormalizeRules(Normalizer.NFD);\r
- return p;\r
- }\r
-\r
- p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);\r
- if (p >= 0) {\r
- pragmaNormalizeRules(Normalizer.NFC);\r
- return p;\r
- }\r
-\r
- // Syntax error: unable to parse pragma\r
- return -1;\r
- }\r
-\r
- /**\r
- * Throw an exception indicating a syntax error. Search the rule string\r
- * for the probable end of the rule. Of course, if the error is that\r
- * the end of rule marker is missing, then the rule end will not be found.\r
- * In any case the rule start will be correctly reported.\r
- * @param msg error description\r
- * @param rule pattern string\r
- * @param start position of first character of current rule\r
- */\r
- static final void syntaxError(String msg, String rule, int start) {\r
- int end = ruleEnd(rule, start, rule.length());\r
- throw new IllegalIcuArgumentException(msg + " in \"" +\r
- Utility.escape(rule.substring(start, end)) + '"');\r
- }\r
-\r
- static final int ruleEnd(String rule, int start, int limit) {\r
- int end = Utility.quotedIndexOf(rule, start, limit, ";");\r
- if (end < 0) {\r
- end = limit;\r
- }\r
- return end;\r
- }\r
-\r
- /**\r
- * Parse a UnicodeSet out, store it, and return the stand-in character\r
- * used to represent it.\r
- */\r
- private final char parseSet(String rule, ParsePosition pos) {\r
- UnicodeSet set = new UnicodeSet(rule, pos, parseData);\r
- if (variableNext >= variableLimit) {\r
- throw new RuntimeException("Private use variables exhausted");\r
- }\r
- set.compact();\r
- return generateStandInFor(set);\r
- }\r
-\r
- /**\r
- * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.\r
- * Store the object.\r
- */\r
- char generateStandInFor(Object obj) {\r
- // assert(obj != null);\r
-\r
- // Look up previous stand-in, if any. This is a short list\r
- // (typical n is 0, 1, or 2); linear search is optimal.\r
- for (int i=0; i<variablesVector.size(); ++i) {\r
- if (variablesVector.elementAt(i) == obj) { // [sic] pointer comparison\r
- return (char) (curData.variablesBase + i);\r
- }\r
- }\r
-\r
- if (variableNext >= variableLimit) {\r
- throw new RuntimeException("Variable range exhausted");\r
- }\r
- variablesVector.addElement(obj);\r
- return variableNext++;\r
- }\r
-\r
- /**\r
- * Return the standin for segment seg (1-based).\r
- */\r
- public char getSegmentStandin(int seg) {\r
- if (segmentStandins.length() < seg) {\r
- segmentStandins.setLength(seg);\r
- }\r
- char c = segmentStandins.charAt(seg-1);\r
- if (c == 0) {\r
- if (variableNext >= variableLimit) {\r
- throw new RuntimeException("Variable range exhausted");\r
- }\r
- c = variableNext++;\r
- // Set a placeholder in the master variables vector that will be\r
- // filled in later by setSegmentObject(). We know that we will get\r
- // called first because setSegmentObject() will call us.\r
- variablesVector.addElement(null);\r
- segmentStandins.setCharAt(seg-1, c);\r
- }\r
- return c;\r
- }\r
- \r
- /**\r
- * Set the object for segment seg (1-based).\r
- */\r
- public void setSegmentObject(int seg, StringMatcher obj) {\r
- // Since we call parseSection() recursively, nested\r
- // segments will result in segment i+1 getting parsed\r
- // and stored before segment i; be careful with the\r
- // vector handling here.\r
- if (segmentObjects.size() < seg) {\r
- segmentObjects.setSize(seg);\r
- }\r
- int index = getSegmentStandin(seg) - curData.variablesBase;\r
- if (segmentObjects.elementAt(seg-1) != null ||\r
- variablesVector.elementAt(index) != null) {\r
- throw new RuntimeException(); // should never happen\r
- }\r
- segmentObjects.setElementAt(obj, seg-1);\r
- variablesVector.setElementAt(obj, index);\r
- }\r
-\r
- /**\r
- * Return the stand-in for the dot set. It is allocated the first\r
- * time and reused thereafter.\r
- */\r
- char getDotStandIn() {\r
- if (dotStandIn == -1) {\r
- dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));\r
- }\r
- return (char) dotStandIn;\r
- }\r
-\r
- /**\r
- * Append the value of the given variable name to the given\r
- * StringBuffer.\r
- * @exception IllegalIcuArgumentException if the name is unknown.\r
- */\r
- private void appendVariableDef(String name, StringBuffer buf) {\r
- char[] ch = (char[]) variableNames.get(name);\r
- if (ch == null) {\r
- // We allow one undefined variable so that variable definition\r
- // statements work. For the first undefined variable we return\r
- // the special placeholder variableLimit-1, and save the variable\r
- // name.\r
- if (undefinedVariableName == null) {\r
- undefinedVariableName = name;\r
- if (variableNext >= variableLimit) {\r
- throw new RuntimeException("Private use variables exhausted");\r
- }\r
- buf.append((char) --variableLimit);\r
- } else {\r
- throw new IllegalIcuArgumentException("Undefined variable $"\r
- + name);\r
- }\r
- } else {\r
- buf.append(ch);\r
- }\r
- }\r
-}\r
-\r
-//eof\r
+//##header J2SE15
+/*
+**********************************************************************
+* Copyright (c) 2001-2009, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+*/
+package com.ibm.icu.text;
+
+import com.ibm.icu.impl.IllegalIcuArgumentException;
+import com.ibm.icu.impl.Utility;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Vector;
+import java.util.Hashtable;
+import java.text.ParsePosition;
+import com.ibm.icu.lang.*;
+import com.ibm.icu.impl.UCharacterProperty;
+
+class TransliteratorParser {
+
+ //----------------------------------------------------------------------
+ // Data members
+ //----------------------------------------------------------------------
+
+ /**
+ * PUBLIC data member.
+ * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
+ * of rules in the rule set
+ */
+ public Vector dataVector;
+
+ /**
+ * PUBLIC data member.
+ * A Vector of Strings containing all of the ID blocks in the rule set
+ */
+ public Vector idBlockVector;
+
+ /**
+ * The current data object for which we are parsing rules
+ */
+ private RuleBasedTransliterator.Data curData;
+
+ /**
+ * PUBLIC data member containing the parsed compound filter, if any.
+ */
+ public UnicodeSet compoundFilter;
+
+
+ private int direction;
+
+ /**
+ * Temporary symbol table used during parsing.
+ */
+ private ParseData parseData;
+
+ /**
+ * Temporary vector of set variables. When parsing is complete, this
+ * is copied into the array data.variables. As with data.variables,
+ * element 0 corresponds to character data.variablesBase.
+ */
+ private Vector variablesVector;
+
+ /**
+ * Temporary table of variable names. When parsing is complete, this is
+ * copied into data.variableNames.
+ */
+ private Hashtable variableNames;
+
+ /**
+ * String of standins for segments. Used during the parsing of a single
+ * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
+ * to StringMatcher object segmentObjects.elementAt(0), etc.
+ */
+ private StringBuffer segmentStandins;
+
+ /**
+ * Vector of StringMatcher objects for segments. Used during the
+ * parsing of a single rule.
+ * segmentStandins.charAt(0) is the standin for "$1" and corresponds
+ * to StringMatcher object segmentObjects.elementAt(0), etc.
+ */
+ private Vector segmentObjects;
+
+ /**
+ * The next available stand-in for variables. This starts at some point in
+ * the private use area (discovered dynamically) and increments up toward
+ * <code>variableLimit</code>. At any point during parsing, available
+ * variables are <code>variableNext..variableLimit-1</code>.
+ */
+ private char variableNext;
+
+ /**
+ * One past the last available stand-in for variables. This is discovered
+ * dynamically. At any point during parsing, available variables are
+ * <code>variableNext..variableLimit-1</code>. During variable definition
+ * we use the special value variableLimit-1 as a placeholder.
+ */
+ private char variableLimit;
+
+ /**
+ * When we encounter an undefined variable, we do not immediately signal
+ * an error, in case we are defining this variable, e.g., "$a = [a-z];".
+ * Instead, we save the name of the undefined variable, and substitute
+ * in the placeholder char variableLimit - 1, and decrement
+ * variableLimit.
+ */
+ private String undefinedVariableName;
+
+ /**
+ * The stand-in character for the 'dot' set, represented by '.' in
+ * patterns. This is allocated the first time it is needed, and
+ * reused thereafter.
+ */
+ private int dotStandIn = -1;
+
+ //----------------------------------------------------------------------
+ // Constants
+ //----------------------------------------------------------------------
+
+ // Indicator for ID blocks
+ private static final String ID_TOKEN = "::";
+ private static final int ID_TOKEN_LEN = 2;
+
+/*
+(reserved for future expansion)
+ // markers for beginning and end of rule groups
+ private static final String BEGIN_TOKEN = "BEGIN";
+ private static final String END_TOKEN = "END";
+*/
+
+ // Operators
+ private static final char VARIABLE_DEF_OP = '=';
+ private static final char FORWARD_RULE_OP = '>';
+ private static final char REVERSE_RULE_OP = '<';
+ private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
+
+ private static final String OPERATORS = "=><\u2190\u2192\u2194";
+ private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";
+
+ // Other special characters
+ private static final char QUOTE = '\'';
+ private static final char ESCAPE = '\\';
+ private static final char END_OF_RULE = ';';
+ private static final char RULE_COMMENT_CHAR = '#';
+
+ private static final char CONTEXT_ANTE = '{'; // ante{key
+ private static final char CONTEXT_POST = '}'; // key}post
+ private static final char CURSOR_POS = '|';
+ private static final char CURSOR_OFFSET = '@';
+ private static final char ANCHOR_START = '^';
+
+ private static final char KLEENE_STAR = '*';
+ private static final char ONE_OR_MORE = '+';
+ private static final char ZERO_OR_ONE = '?';
+
+ private static final char DOT = '.';
+ private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";
+
+ // By definition, the ANCHOR_END special character is a
+ // trailing SymbolTable.SYMBOL_REF character.
+ // private static final char ANCHOR_END = '$';
+
+ // Segments of the input string are delimited by "(" and ")". In the
+ // output string these segments are referenced as "$1", "$2", etc.
+ private static final char SEGMENT_OPEN = '(';
+ private static final char SEGMENT_CLOSE = ')';
+
+ // A function is denoted &Source-Target/Variant(text)
+ private static final char FUNCTION = '&';
+
+ // Aliases for some of the syntax characters. These are provided so
+ // transliteration rules can be expressed in XML without clashing with
+ // XML syntax characters '<', '>', and '&'.
+ private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
+ private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
+ private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
+ private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)
+
+ // Special characters disallowed at the top level
+ private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
+
+ // Special characters disallowed within a segment
+ private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
+
+ // Special characters disallowed within a function argument
+ private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
+
+ //----------------------------------------------------------------------
+ // class ParseData
+ //----------------------------------------------------------------------
+
+ /**
+ * This class implements the SymbolTable interface. It is used
+ * during parsing to give UnicodeSet access to variables that
+ * have been defined so far. Note that it uses variablesVector,
+ * _not_ data.variables.
+ */
+ private class ParseData implements SymbolTable {
+
+ /**
+ * Implement SymbolTable API.
+ */
+ public char[] lookup(String name) {
+ return (char[]) variableNames.get(name);
+ }
+
+ /**
+ * Implement SymbolTable API.
+ */
+ public UnicodeMatcher lookupMatcher(int ch) {
+ // Note that we cannot use data.lookup() because the
+ // set array has not been constructed yet.
+ int i = ch - curData.variablesBase;
+ if (i >= 0 && i < variablesVector.size()) {
+ return (UnicodeMatcher) variablesVector.elementAt(i);
+ }
+ return null;
+ }
+
+ /**
+ * Implement SymbolTable API. Parse out a symbol reference
+ * name.
+ */
+ public String parseReference(String text, ParsePosition pos, int limit) {
+ int start = pos.getIndex();
+ int i = start;
+ while (i < limit) {
+ char c = text.charAt(i);
+ if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) ||
+ !UCharacter.isUnicodeIdentifierPart(c)) {
+ break;
+ }
+ ++i;
+ }
+ if (i == start) { // No valid name chars
+ return null;
+ }
+ pos.setIndex(i);
+ return text.substring(start, i);
+ }
+
+ /**
+ * Return true if the given character is a matcher standin or a plain
+ * character (non standin).
+ */
+ public boolean isMatcher(int ch) {
+ // Note that we cannot use data.lookup() because the
+ // set array has not been constructed yet.
+ int i = ch - curData.variablesBase;
+ if (i >= 0 && i < variablesVector.size()) {
+ return variablesVector.elementAt(i) instanceof UnicodeMatcher;
+ }
+ return true;
+ }
+
+ /**
+ * Return true if the given character is a replacer standin or a plain
+ * character (non standin).
+ */
+ public boolean isReplacer(int ch) {
+ // Note that we cannot use data.lookup() because the
+ // set array has not been constructed yet.
+ int i = ch - curData.variablesBase;
+ if (i >= 0 && i < variablesVector.size()) {
+ return variablesVector.elementAt(i) instanceof UnicodeReplacer;
+ }
+ return true;
+ }
+ }
+
+ //----------------------------------------------------------------------
+ // classes RuleBody, RuleArray, and RuleReader
+ //----------------------------------------------------------------------
+
+ /**
+ * A private abstract class representing the interface to rule
+ * source code that is broken up into lines. Handles the
+ * folding of lines terminated by a backslash. This folding
+ * is limited; it does not account for comments, quotes, or
+ * escapes, so its use should be limited.
+ */
+ private static abstract class RuleBody {
+
+ /**
+ * Retrieve the next line of the source, or return null if
+ * none. Folds lines terminated by a backslash into the
+ * next line, without regard for comments, quotes, or
+ * escapes.
+ */
+ String nextLine() {
+ String s = handleNextLine();
+ if (s != null &&
+ s.length() > 0 &&
+ s.charAt(s.length() - 1) == '\\') {
+
+ StringBuffer b = new StringBuffer(s);
+ do {
+ b.deleteCharAt(b.length()-1);
+ s = handleNextLine();
+ if (s == null) {
+ break;
+ }
+ b.append(s);
+ } while (s.length() > 0 &&
+ s.charAt(s.length() - 1) == '\\');
+
+ s = b.toString();
+ }
+ return s;
+ }
+
+ /**
+ * Reset to the first line of the source.
+ */
+ abstract void reset();
+
+ /**
+ * Subclass method to return the next line of the source.
+ */
+ abstract String handleNextLine();
+ }
+
+ /**
+ * RuleBody subclass for a String[] array.
+ */
+ private static class RuleArray extends RuleBody {
+ String[] array;
+ int i;
+ public RuleArray(String[] array) { this.array = array; i = 0; }
+ public String handleNextLine() {
+ return (i < array.length) ? array[i++] : null;
+ }
+ public void reset() {
+ i = 0;
+ }
+ }
+
+ /*
+ * RuleBody subclass for a ResourceReader.
+ */
+/* private static class RuleReader extends RuleBody {
+ ResourceReader reader;
+ public RuleReader(ResourceReader reader) { this.reader = reader; }
+ public String handleNextLine() {
+ try {
+ return reader.readLine();
+ } catch (java.io.IOException e) {}
+ return null;
+ }
+ public void reset() {
+ reader.reset();
+ }
+ }*/
+
+ //----------------------------------------------------------------------
+ // class RuleHalf
+ //----------------------------------------------------------------------
+
+ /**
+ * A class representing one side of a rule. This class knows how to
+ * parse half of a rule. It is tightly coupled to the method
+ * TransliteratorParser.parseRule().
+ */
+ private static class RuleHalf {
+
+ public String text;
+
+ public int cursor = -1; // position of cursor in text
+ public int ante = -1; // position of ante context marker '{' in text
+ public int post = -1; // position of post context marker '}' in text
+
+ // Record the offset to the cursor either to the left or to the
+ // right of the key. This is indicated by characters on the output
+ // side that allow the cursor to be positioned arbitrarily within
+ // the matching text. For example, abc{def} > | @@@ xyz; changes
+ // def to xyz and moves the cursor to before abc. Offset characters
+ // must be at the start or end, and they cannot move the cursor past
+ // the ante- or postcontext text. Placeholders are only valid in
+ // output text. The length of the ante and post context is
+ // determined at runtime, because of supplementals and quantifiers.
+ public int cursorOffset = 0; // only nonzero on output side
+
+ // Position of first CURSOR_OFFSET on _right_. This will be -1
+ // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
+ private int cursorOffsetPos = 0;
+
+ public boolean anchorStart = false;
+ public boolean anchorEnd = false;
+
+ /**
+ * The segment number from 1..n of the next '(' we see
+ * during parsing; 1-based.
+ */
+ private int nextSegmentNumber = 1;
+
+ /**
+ * Parse one side of a rule, stopping at either the limit,
+ * the END_OF_RULE character, or an operator.
+ * @return the index after the terminating character, or
+ * if limit was reached, limit
+ */
+ public int parse(String rule, int pos, int limit,
+ TransliteratorParser parser) {
+ int start = pos;
+ StringBuffer buf = new StringBuffer();
+ pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
+ text = buf.toString();
+
+ if (cursorOffset > 0 && cursor != cursorOffsetPos) {
+ syntaxError("Misplaced " + CURSOR_POS, rule, start);
+ }
+
+ return pos;
+ }
+
+ /**
+ * Parse a section of one side of a rule, stopping at either
+ * the limit, the END_OF_RULE character, an operator, or a
+ * segment close character. This method parses both a
+ * top-level rule half and a segment within such a rule half.
+ * It calls itself recursively to parse segments and nested
+ * segments.
+ * @param rule the rule text being parsed
+ * @param pos index of the first character to parse
+ * @param limit index at which parsing must stop
+ * @param parser the enclosing parser; supplies stand-ins, sets,
+ * variable definitions and the variable-range check
+ * @param buf buffer into which to accumulate the rule pattern
+ * characters, either literal characters from the rule or
+ * standins for UnicodeMatcher objects including segments.
+ * @param illegal the set of special characters illegal during this parse.
+ * @param isSegment if true, then we've already seen a '(' and
+ * pos on entry points right after it; an end-of-half character
+ * before the closing ')' is then reported as "Unclosed segment".
+ * The caller wraps the accumulated text in a matcher or replacer.
+ * @return the index after the terminating character, or
+ * if limit was reached, limit
+ */
+ private int parseSection(String rule, int pos, int limit,
+ TransliteratorParser parser,
+ StringBuffer buf,
+ UnicodeSet illegal,
+ boolean isSegment) {
+ int start = pos; // where this section began; passed to syntaxError
+ ParsePosition pp = null; // created lazily; reused for sets and variable names
+ int quoteStart = -1; // Most recent 'single quoted string'
+ int quoteLimit = -1;
+ int varStart = -1; // Most recent $variableReference
+ int varLimit = -1;
+ int[] iref = new int[1]; // in/out position holder for the Utility parse helpers
+ int bufStart = buf.length(); // buf length on entry; detects a quantifier right after '('
+
+ main:
+ while (pos < limit) {
+ // Since all syntax characters are in the BMP, fetching
+ // 16-bit code units suffices here.
+ char c = rule.charAt(pos++);
+ if (UCharacterProperty.isRuleWhiteSpace(c)) {
+ continue;
+ }
+ // HALF_ENDERS is all chars that end a rule half: "<>=;"
+ if (HALF_ENDERS.indexOf(c) >= 0) {
+ if (isSegment) {
+ syntaxError("Unclosed segment", rule, start);
+ }
+ break main;
+ }
+ if (anchorEnd) {
+ // Text after a presumed end anchor is a syntax error
+ syntaxError("Malformed variable reference", rule, start);
+ }
+ if (UnicodeSet.resemblesPattern(rule, pos-1)) {
+ if (pp == null) {
+ pp = new ParsePosition(0);
+ }
+ pp.setIndex(pos-1); // Backup to opening '['
+ buf.append(parser.parseSet(rule, pp));
+ pos = pp.getIndex();
+ continue;
+ }
+ // Handle escapes
+ if (c == ESCAPE) {
+ if (pos == limit) {
+ syntaxError("Trailing backslash", rule, start);
+ }
+ iref[0] = pos;
+ int escaped = Utility.unescapeAt(rule, iref);
+ pos = iref[0];
+ if (escaped == -1) {
+ syntaxError("Malformed escape", rule, start);
+ }
+ parser.checkVariableRange(escaped, rule, start);
+ UTF16.append(buf, escaped);
+ continue;
+ }
+ // Handle quoted matter
+ if (c == QUOTE) {
+ int iq = rule.indexOf(QUOTE, pos);
+ if (iq == pos) {
+ buf.append(c); // Parse [''] outside quotes as [']
+ ++pos;
+ } else {
+ /* This loop picks up a run of quoted text of the
+ * form 'aaaa' each time through. If this run
+ * hasn't really ended ('aaaa''bbbb') then it keeps
+ * looping, each time adding on a new run. When it
+ * reaches the final quote it breaks.
+ */
+ quoteStart = buf.length();
+ for (;;) {
+ if (iq < 0) {
+ syntaxError("Unterminated quote", rule, start);
+ }
+ buf.append(rule.substring(pos, iq));
+ pos = iq+1;
+ if (pos < limit && rule.charAt(pos) == QUOTE) {
+ // Parse [''] inside quotes as [']
+ iq = rule.indexOf(QUOTE, pos+1);
+ // Continue looping
+ } else {
+ break;
+ }
+ }
+ quoteLimit = buf.length();
+
+ for (iq=quoteStart; iq<quoteLimit; ++iq) { // quoted text is range-checked too; iq is reused as the index
+ parser.checkVariableRange(buf.charAt(iq), rule, start);
+ }
+ }
+ continue;
+ }
+
+ parser.checkVariableRange(c, rule, start);
+
+ if (illegal.contains(c)) {
+ syntaxError("Illegal character '" + c + '\'', rule, start);
+ }
+
+ switch (c) {
+
+ //------------------------------------------------------
+ // Elements allowed within and out of segments
+ //------------------------------------------------------
+ case ANCHOR_START:
+ if (buf.length() == 0 && !anchorStart) {
+ anchorStart = true;
+ } else {
+ syntaxError("Misplaced anchor start",
+ rule, start);
+ }
+ break;
+ case SEGMENT_OPEN:
+ {
+ // bufSegStart is the offset in buf to the first
+ // character of the segment we are parsing.
+ int bufSegStart = buf.length();
+
+ // Record segment number now, since nextSegmentNumber
+ // will be incremented during the call to parseSection
+ // if there are nested segments.
+ int segmentNumber = nextSegmentNumber++; // 1-based
+
+ // Parse the segment
+ pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
+
+ // After parsing a segment, the relevant characters are
+ // in buf, starting at offset bufSegStart. Extract them
+ // into a string matcher, and replace them with a
+ // standin for that matcher.
+ StringMatcher m =
+ new StringMatcher(buf.substring(bufSegStart),
+ segmentNumber, parser.curData);
+
+ // Record and associate object and segment number
+ parser.setSegmentObject(segmentNumber, m);
+ buf.setLength(bufSegStart);
+ buf.append(parser.getSegmentStandin(segmentNumber));
+ }
+ break;
+ case FUNCTION:
+ case ALT_FUNCTION:
+ {
+ iref[0] = pos;
+ TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
+ // The next character MUST be a segment open
+ if (single == null ||
+ !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
+ syntaxError("Invalid function", rule, start);
+ }
+
+ Transliterator t = single.getInstance();
+ if (t == null) {
+ syntaxError("Invalid function ID", rule, start);
+ }
+
+ // bufSegStart is the offset in buf to the first
+ // character of the segment we are parsing.
+ int bufSegStart = buf.length();
+
+ // Parse the segment
+ pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
+
+ // After parsing a segment, the relevant characters are
+ // in buf, starting at offset bufSegStart.
+ FunctionReplacer r =
+ new FunctionReplacer(t,
+ new StringReplacer(buf.substring(bufSegStart), parser.curData));
+
+ // Replace the buffer contents with a stand-in
+ buf.setLength(bufSegStart);
+ buf.append(parser.generateStandInFor(r));
+ }
+ break;
+ case SymbolTable.SYMBOL_REF:
+ // Handle variable references and segment references "$1", "$2", ...
+ {
+ // A variable reference must be followed immediately
+ // by a Unicode identifier start and zero or more
+ // Unicode identifier part characters, or by a digit
+ // 1..9 if it is a segment reference.
+ if (pos == limit) {
+ // A variable ref character at the end acts as
+ // an anchor to the context limit, as in perl.
+ anchorEnd = true;
+ break;
+ }
+ // Segment reference: first digit must be 1..9; parseNumber then reads the whole number
+ c = rule.charAt(pos);
+ int r = UCharacter.digit(c, 10);
+ if (r >= 1 && r <= 9) {
+ iref[0] = pos;
+ r = Utility.parseNumber(rule, iref, 10);
+ if (r < 0) {
+ syntaxError("Undefined segment reference",
+ rule, start);
+ }
+ pos = iref[0];
+ buf.append(parser.getSegmentStandin(r));
+ } else {
+ if (pp == null) { // Lazy create
+ pp = new ParsePosition(0);
+ }
+ pp.setIndex(pos);
+ String name = parser.parseData.
+ parseReference(rule, pp, limit);
+ if (name == null) {
+ // This means the '$' was not followed by a
+ // valid name. Try to interpret it as an
+ // end anchor then. If this also doesn't work
+ // (if we see a following character) then signal
+ // an error.
+ anchorEnd = true;
+ break;
+ }
+ pos = pp.getIndex();
+ // If this is a variable definition statement,
+ // then the LHS variable will be undefined. In
+ // that case appendVariableDef() will append the
+ // special placeholder char variableLimit-1.
+ varStart = buf.length();
+ parser.appendVariableDef(name, buf);
+ varLimit = buf.length();
+ }
+ }
+ break;
+ case DOT:
+ buf.append(parser.getDotStandIn());
+ break;
+ case KLEENE_STAR:
+ case ONE_OR_MORE:
+ case ZERO_OR_ONE:
+ // Quantifiers. We handle single characters, quoted strings,
+ // variable references, and segments.
+ // a+ matches aaa
+ // 'foo'+ matches foofoofoo
+ // $v+ matches xyxyxy if $v == xy
+ // (seg)+ matches segsegseg
+ {
+ if (isSegment && buf.length() == bufStart) {
+ // The */+ immediately follows '('
+ syntaxError("Misplaced quantifier", rule, start);
+ break; // not reached: syntaxError always throws
+ }
+
+ int qstart, qlimit;
+ // The */+ follows an isolated character or quote
+ // or variable reference
+ if (buf.length() == quoteLimit) {
+ // The */+ follows a 'quoted string'
+ qstart = quoteStart;
+ qlimit = quoteLimit;
+ } else if (buf.length() == varLimit) {
+ // The */+ follows a $variableReference
+ qstart = varStart;
+ qlimit = varLimit;
+ } else {
+ // The */+ follows a single character, possibly
+ // a segment standin
+ qstart = buf.length() - 1;
+ qlimit = qstart + 1;
+ }
+
+ UnicodeMatcher m;
+ try {
+ m = new StringMatcher(buf.toString(), qstart, qlimit,
+ 0, parser.curData);
+ } catch (RuntimeException e) {
+ final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
+ final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
+ throw (RuntimeException)
+ new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
+ + postContext)
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//#else
+ .initCause(e)
+//#endif
+ ;
+ }
+ int min = 0;
+ int max = Quantifier.MAX;
+ switch (c) {
+ case ONE_OR_MORE:
+ min = 1;
+ break;
+ case ZERO_OR_ONE:
+ min = 0;
+ max = 1;
+ break;
+ // case KLEENE_STAR:
+ // do nothing -- min, max already set
+ }
+ m = new Quantifier(m, min, max);
+ buf.setLength(qstart); // drop the quantified text; its stand-in replaces it
+ buf.append(parser.generateStandInFor(m));
+ }
+ break;
+
+ //------------------------------------------------------
+ // Elements allowed ONLY WITHIN segments
+ //------------------------------------------------------
+ case SEGMENT_CLOSE:
+ // assert(isSegment);
+ // We're done parsing a segment. (isSegment is not checked here.)
+ break main;
+
+ //------------------------------------------------------
+ // Elements allowed ONLY OUTSIDE segments
+ //------------------------------------------------------
+ case CONTEXT_ANTE:
+ if (ante >= 0) {
+ syntaxError("Multiple ante contexts", rule, start);
+ }
+ ante = buf.length();
+ break;
+ case CONTEXT_POST:
+ if (post >= 0) {
+ syntaxError("Multiple post contexts", rule, start);
+ }
+ post = buf.length();
+ break;
+ case CURSOR_POS:
+ if (cursor >= 0) {
+ syntaxError("Multiple cursors", rule, start);
+ }
+ cursor = buf.length();
+ break;
+ case CURSOR_OFFSET:
+ if (cursorOffset < 0) { // continuing a run written after the cursor: |@@
+ if (buf.length() > 0) {
+ syntaxError("Misplaced " + c, rule, start);
+ }
+ --cursorOffset;
+ } else if (cursorOffset > 0) { // continuing a run written before the cursor: @@|
+ if (buf.length() != cursorOffsetPos || cursor >= 0) {
+ syntaxError("Misplaced " + c, rule, start);
+ }
+ ++cursorOffset;
+ } else { // first offset character seen in this half
+ if (cursor == 0 && buf.length() == 0) {
+ cursorOffset = -1;
+ } else if (cursor < 0) {
+ cursorOffsetPos = buf.length();
+ cursorOffset = 1;
+ } else {
+ syntaxError("Misplaced " + c, rule, start);
+ }
+ }
+ break;
+
+ //------------------------------------------------------
+ // Non-special characters
+ //------------------------------------------------------
+ default:
+ // Disallow unquoted characters other than [0-9A-Za-z]
+ // in the printable ASCII range. These characters are
+ // reserved for possible future use.
+ if (c >= 0x0021 && c <= 0x007E &&
+ !((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z'))) {
+ syntaxError("Unquoted " + c, rule, start);
+ }
+ buf.append(c);
+ break;
+ }
+ }
+ return pos;
+ }
+
+ /**
+ * Remove context: keep only text between ante and post, then clear ante, post and both anchor flags.
+ */
+ void removeContext() {
+ text = text.substring(ante < 0 ? 0 : ante, // NOTE(review): cursor is not shifted when leading context is dropped -- confirm intended
+ post < 0 ? text.length() : post);
+ ante = post = -1;
+ anchorStart = anchorEnd = false;
+ }
+
+ /**
+ * Return true if this half looks like valid output, that is, every code
+ * point of text is accepted by parser.parseData.isReplacer().
+ */
+ public boolean isValidOutput(TransliteratorParser parser) {
+ for (int i=0; i<text.length(); ) {
+ int c = UTF16.charAt(text, i); // read a whole code point, not a single char
+ i += UTF16.getCharCount(c); // advance by 1 or 2 code units accordingly
+ if (!parser.parseData.isReplacer(c)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Return true if this half looks like valid input, that is, every code
+ * point of text is accepted by parser.parseData.isMatcher().
+ */
+ public boolean isValidInput(TransliteratorParser parser) {
+ for (int i=0; i<text.length(); ) {
+ int c = UTF16.charAt(text, i); // read a whole code point, not a single char
+ i += UTF16.getCharCount(c); // advance by 1 or 2 code units accordingly
+ if (!parser.parseData.isMatcher(c)) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+
+ //----------------------------------------------------------------------
+ // PUBLIC methods
+ //----------------------------------------------------------------------
+
+ /**
+ * Constructor. Does nothing itself; the per-parse members are created by parseRules().
+ */
+ public TransliteratorParser() {
+ }
+
+ /**
+ * Parse a set of rules given as one string, for direction dir, by wrapping it in a
+ * one-element RuleArray and calling parseRules(). Results are left in the public data members.
+ */
+ public void parse(String rules, int dir) {
+ parseRules(new RuleArray(new String[] { rules }), dir);
+ }
+
+ /*
+ * Parse a set of rules. After the parse completes, examine the public
+ * data members for results.
+ */
+/* public void parse(ResourceReader rules, int direction) {
+ parseRules(new RuleReader(rules), direction);
+ }*/
+
+ //----------------------------------------------------------------------
+ // PRIVATE methods
+ //----------------------------------------------------------------------
+
+ /**
+ * Parse the rule text supplied line by line by ruleArray for direction dir.
+ * ::ID lines are collected into idBlockVector, rules into
+ * RuleBasedTransliterator.Data objects in dataVector, and a global filter
+ * into compoundFilter. Comments and empty rules are skipped.
+ *
+ * Any previous results are discarded.
+ *
+ * Up to 30 syntax errors are collected, chained as causes, and thrown together.
+ *
+ * @exception IllegalIcuArgumentException if there is a syntax error in the
+ * rules
+ */
+ void parseRules(RuleBody ruleArray, int dir) {
+ boolean parsingIDs = true;
+ int ruleCount = 0;
+
+ dataVector = new Vector();
+ idBlockVector = new Vector();
+ curData = null;
+ direction = dir;
+ compoundFilter = null;
+ variablesVector = new Vector();
+ variableNames = new Hashtable();
+ parseData = new ParseData();
+
+ List errors = new ArrayList();
+ int errorCount = 0;
+
+ ruleArray.reset();
+
+ StringBuffer idBlockResult = new StringBuffer();
+
+ // compoundFilterOffset records the 1-based ordinal (ruleCount) of the
+ // statement that supplied the global filter, or -1 if none was seen.
+ // After parsing it is checked against 1 for FORWARD and against the
+ // final ruleCount for REVERSE, i.e. the filter must be the first
+ // (respectively last) statement.
+ this.compoundFilter = null;
+ int compoundFilterOffset = -1;
+
+ main:
+ for (;;) {
+ String rule = ruleArray.nextLine();
+ if (rule == null) {
+ break;
+ }
+ int pos = 0;
+ int limit = rule.length();
+ while (pos < limit) {
+ char c = rule.charAt(pos++);
+ if (UCharacterProperty.isRuleWhiteSpace(c)) {
+ continue;
+ }
+ // Skip lines starting with the comment character
+ if (c == RULE_COMMENT_CHAR) {
+ pos = rule.indexOf("\n", pos) + 1;
+ if (pos == 0) {
+ break; // No "\n" found; rest of rule is a comment
+ }
+ continue; // Either fall out or restart with next line
+ }
+
+ // skip empty rules
+ if (c == END_OF_RULE)
+ continue;
+
+ // Often a rule file contains multiple errors. It's
+ // convenient to the rule author if these are all reported
+ // at once. We keep parsing rules even after a failure, up
+ // to a specified limit, and report all errors at once.
+ try {
+ ++ruleCount;
+
+ // We've found the start of a rule or ID. c is its first
+ // character, and pos points past c.
+ --pos;
+ // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
+ // chars left.
+ if ((pos + ID_TOKEN_LEN + 1) <= limit &&
+ rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
+ pos += ID_TOKEN_LEN;
+ // Skip white space after the token. Test the bound before
+ // reading so trailing white space cannot index past limit.
+ while (pos < limit && UCharacterProperty.isRuleWhiteSpace(rule.charAt(pos))) {
+ ++pos;
+ }
+ int[] p = new int[] { pos };
+
+ if (!parsingIDs) {
+ if (curData != null) {
+ if (direction == Transliterator.FORWARD)
+ dataVector.add(curData);
+ else
+ dataVector.insertElementAt(curData, 0);
+ curData = null;
+ }
+ parsingIDs = true;
+ }
+
+ TransliteratorIDParser.SingleID id =
+ TransliteratorIDParser.parseSingleID(
+ rule, p, direction);
+ if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
+ // Successful ::ID parse.
+
+ if (direction == Transliterator.FORWARD) {
+ idBlockResult.append(id.canonID).append(END_OF_RULE);
+ } else {
+ idBlockResult.insert(0, id.canonID + END_OF_RULE);
+ }
+
+ } else {
+ // Couldn't parse an ID. Try to parse a global filter
+ int[] withParens = new int[] { -1 };
+ UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
+ if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
+ if ((direction == Transliterator.FORWARD) ==
+ (withParens[0] == 0)) {
+ if (compoundFilter != null) {
+ // Multiple compound filters
+ syntaxError("Multiple global filters", rule, pos);
+ }
+ compoundFilter = f;
+ compoundFilterOffset = ruleCount;
+ }
+ } else {
+ // Invalid ::id
+ // Can be parsed as neither an ID nor a global filter
+ syntaxError("Invalid ::ID", rule, pos);
+ }
+ }
+
+ pos = p[0];
+ } else {
+ if (parsingIDs) {
+ if (direction == Transliterator.FORWARD)
+ idBlockVector.add(idBlockResult.toString());
+ else
+ idBlockVector.insertElementAt(idBlockResult.toString(), 0);
+ idBlockResult.delete(0, idBlockResult.length());
+ parsingIDs = false;
+ curData = new RuleBasedTransliterator.Data();
+
+ // By default, rules use part of the private use area
+ // E000..F8FF for variables and other stand-ins. Currently
+ // the range F000..F8FF is typically sufficient. The 'use
+ // variable range' pragma allows rule sets to modify this.
+ setVariableRange(0xF000, 0xF8FF);
+ }
+
+ if (resemblesPragma(rule, pos, limit)) {
+ int ppp = parsePragma(rule, pos, limit);
+ if (ppp < 0) {
+ syntaxError("Unrecognized pragma", rule, pos);
+ }
+ pos = ppp;
+ // Parse a rule
+ } else {
+ pos = parseRule(rule, pos, limit);
+ }
+ }
+ } catch (IllegalArgumentException e) {
+ if (errorCount == 30) {
+ errors.add(new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched")
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//#else
+ .initCause(e)
+//#endif
+ );
+ break main;
+ }
+ e.fillInStackTrace();
+ errors.add(e);
+ ++errorCount;
+ pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
+ }
+ }
+ }
+ if (parsingIDs && idBlockResult.length() > 0) {
+ if (direction == Transliterator.FORWARD)
+ idBlockVector.add(idBlockResult.toString());
+ else
+ idBlockVector.insertElementAt(idBlockResult.toString(), 0);
+ }
+ else if (!parsingIDs && curData != null) {
+ if (direction == Transliterator.FORWARD)
+ dataVector.add(curData);
+ else
+ dataVector.insertElementAt(curData, 0);
+ }
+
+ // Give every Data object its own copy of the stand-in objects and variable names
+ for (int i = 0; i < dataVector.size(); i++) {
+ RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);
+ data.variables = new Object[variablesVector.size()];
+ variablesVector.copyInto(data.variables);
+ data.variableNames = new Hashtable();
+ data.variableNames.putAll(variableNames);
+ }
+ variablesVector = null;
+
+ // Do more syntax checking and index the rules
+ try {
+ if (compoundFilter != null) {
+ if ((direction == Transliterator.FORWARD &&
+ compoundFilterOffset != 1) ||
+ (direction == Transliterator.REVERSE &&
+ compoundFilterOffset != ruleCount)) {
+ throw new IllegalIcuArgumentException("Compound filters misplaced");
+ }
+ }
+
+ for (int i = 0; i < dataVector.size(); i++) {
+ RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);
+ data.ruleSet.freeze();
+ }
+
+ if (idBlockVector.size() == 1 && ((String)idBlockVector.get(0)).length() == 0)
+ idBlockVector.remove(0); // drop a lone empty ID block
+
+ } catch (IllegalArgumentException e) {
+ e.fillInStackTrace();
+ errors.add(e);
+ }
+
+ if (errors.size() != 0) {
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//#else
+ for (int i = errors.size()-1; i > 0; --i) {
+ RuntimeException previous = (RuntimeException) errors.get(i-1);
+ while (previous.getCause() != null) {
+ previous = (RuntimeException) previous.getCause(); // chain specially
+ }
+ previous.initCause((RuntimeException) errors.get(i));
+ }
+//#endif
+ throw (RuntimeException) errors.get(0);
+ // if initCause not supported: throw new IllegalArgumentException(errors.toString());
+ }
+ }
+
+ /**
+ * MAIN PARSER. Parse the next rule in the given rule string, starting
+ * at pos. Return the index after the last character parsed. Do not
+ * parse characters at or after limit.
+ *
+ * Important: The character at pos must be a non-whitespace character
+ * that is not the comment character.
+ *
+ * Each side is parsed by a RuleHalf, which handles quoting, escaping,
+ * whitespace, context and cursor indicators. A variable definition is
+ * stored in variableNames; any other rule that applies to the current
+ * direction is added to curData.ruleSet as a TransliterationRule.
+ *
+ * This method is tightly coupled to the inner class RuleHalf.
+ */
+ private int parseRule(String rule, int pos, int limit) {
+ // Locate the left side, operator, and right side
+ int start = pos;
+ char operator = 0;
+
+ // Set up segments data
+ segmentStandins = new StringBuffer();
+ segmentObjects = new Vector();
+
+ RuleHalf left = new RuleHalf();
+ RuleHalf right = new RuleHalf();
+
+ undefinedVariableName = null; // reset; appendVariableDef() records at most one undefined name
+ pos = left.parse(rule, pos, limit, this);
+
+ if (pos == limit ||
+ OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) { // back up to the character that ended the left half
+ syntaxError("No operator pos=" + pos, rule, start);
+ }
+ ++pos;
+
+ // Found an operator char. Check for forward-reverse operator.
+ if (operator == REVERSE_RULE_OP &&
+ (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
+ ++pos;
+ operator = FWDREV_RULE_OP;
+ }
+
+ // Translate alternate op characters.
+ switch (operator) {
+ case ALT_FORWARD_RULE_OP:
+ operator = FORWARD_RULE_OP;
+ break;
+ case ALT_REVERSE_RULE_OP:
+ operator = REVERSE_RULE_OP;
+ break;
+ case ALT_FWDREV_RULE_OP:
+ operator = FWDREV_RULE_OP;
+ break;
+ }
+
+ pos = right.parse(rule, pos, limit, this);
+
+ if (pos < limit) {
+ if (rule.charAt(--pos) == END_OF_RULE) {
+ ++pos;
+ } else {
+ // RuleHalf parser must have terminated at an operator
+ syntaxError("Unquoted operator", rule, start);
+ }
+ }
+
+ if (operator == VARIABLE_DEF_OP) {
+ // LHS is the name. RHS is the value: whatever text the right
+ // half parsed to (literals and/or stand-ins), possibly empty.
+ // It is stored under the name and substituted by
+ // appendVariableDef() wherever the variable is referenced.
+
+ // We expect to see a single undefined variable (the one being
+ // defined).
+ if (undefinedVariableName == null) {
+ syntaxError("Missing '$' or duplicate definition", rule, start);
+ }
+ if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { // LHS must be exactly the placeholder from appendVariableDef()
+ syntaxError("Malformed LHS", rule, start);
+ }
+ if (left.anchorStart || left.anchorEnd ||
+ right.anchorStart || right.anchorEnd) {
+ syntaxError("Malformed variable def", rule, start);
+ }
+ // We allow anything on the right, including an empty string.
+ int n = right.text.length();
+ char[] value = new char[n];
+ right.text.getChars(0, n, value, 0);
+ variableNames.put(undefinedVariableName, value);
+
+ ++variableLimit; // give back the placeholder slot taken by appendVariableDef()
+ return pos;
+ }
+
+ // If this is not a variable definition rule, we shouldn't have
+ // any undefined variable names.
+ if (undefinedVariableName != null) {
+ syntaxError("Undefined variable $" + undefinedVariableName,
+ rule, start);
+ }
+
+ // Verify segments
+ if (segmentStandins.length() > segmentObjects.size()) { // a $n reference exceeds the number of segments defined
+ syntaxError("Undefined segment reference", rule, start);
+ }
+ for (int i=0; i<segmentStandins.length(); ++i) {
+ if (segmentStandins.charAt(i) == 0) {
+ syntaxError("Internal error", rule, start); // will never happen
+ }
+ }
+ for (int i=0; i<segmentObjects.size(); ++i) {
+ if (segmentObjects.elementAt(i) == null) {
+ syntaxError("Internal error", rule, start); // will never happen
+ }
+ }
+
+ // If the direction we want doesn't match the rule
+ // direction, do nothing.
+ if (operator != FWDREV_RULE_OP &&
+ ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
+ return pos;
+ }
+
+ // Transform the rule into a forward rule by swapping the
+ // sides if necessary.
+ if (direction == Transliterator.REVERSE) {
+ RuleHalf temp = left;
+ left = right;
+ right = temp;
+ }
+
+ // Remove non-applicable elements in forward-reverse
+ // rules. Bidirectional rules ignore elements that do not
+ // apply.
+ if (operator == FWDREV_RULE_OP) {
+ right.removeContext();
+ left.cursor = -1;
+ left.cursorOffset = 0;
+ }
+
+ // Normalize context
+ if (left.ante < 0) {
+ left.ante = 0;
+ }
+ if (left.post < 0) {
+ left.post = left.text.length();
+ }
+
+ // Context is only allowed on the input side. Cursors are only
+ // allowed on the output side. Segment delimiters can only appear
+ // on the left, and references on the right. Cursor offset
+ // cannot appear without an explicit cursor. Cursor offset
+ // cannot place the cursor outside the limits of the context.
+ // Anchors are only allowed on the input side.
+ if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
+ (right.cursorOffset != 0 && right.cursor < 0) ||
+ // - The following two checks were used to ensure that the
+ // - the cursor offset stayed within the ante- or postcontext.
+ // - However, with the addition of quantifiers, we have to
+ // - allow arbitrary cursor offsets and do runtime checking.
+ //(right.cursorOffset > (left.text.length() - left.post)) ||
+ //(-right.cursorOffset > left.ante) ||
+ right.anchorStart || right.anchorEnd ||
+ !left.isValidInput(this) || !right.isValidOutput(this) ||
+ left.ante > left.post) {
+ syntaxError("Malformed rule", rule, start);
+ }
+
+ // Flatten segment objects vector to an array
+ UnicodeMatcher[] segmentsArray = null;
+ if (segmentObjects.size() > 0) {
+ segmentsArray = new UnicodeMatcher[segmentObjects.size()];
+ segmentObjects.toArray(segmentsArray);
+ }
+
+ curData.ruleSet.addRule(new TransliterationRule(
+ left.text, left.ante, left.post,
+ right.text, right.cursor, right.cursorOffset,
+ segmentsArray,
+ left.anchorStart, left.anchorEnd,
+ curData));
+
+ return pos;
+ }
+
+ /**
+ * Set the variable range to [start, end] (inclusive); throws IllegalIcuArgumentException if it is not within 0..0xFFFF or start > end.
+ */
+ private void setVariableRange(int start, int end) {
+ if (start > end || start < 0 || end > 0xFFFF) {
+ throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
+ }
+
+ curData.variablesBase = (char) start; // first private use
+
+ if (dataVector.size() == 0) { // allocation state is reset only while no Data object has been stored yet
+ variableNext = (char) start;
+ variableLimit = (char) (end + 1); // NOTE(review): end == 0xFFFF passes the check above but this cast wraps to 0
+ }
+ }
+
+ /**
+ * Assert that the given character is NOT within the variable range
+ * [curData.variablesBase, variableLimit). If it is, signal an error. This is
+ * necessary to ensure that the variable range does not overlap characters used in a rule.
+ */
+ private void checkVariableRange(int ch, String rule, int start) {
+ if (ch >= curData.variablesBase && ch < variableLimit) {
+ syntaxError("Variable range character in rule", rule, start);
+ }
+ }
+
+ // (The following method is part of an unimplemented feature.
+ // Remove this clover pragma after the feature is implemented.
+ // 2003-06-11 ICU 2.6 Alan)
+ ///CLOVER:OFF
+ /**
+ * Set the maximum backup to 'backup', in response to a pragma
+ * statement. Not implemented: always throws IllegalIcuArgumentException.
+ */
+ private void pragmaMaximumBackup(int backup) {
+ //TODO Finish
+ throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet");
+ }
+ ///CLOVER:ON
+
+ // (The following method is part of an unimplemented feature.
+ // Remove this clover pragma after the feature is implemented.
+ // 2003-06-11 ICU 2.6 Alan)
+ ///CLOVER:OFF
+ /**
+ * Begin normalizing all rules using the given mode, in response
+ * to a pragma statement. Not implemented: always throws IllegalIcuArgumentException.
+ */
+ private void pragmaNormalizeRules(Normalizer.Mode mode) {
+ //TODO Finish
+ throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet");
+ }
+ ///CLOVER:ON
+
+ /**
+ * Return true if the given rule looks like a pragma, i.e. Utility.parsePattern()
+ * matches the pattern "use " at pos.
+ * @param pos offset to the first non-whitespace character of the rule.
+ * @param limit pointer past the last character of the rule.
+ */
+ static boolean resemblesPragma(String rule, int pos, int limit) {
+ // Must start with /use\s/i
+ return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
+ }
+
+ /**
+ * Parse a pragma and act on it. This method assumes resemblesPragma()
+ * has already returned true.
+ * @param pos offset to the first non-whitespace character
+ * of the rule.
+ * @param limit pointer past the last character of the rule.
+ * @return the position index after the final ';' of the pragma,
+ * or -1 on failure.
+ */
+ private int parsePragma(String rule, int pos, int limit) {
+ int[] array = new int[2]; // filled by parsePattern; read below as the parsed numbers
+
+ // resemblesPragma() has already returned true, so we
+ // know that pos points to /use\s/i; we can skip 4 characters
+ // immediately
+ pos += 4;
+
+ // Here are the pragmas we recognize:
+ // use variable range 0xE000 0xEFFF;
+ // use maximum backup 16;
+ // use nfd rules; use nfc rules;
+ int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
+ if (p >= 0) {
+ setVariableRange(array[0], array[1]);
+ return p;
+ }
+
+ p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
+ if (p >= 0) {
+ pragmaMaximumBackup(array[0]); // currently always throws (not implemented)
+ return p;
+ }
+
+ p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
+ if (p >= 0) {
+ pragmaNormalizeRules(Normalizer.NFD); // currently always throws (not implemented)
+ return p;
+ }
+
+ p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
+ if (p >= 0) {
+ pragmaNormalizeRules(Normalizer.NFC); // currently always throws (not implemented)
+ return p;
+ }
+
+ // Syntax error: unable to parse pragma
+ return -1;
+ }
+
+ /**
+ * Throw an IllegalIcuArgumentException for a syntax error. The message is
+ * msg followed by the escaped rule text from start up to the probable end
+ * of the rule as located by ruleEnd(); if no end-of-rule marker is found,
+ * the text runs to the end of the rule string. This method never returns.
+ * @param msg error description
+ * @param rule pattern string
+ * @param start position of first character of current rule
+ */
+ static final void syntaxError(String msg, String rule, int start) {
+ int end = ruleEnd(rule, start, rule.length());
+ throw new IllegalIcuArgumentException(msg + " in \"" +
+ Utility.escape(rule.substring(start, end)) + '"');
+ }
+
+ static final int ruleEnd(String rule, int start, int limit) { // index of the ";" ending the rule at start, or limit
+ int end = Utility.quotedIndexOf(rule, start, limit, ";"); // presumably skips quoted text -- confirm in Utility
+ if (end < 0) {
+ end = limit; // no terminator found: the rule runs to limit
+ }
+ return end;
+ }
+
+ /**
+ * Parse a UnicodeSet out of rule at pos, store it, and return the stand-in
+ * character used to represent it.
+ */
+ private final char parseSet(String rule, ParsePosition pos) {
+ UnicodeSet set = new UnicodeSet(rule, pos, parseData);
+ if (variableNext >= variableLimit) { // generateStandInFor() repeats this check with a different message
+ throw new RuntimeException("Private use variables exhausted");
+ }
+ set.compact();
+ return generateStandInFor(set);
+ }
+
+ /**
+ * Return the stand-in character for a UnicodeMatcher or UnicodeReplacer: the one already
+ * assigned if this same object is in variablesVector, else a new one (the object is stored).
+ */
+ char generateStandInFor(Object obj) {
+ // assert(obj != null);
+
+ // Look up previous stand-in, if any. This is a short list
+ // (typical n is 0, 1, or 2); linear search is optimal.
+ for (int i=0; i<variablesVector.size(); ++i) {
+ if (variablesVector.elementAt(i) == obj) { // [sic] pointer comparison
+ return (char) (curData.variablesBase + i);
+ }
+ }
+
+ if (variableNext >= variableLimit) { // no stand-in characters left in the variable range
+ throw new RuntimeException("Variable range exhausted");
+ }
+ variablesVector.addElement(obj);
+ return variableNext++;
+ }
+
+ /**
+ * Return the standin for segment seg (1-based), allocating one on first request.
+ */
+ public char getSegmentStandin(int seg) {
+ if (segmentStandins.length() < seg) {
+ segmentStandins.setLength(seg); // new slots are filled with char 0, meaning "not assigned"
+ }
+ char c = segmentStandins.charAt(seg-1);
+ if (c == 0) {
+ if (variableNext >= variableLimit) {
+ throw new RuntimeException("Variable range exhausted");
+ }
+ c = variableNext++;
+ // Set a placeholder in the master variables vector that will be
+ // filled in later by setSegmentObject(). We know that we will get
+ // called first because setSegmentObject() will call us.
+ variablesVector.addElement(null);
+ segmentStandins.setCharAt(seg-1, c);
+ }
+ return c;
+ }
+
+ /**
+ * Set the object for segment seg (1-based), in both segmentObjects and variablesVector.
+ */
+ public void setSegmentObject(int seg, StringMatcher obj) {
+ // Since we call parseSection() recursively, nested
+ // segments will result in segment i+1 getting parsed
+ // and stored before segment i; be careful with the
+ // vector handling here.
+ if (segmentObjects.size() < seg) {
+ segmentObjects.setSize(seg);
+ }
+ int index = getSegmentStandin(seg) - curData.variablesBase; // slot of this stand-in in variablesVector
+ if (segmentObjects.elementAt(seg-1) != null ||
+ variablesVector.elementAt(index) != null) {
+ throw new RuntimeException(); // should never happen
+ }
+ segmentObjects.setElementAt(obj, seg-1);
+ variablesVector.setElementAt(obj, index);
+ }
+
+ /**
+ * Return the stand-in for the dot set (a UnicodeSet built from DOT_SET). It is
+ * allocated the first time and reused thereafter.
+ */
+ char getDotStandIn() {
+ if (dotStandIn == -1) { // -1 is treated as "not allocated yet"
+ dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
+ }
+ return (char) dotStandIn;
+ }
+
+ /**
+ * Append the value of the given variable name to the given StringBuffer. The first
+ * undefined name seen is remembered and a placeholder is appended instead.
+ * @exception IllegalIcuArgumentException if a second undefined name is seen.
+ */
+ private void appendVariableDef(String name, StringBuffer buf) {
+ char[] ch = (char[]) variableNames.get(name);
+ if (ch == null) {
+ // We allow one undefined variable so that variable definition
+ // statements work. For the first undefined variable we return
+ // the special placeholder variableLimit-1, and save the variable
+ // name.
+ if (undefinedVariableName == null) {
+ undefinedVariableName = name;
+ if (variableNext >= variableLimit) {
+ throw new RuntimeException("Private use variables exhausted");
+ }
+ buf.append((char) --variableLimit); // shrinks the range by one; parseRule() restores it for a definition
+ } else {
+ throw new IllegalIcuArgumentException("Undefined variable $"
+ + name);
+ }
+ } else {
+ buf.append(ch);
+ }
+ }
+}
+
+//eof