]> gitweb.fperrin.net Git - Dictionary.git/blobdiff - jars/icu4j-4_2_1-src/src/com/ibm/icu/impl/PatternTokenizer.java
go
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / impl / PatternTokenizer.java
old mode 100755 (executable)
new mode 100644 (file)
index 419e67b..e670bdc
-//##header\r
-/*\r
- *******************************************************************************\r
- * Copyright (C) 2006-2009, Google, International Business Machines Corporation *\r
- * and others. All Rights Reserved.                                            *\r
- *******************************************************************************\r
- */\r
-package com.ibm.icu.impl;\r
-\r
-import com.ibm.icu.text.UTF16;\r
-import com.ibm.icu.text.UnicodeSet;\r
-\r
-/**\r
- * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.\r
- * The '' (two quotes) is treated as a single quote, inside or outside a quote\r
- * <ul>\r
- * <li>Any ignorable characters are ignored in parsing.</li>\r
- * <li>Any syntax characters are broken into separate tokens</li>\r
- * <li>Quote characters can be specified: '...', "...", and \x </li>\r
- * <li>Other characters are treated as literals</li>\r
- * </ul>\r
- */\r
-public class PatternTokenizer {\r
-    // settings used in the interpretation of the pattern\r
-    private UnicodeSet ignorableCharacters = new UnicodeSet();\r
-    private UnicodeSet syntaxCharacters = new UnicodeSet();\r
-    private UnicodeSet extraQuotingCharacters = new UnicodeSet();\r
-    private UnicodeSet escapeCharacters = new UnicodeSet();\r
-    private boolean usingSlash = false;\r
-    private boolean usingQuote = false;\r
-    \r
-    // transient data, set when needed. Null it out for any changes in the above fields.\r
-    private transient UnicodeSet needingQuoteCharacters = null;\r
-    \r
-    // data about the current pattern being parsed. start gets moved as we go along.\r
-    private int start;\r
-    private int limit;\r
-    private String pattern;\r
-    \r
-    public UnicodeSet getIgnorableCharacters() {\r
-        return (UnicodeSet) ignorableCharacters.clone();\r
-    }\r
-    /**\r
-     * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");\r
-     * @param ignorableCharacters\r
-     * @return\r
-     */\r
-    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {\r
-        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();\r
-        needingQuoteCharacters = null;\r
-        return this;\r
-    }\r
-    public UnicodeSet getSyntaxCharacters() {\r
-        return (UnicodeSet) syntaxCharacters.clone();\r
-    }\r
-    public UnicodeSet getExtraQuotingCharacters() {\r
-        return (UnicodeSet) extraQuotingCharacters.clone();\r
-    }\r
-    /**\r
-     *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")\r
-     * @param syntaxCharacters\r
-     * @return\r
-     */\r
-    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {\r
-        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();\r
-        needingQuoteCharacters = null;\r
-        return this;\r
-    }   \r
-    /**\r
-     *  Sets the extra characters to be quoted in literals\r
-     * @param syntaxCharacters\r
-     * @return\r
-     */\r
-    public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {\r
-        this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();\r
-        needingQuoteCharacters = null;\r
-        return this;\r
-    }   \r
-    \r
-    public UnicodeSet getEscapeCharacters() {\r
-        return (UnicodeSet) escapeCharacters.clone();\r
-    }\r
-    /**\r
-     * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");\r
-     * @param escapeCharacters\r
-     * @return\r
-     */\r
-    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {\r
-        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();\r
-        return this;\r
-    }\r
-    public boolean isUsingQuote() {\r
-        return usingQuote;\r
-    }\r
-    public PatternTokenizer setUsingQuote(boolean usingQuote) {\r
-        this.usingQuote = usingQuote;\r
-        needingQuoteCharacters = null;\r
-        return this;\r
-    }\r
-    public boolean isUsingSlash() {\r
-        return usingSlash;\r
-    }\r
-    public PatternTokenizer setUsingSlash(boolean usingSlash) {\r
-        this.usingSlash = usingSlash;\r
-        needingQuoteCharacters = null;\r
-        return this;\r
-    }\r
-    //    public UnicodeSet getQuoteCharacters() {\r
-//  return (UnicodeSet) quoteCharacters.clone();\r
-//  }\r
-//  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {\r
-//  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();\r
-//  needingQuoteCharacters = null;\r
-//  return this;\r
-//  }\r
-    public int getLimit() {\r
-        return limit;\r
-    }\r
-    public PatternTokenizer setLimit(int limit) {\r
-        this.limit = limit;\r
-        return this;\r
-    }\r
-    public int getStart() {\r
-        return start;\r
-    }\r
-    public PatternTokenizer setStart(int start) {\r
-        this.start = start;\r
-        return this;\r
-    }\r
-\r
-//#if defined(FOUNDATION10) || defined(J2SE13)\r
-//##    public PatternTokenizer setPattern(StringBuffer pattern) {\r
-//##        return setPattern(pattern.toString());\r
-//##    }\r
-//#else \r
-    public PatternTokenizer setPattern(CharSequence pattern) {\r
-        return setPattern(pattern.toString());\r
-    }\r
-//#endif\r
-\r
-    public PatternTokenizer setPattern(String pattern) {\r
-        if (pattern == null) {\r
-            throw new IllegalArgumentException("Inconsistent arguments");\r
-        }\r
-        this.start = 0;\r
-        this.limit = pattern.length();\r
-        this.pattern = pattern;\r
-        return this;\r
-    }\r
-\r
-    public static final char SINGLE_QUOTE = '\'';\r
-    public static final char BACK_SLASH = '\\';\r
-    private static int NO_QUOTE = -1, IN_QUOTE = -2;\r
-\r
-//#if defined(FOUNDATION10) || defined(J2SE13)\r
-//##    public String quoteLiteral(StringBuffer string) {\r
-//##        return quoteLiteral(string.toString());\r
-//##    }\r
-//#else\r
-    public String quoteLiteral(CharSequence string) {\r
-        return quoteLiteral(string.toString());\r
-    }\r
-//#endif\r
-\r
-    /**\r
-     * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.\r
-     * @param string\r
-     * @return\r
-     */\r
-    public String quoteLiteral(String string) {\r
-        if (needingQuoteCharacters == null) {\r
-            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)\r
-            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);\r
-            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);\r
-        }\r
-        StringBuffer result = new StringBuffer();\r
-        int quotedChar = NO_QUOTE;\r
-        int cp;\r
-        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {\r
-            cp = UTF16.charAt(string, i);\r
-            if (escapeCharacters.contains(cp)) {\r
-                // we may have to fix up previous characters\r
-                if (quotedChar == IN_QUOTE) {\r
-                    result.append(SINGLE_QUOTE);\r
-                    quotedChar = NO_QUOTE;\r
-                }\r
-                appendEscaped(result, cp);\r
-                continue;\r
-            }\r
-            \r
-            if (needingQuoteCharacters.contains(cp)) {\r
-                // if we have already started a quote\r
-                if (quotedChar == IN_QUOTE) {\r
-                    UTF16.append(result, cp);\r
-                    if (usingQuote && cp == SINGLE_QUOTE) { // double it\r
-                        result.append(SINGLE_QUOTE);\r
-                    }\r
-                    continue;\r
-                }\r
-                // otherwise not already in quote\r
-                if (usingSlash) {\r
-                    result.append(BACK_SLASH);\r
-                    UTF16.append(result, cp);\r
-                    continue;\r
-                }\r
-                if (usingQuote) {\r
-                    if (cp == SINGLE_QUOTE) { // double it and continue\r
-                        result.append(SINGLE_QUOTE);\r
-                        result.append(SINGLE_QUOTE);\r
-                        continue;\r
-                    }\r
-                    result.append(SINGLE_QUOTE);\r
-                    UTF16.append(result, cp);\r
-                    quotedChar = IN_QUOTE;\r
-                    continue;\r
-                }\r
-                // we have no choice but to use \\u or \\U\r
-                appendEscaped(result, cp);\r
-                continue;\r
-            }\r
-            // otherwise cp doesn't need quoting\r
-            // we may have to fix up previous characters\r
-            if (quotedChar == IN_QUOTE) {\r
-                result.append(SINGLE_QUOTE);\r
-                quotedChar = NO_QUOTE;\r
-            }\r
-            UTF16.append(result, cp);\r
-        }\r
-        // all done. \r
-        // we may have to fix up previous characters\r
-        if (quotedChar == IN_QUOTE) {\r
-            result.append(SINGLE_QUOTE);\r
-        }\r
-        return result.toString();\r
-    }\r
-\r
-    private void appendEscaped(StringBuffer result, int cp) {\r
-        if (cp <= 0xFFFF) {\r
-            result.append("\\u").append(Utility.hex(cp,4));\r
-        } else {\r
-            result.append("\\U").append(Utility.hex(cp,8));\r
-        }\r
-    }\r
-    \r
-    public String normalize() {\r
-        int oldStart = start;\r
-        StringBuffer result = new StringBuffer();\r
-        StringBuffer buffer = new StringBuffer();\r
-        while (true) {\r
-            buffer.setLength(0);\r
-            int status = next(buffer);\r
-            if (status == DONE) {\r
-                start = oldStart;\r
-                return result.toString();\r
-            }\r
-            if (status != SYNTAX) {\r
-                result.append(quoteLiteral(buffer));\r
-            } else {\r
-                result.append(buffer);\r
-            }\r
-        }\r
-    }\r
-    \r
-    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;\r
-    \r
-    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;\r
-    \r
-    public int next(StringBuffer buffer) {\r
-        if (start >= limit) return DONE;\r
-        int status = UNKNOWN;\r
-        int lastQuote = UNKNOWN;\r
-        int quoteStatus = NONE;\r
-        int hexCount = 0;\r
-        int hexValue = 0;\r
-        int cp;\r
-        main:\r
-            for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {\r
-                cp = UTF16.charAt(pattern, i);\r
-                // if we are in a quote, then handle it.\r
-                switch (quoteStatus) {\r
-                case SLASH_START:\r
-                    switch (cp) {\r
-                    case 'u':\r
-                        quoteStatus = HEX;\r
-                        hexCount = 4;\r
-                        hexValue = 0;\r
-                        continue main;\r
-                    case 'U': \r
-                        quoteStatus = HEX;\r
-                        hexCount = 8;\r
-                        hexValue = 0;\r
-                        continue main;\r
-                    default:\r
-                        if (usingSlash) {\r
-                            UTF16.append(buffer, cp);\r
-                            quoteStatus = NONE;\r
-                            continue main;\r
-                        } else {\r
-                            buffer.append(BACK_SLASH);\r
-                            quoteStatus = NONE;\r
-                        }\r
-                    }\r
-                    break; // fall through to NONE\r
-                case HEX:\r
-                    hexValue <<= 4;\r
-                    hexValue += cp;\r
-                    switch (cp) {\r
-                    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':\r
-                        hexValue -= '0'; break;\r
-                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':\r
-                        hexValue -= 'a' - 10; break;\r
-                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':\r
-                        hexValue -= 'A' - 10; break;\r
-                    default:\r
-                        start = i;\r
-                    return BROKEN_ESCAPE;\r
-                    }\r
-                    --hexCount;\r
-                    if (hexCount == 0) {\r
-                        quoteStatus = NONE;\r
-                        UTF16.append(buffer, hexValue);\r
-                    }\r
-                    continue main;\r
-                case AFTER_QUOTE:\r
-                    // see if we get another quote character\r
-                    // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote\r
-                    if (cp == lastQuote) {\r
-                        UTF16.append(buffer, cp);\r
-                        quoteStatus = NORMAL_QUOTE;\r
-                        continue main;\r
-                    }\r
-                    quoteStatus = NONE;\r
-                    break; // fall through to NONE\r
-                case START_QUOTE:\r
-                    // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote\r
-                    if (cp == lastQuote) {\r
-                        UTF16.append(buffer, cp);\r
-                        quoteStatus = NONE; // get out of quote, with no trace remaining\r
-                        continue;                            \r
-                    }\r
-                    // otherwise get into quote\r
-                    UTF16.append(buffer, cp);\r
-                    quoteStatus = NORMAL_QUOTE;\r
-                    continue main;\r
-                case NORMAL_QUOTE: \r
-                    if (cp == lastQuote) {\r
-                        quoteStatus = AFTER_QUOTE; // get out of quote\r
-                        continue main;\r
-                    }\r
-                    UTF16.append(buffer, cp);\r
-                    continue main;\r
-                }\r
-                \r
-                if (ignorableCharacters.contains(cp)) {\r
-                    continue;\r
-                }\r
-                // do syntax characters\r
-                if (syntaxCharacters.contains(cp)) {\r
-                    if (status == UNKNOWN) {\r
-                        UTF16.append(buffer, cp);\r
-                        start = i + UTF16.getCharCount(cp);\r
-                        return SYNTAX;\r
-                    } else { // LITERAL, so back up and break\r
-                        start = i;\r
-                        return status;\r
-                    }\r
-                }\r
-                // otherwise it is a literal; keep on going\r
-                status = LITERAL;\r
-                if (cp == BACK_SLASH) {\r
-                    quoteStatus = SLASH_START;\r
-                    continue;\r
-                } else if (usingQuote && cp == SINGLE_QUOTE) {\r
-                    lastQuote = cp;\r
-                    quoteStatus = START_QUOTE;\r
-                    continue;\r
-                }\r
-                // normal literals\r
-                UTF16.append(buffer, cp);\r
-            }\r
-        // handle final cleanup\r
-        start = limit;\r
-        switch (quoteStatus) {\r
-        case HEX:\r
-            status = BROKEN_ESCAPE;\r
-            break;\r
-        case SLASH_START:\r
-            if (usingSlash) {\r
-                status = BROKEN_ESCAPE;\r
-            } else {\r
-                buffer.append(BACK_SLASH);\r
-            }\r
-            break;\r
-        case START_QUOTE: case NORMAL_QUOTE:\r
-            status = BROKEN_QUOTE;\r
-            break;\r
-        }\r
-        return status;\r
-    }\r
-    \r
-    \r
-}\r
-//eof\r
+//##header J2SE15
+/*
+ *******************************************************************************
+ * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
+ * and others. All Rights Reserved.                                            *
+ *******************************************************************************
+ */
+package com.ibm.icu.impl;
+
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
+ * The '' (two quotes) is treated as a single quote, inside or outside a quote.
+ * <ul>
+ * <li>Any ignorable characters are ignored in parsing.</li>
+ * <li>Any syntax characters are broken into separate tokens</li>
+ * <li>Quoting can be enabled: '...' when usingQuote is set and \x when usingSlash is set; note that "..." is not recognized as a quotation by this implementation</li>
+ * <li>Other characters are treated as literals</li>
+ * </ul>
+ */
+public class PatternTokenizer {
+    // settings used in the interpretation of the pattern
+    private UnicodeSet ignorableCharacters = new UnicodeSet();
+    private UnicodeSet syntaxCharacters = new UnicodeSet();
+    private UnicodeSet extraQuotingCharacters = new UnicodeSet();
+    private UnicodeSet escapeCharacters = new UnicodeSet();
+    private boolean usingSlash = false;
+    private boolean usingQuote = false;
+    
+    // transient data, set when needed. Null it out for any changes in the above fields.
+    private transient UnicodeSet needingQuoteCharacters = null;
+    
+    // data about the current pattern being parsed. start gets moved as we go along.
+    private int start;
+    private int limit;
+    private String pattern;
+    
+    public UnicodeSet getIgnorableCharacters() {
+        return (UnicodeSet) ignorableCharacters.clone();
+    }
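+    /**
+     * Sets the characters to be ignored in parsing, e.g. new UnicodeSet("[:pattern_whitespace:]").
+     * @param ignorableCharacters the characters to ignore; the set is cloned, so later changes to the argument have no effect
+     * @return this tokenizer, so that calls can be chained
+     */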
+    public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
+        this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
+        needingQuoteCharacters = null;
+        return this;
+    }
+    public UnicodeSet getSyntaxCharacters() {
+        return (UnicodeSet) syntaxCharacters.clone();
+    }
+    public UnicodeSet getExtraQuotingCharacters() {
+        return (UnicodeSet) extraQuotingCharacters.clone();
+    }
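+    /**
+     * Sets the characters to be interpreted as syntax characters in parsing, e.g. new UnicodeSet("[:pattern_syntax:]").
+     * @param syntaxCharacters the characters to treat as syntax; the set is cloned, so later changes to the argument have no effect
+     * @return this tokenizer, so that calls can be chained
+     */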
+    public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
+        this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
+        needingQuoteCharacters = null;
+        return this;
+    }   
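+    /**
+     * Sets the extra characters to be quoted in literals, in addition to the syntax and ignorable characters.
+     * @param syntaxCharacters the extra characters that need quoting; the set is cloned, so later changes to the argument have no effect
+     * @return this tokenizer, so that calls can be chained
+     */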
+    public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
+        this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
+        needingQuoteCharacters = null;
+        return this;
+    }   
+    
+    public UnicodeSet getEscapeCharacters() {
+        return (UnicodeSet) escapeCharacters.clone();
+    }
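+    /**
+     * Sets the characters to be escaped in literals, in quoteLiteral and normalize, e.g. new UnicodeSet("[^\\u0020-\\u007E]").
+     * @param escapeCharacters the characters to emit as hex escapes; the set is cloned, so later changes to the argument have no effect
+     * @return this tokenizer, so that calls can be chained
+     */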
+    public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
+        this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
+        return this;
+    }
+    public boolean isUsingQuote() {
+        return usingQuote;
+    }
+    public PatternTokenizer setUsingQuote(boolean usingQuote) {
+        this.usingQuote = usingQuote;
+        needingQuoteCharacters = null;
+        return this;
+    }
+    public boolean isUsingSlash() {
+        return usingSlash;
+    }
+    public PatternTokenizer setUsingSlash(boolean usingSlash) {
+        this.usingSlash = usingSlash;
+        needingQuoteCharacters = null;
+        return this;
+    }
+    //    public UnicodeSet getQuoteCharacters() {
+//  return (UnicodeSet) quoteCharacters.clone();
+//  }
+//  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
+//  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
+//  needingQuoteCharacters = null;
+//  return this;
+//  }
+    public int getLimit() {
+        return limit;
+    }
+    public PatternTokenizer setLimit(int limit) {
+        this.limit = limit;
+        return this;
+    }
+    public int getStart() {
+        return start;
+    }
+    public PatternTokenizer setStart(int start) {
+        this.start = start;
+        return this;
+    }
+
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//##    public PatternTokenizer setPattern(StringBuffer pattern) {
+//##        return setPattern(pattern.toString());
+//##    }
+//#else 
+    public PatternTokenizer setPattern(CharSequence pattern) {
+        return setPattern(pattern.toString());
+    }
+//#endif
+
+    public PatternTokenizer setPattern(String pattern) {
+        if (pattern == null) {
+            throw new IllegalArgumentException("Inconsistent arguments");
+        }
+        this.start = 0;
+        this.limit = pattern.length();
+        this.pattern = pattern;
+        return this;
+    }
+
+    public static final char SINGLE_QUOTE = '\'';
+    public static final char BACK_SLASH = '\\';
+    private static int NO_QUOTE = -1, IN_QUOTE = -2;
+
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//##    public String quoteLiteral(StringBuffer string) {
+//##        return quoteLiteral(string.toString());
+//##    }
+//#else
+    public String quoteLiteral(CharSequence string) {
+        return quoteLiteral(string.toString());
+    }
+//#endif
+
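+    /**
+     * Quotes a literal string, using the available settings. Characters in the escape set are written as hex escapes.
+     * Syntax characters, ignorable characters, extra quoting characters, and (when enabled) the backslash and single quote
+     * are protected: with a backslash if usingSlash is set, otherwise inside '...' if usingQuote is set (a single quote is
+     * doubled), otherwise as a hex escape. All other characters are copied unchanged.
+     * @param string the literal text to quote
+     * @return the quoted form of the string
+     */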
+    public String quoteLiteral(String string) {
+        if (needingQuoteCharacters == null) {
+            needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
+            if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
+            if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
+        }
+        StringBuffer result = new StringBuffer();
+        int quotedChar = NO_QUOTE;
+        int cp;
+        for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(string, i);
+            if (escapeCharacters.contains(cp)) {
+                // we may have to fix up previous characters
+                if (quotedChar == IN_QUOTE) {
+                    result.append(SINGLE_QUOTE);
+                    quotedChar = NO_QUOTE;
+                }
+                appendEscaped(result, cp);
+                continue;
+            }
+            
+            if (needingQuoteCharacters.contains(cp)) {
+                // if we have already started a quote
+                if (quotedChar == IN_QUOTE) {
+                    UTF16.append(result, cp);
+                    if (usingQuote && cp == SINGLE_QUOTE) { // double it
+                        result.append(SINGLE_QUOTE);
+                    }
+                    continue;
+                }
+                // otherwise not already in quote
+                if (usingSlash) {
+                    result.append(BACK_SLASH);
+                    UTF16.append(result, cp);
+                    continue;
+                }
+                if (usingQuote) {
+                    if (cp == SINGLE_QUOTE) { // double it and continue
+                        result.append(SINGLE_QUOTE);
+                        result.append(SINGLE_QUOTE);
+                        continue;
+                    }
+                    result.append(SINGLE_QUOTE);
+                    UTF16.append(result, cp);
+                    quotedChar = IN_QUOTE;
+                    continue;
+                }
+                // we have no choice but to use \\u or \\U
+                appendEscaped(result, cp);
+                continue;
+            }
+            // otherwise cp doesn't need quoting
+            // we may have to fix up previous characters
+            if (quotedChar == IN_QUOTE) {
+                result.append(SINGLE_QUOTE);
+                quotedChar = NO_QUOTE;
+            }
+            UTF16.append(result, cp);
+        }
+        // all done. 
+        // we may have to fix up previous characters
+        if (quotedChar == IN_QUOTE) {
+            result.append(SINGLE_QUOTE);
+        }
+        return result.toString();
+    }
+
+    private void appendEscaped(StringBuffer result, int cp) {
+        if (cp <= 0xFFFF) {
+            result.append("\\u").append(Utility.hex(cp,4));
+        } else {
+            result.append("\\U").append(Utility.hex(cp,8));
+        }
+    }
+    
+    public String normalize() {
+        int oldStart = start;
+        StringBuffer result = new StringBuffer();
+        StringBuffer buffer = new StringBuffer();
+        while (true) {
+            buffer.setLength(0);
+            int status = next(buffer);
+            if (status == DONE) {
+                start = oldStart;
+                return result.toString();
+            }
+            if (status != SYNTAX) {
+                result.append(quoteLiteral(buffer));
+            } else {
+                result.append(buffer);
+            }
+        }
+    }
+    
+    public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
+    
+    private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
+    
+    public int next(StringBuffer buffer) {
+        if (start >= limit) return DONE;
+        int status = UNKNOWN;
+        int lastQuote = UNKNOWN;
+        int quoteStatus = NONE;
+        int hexCount = 0;
+        int hexValue = 0;
+        int cp;
+        main:
+            for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
+                cp = UTF16.charAt(pattern, i);
+                // if we are in a quote, then handle it.
+                switch (quoteStatus) {
+                case SLASH_START:
+                    switch (cp) {
+                    case 'u':
+                        quoteStatus = HEX;
+                        hexCount = 4;
+                        hexValue = 0;
+                        continue main;
+                    case 'U': 
+                        quoteStatus = HEX;
+                        hexCount = 8;
+                        hexValue = 0;
+                        continue main;
+                    default:
+                        if (usingSlash) {
+                            UTF16.append(buffer, cp);
+                            quoteStatus = NONE;
+                            continue main;
+                        } else {
+                            buffer.append(BACK_SLASH);
+                            quoteStatus = NONE;
+                        }
+                    }
+                    break; // fall through to NONE
+                case HEX:
+                    hexValue <<= 4;
+                    hexValue += cp;
+                    switch (cp) {
+                    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+                        hexValue -= '0'; break;
+                    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+                        hexValue -= 'a' - 10; break;
+                    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+                        hexValue -= 'A' - 10; break;
+                    default:
+                        start = i;
+                    return BROKEN_ESCAPE;
+                    }
+                    --hexCount;
+                    if (hexCount == 0) {
+                        quoteStatus = NONE;
+                        UTF16.append(buffer, hexValue);
+                    }
+                    continue main;
+                case AFTER_QUOTE:
+                    // see if we get another quote character
+                    // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
+                    if (cp == lastQuote) {
+                        UTF16.append(buffer, cp);
+                        quoteStatus = NORMAL_QUOTE;
+                        continue main;
+                    }
+                    quoteStatus = NONE;
+                    break; // fall through to NONE
+                case START_QUOTE:
+                    // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
+                    if (cp == lastQuote) {
+                        UTF16.append(buffer, cp);
+                        quoteStatus = NONE; // get out of quote, with no trace remaining
+                        continue;                            
+                    }
+                    // otherwise get into quote
+                    UTF16.append(buffer, cp);
+                    quoteStatus = NORMAL_QUOTE;
+                    continue main;
+                case NORMAL_QUOTE: 
+                    if (cp == lastQuote) {
+                        quoteStatus = AFTER_QUOTE; // get out of quote
+                        continue main;
+                    }
+                    UTF16.append(buffer, cp);
+                    continue main;
+                }
+                
+                if (ignorableCharacters.contains(cp)) {
+                    continue;
+                }
+                // do syntax characters
+                if (syntaxCharacters.contains(cp)) {
+                    if (status == UNKNOWN) {
+                        UTF16.append(buffer, cp);
+                        start = i + UTF16.getCharCount(cp);
+                        return SYNTAX;
+                    } else { // LITERAL, so back up and break
+                        start = i;
+                        return status;
+                    }
+                }
+                // otherwise it is a literal; keep on going
+                status = LITERAL;
+                if (cp == BACK_SLASH) {
+                    quoteStatus = SLASH_START;
+                    continue;
+                } else if (usingQuote && cp == SINGLE_QUOTE) {
+                    lastQuote = cp;
+                    quoteStatus = START_QUOTE;
+                    continue;
+                }
+                // normal literals
+                UTF16.append(buffer, cp);
+            }
+        // handle final cleanup
+        start = limit;
+        switch (quoteStatus) {
+        case HEX:
+            status = BROKEN_ESCAPE;
+            break;
+        case SLASH_START:
+            if (usingSlash) {
+                status = BROKEN_ESCAPE;
+            } else {
+                buffer.append(BACK_SLASH);
+            }
+            break;
+        case START_QUOTE: case NORMAL_QUOTE:
+            status = BROKEN_QUOTE;
+            break;
+        }
+        return status;
+    }
+    
+    
+}
+//eof