]> gitweb.fperrin.net Git - Dictionary.git/blobdiff - jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/util/Tokenizer.java
go
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / dev / test / util / Tokenizer.java
old mode 100755 (executable)
new mode 100644 (file)
index d7d81b0..6c095d7
-//##header\r
-//#if defined(FOUNDATION10) || defined(J2SE13)\r
-//#else\r
-/*\r
- *******************************************************************************\r
- * Copyright (C) 2002-2009, International Business Machines Corporation and    *\r
- * others. All Rights Reserved.                                                *\r
- *******************************************************************************\r
- */\r
-package com.ibm.icu.dev.test.util;\r
-\r
-import java.text.ParsePosition;\r
-\r
-import com.ibm.icu.text.*;\r
-import com.ibm.icu.lang.*;\r
-\r
-import java.util.HashMap;\r
-import java.util.HashSet;\r
-import java.util.Set;\r
-import java.util.Map;\r
-\r
-public class Tokenizer {\r
-    protected String source;\r
-    \r
-    protected StringBuffer buffer = new StringBuffer();\r
-    protected long number;\r
-    protected UnicodeSet unicodeSet = null;\r
-    protected int index;\r
-    boolean backedup = false;\r
-    protected int lastIndex = -1;\r
-    protected int nextIndex;\r
-    int lastValue = BACKEDUP_TOO_FAR;\r
-    TokenSymbolTable symbolTable = new TokenSymbolTable();\r
-\r
-    private static final char\r
-        QUOTE = '\'',\r
-        BSLASH = '\\';\r
-    private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);\r
-    private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +\r
-        "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +\r
-        "]");\r
-    private static final UnicodeSet SYNTAX = new UnicodeSet("[" +\r
-        "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +\r
-        "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +\r
-        "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +\r
-        "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +\r
-        "\\u3001\\u3003\\u3008-\\u3020\\u3030" +\r
-        "\\uFD3E\\uFD3F\\uFE45\\uFE46" +\r
-        "]").removeAll(QUOTERS).remove('$');\r
-    private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");\r
-    //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");\r
-    private static final UnicodeSet NON_STRING = new UnicodeSet()\r
-        .addAll(WHITESPACE)\r
-        .addAll(SYNTAX);\r
-           \r
-    protected UnicodeSet whiteSpace = WHITESPACE;\r
-    protected UnicodeSet syntax = SYNTAX;\r
-    private UnicodeSet non_string = NON_STRING;\r
-\r
-    private void fixSets() {\r
-        if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {\r
-            syntax = ((UnicodeSet)syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);\r
-        }\r
-        if (whiteSpace.containsSome(QUOTERS)) {\r
-            whiteSpace = ((UnicodeSet)whiteSpace.clone()).removeAll(QUOTERS);\r
-        }\r
-        non_string = new UnicodeSet(syntax)\r
-            .addAll(whiteSpace);\r
-    }\r
-    \r
-    public Tokenizer setSource(String source) {\r
-        this.source = source;\r
-        this.index = 0;\r
-        return this; // for chaining\r
-    }\r
-    \r
-    public Tokenizer setIndex(int index) {\r
-        this.index = index;\r
-        return this; // for chaining\r
-    }\r
-    \r
-    public static final int \r
-        DONE = -1, \r
-        NUMBER = -2, \r
-        STRING = -3, \r
-        UNICODESET = -4, \r
-        UNTERMINATED_QUOTE = -5,\r
-        BACKEDUP_TOO_FAR = -6;\r
-        \r
-    private static final int\r
-        //FIRST = 0,\r
-        //IN_NUMBER = 1,\r
-        //IN_SPACE = 2,\r
-        AFTER_QUOTE = 3,    // warning: order is important for switch statement\r
-        IN_STRING = 4, \r
-        AFTER_BSLASH = 5, \r
-        IN_QUOTE = 6;\r
-   \r
-    public String toString(int type, boolean backedupBefore) {\r
-        String s = backedup ? "@" : "*";\r
-        switch(type) {\r
-            case DONE: \r
-                return s+"Done"+s;\r
-            case BACKEDUP_TOO_FAR:\r
-                return s+"Illegal Backup"+s;\r
-            case UNTERMINATED_QUOTE: \r
-                return s+"Unterminated Quote=" + getString() + s;\r
-            case STRING:\r
-                return s+"s=" + getString() + s;\r
-            case NUMBER:\r
-                return s+"n=" + getNumber() + s;\r
-            case UNICODESET:\r
-                return s+"n=" + getUnicodeSet() + s;           \r
-            default:\r
-                return s+"c=" + usf.getName(type,true) + s;\r
-        }\r
-    }\r
-    \r
-    private static final BagFormatter usf = new BagFormatter();\r
-    \r
-    public void backup() {\r
-        if (backedup) throw new IllegalArgumentException("backup too far");\r
-        backedup = true;\r
-        nextIndex = index;\r
-        index = lastIndex;\r
-    }\r
-    \r
-    /*\r
-    public int next2() {\r
-        boolean backedupBefore = backedup;\r
-        int result = next();\r
-        System.out.println(toString(result, backedupBefore));\r
-        return result;\r
-    }    \r
-    */\r
-    \r
-    public int next() {\r
-        if (backedup) {\r
-            backedup = false;\r
-            index = nextIndex;\r
-            return lastValue;\r
-        }\r
-        int cp = 0;\r
-        boolean inComment = false;\r
-        // clean off any leading whitespace or comments\r
-        while (true) {\r
-            if (index >= source.length()) return lastValue = DONE;\r
-            cp = nextChar();\r
-            if (inComment) {\r
-                if (NEWLINE.contains(cp)) inComment = false;\r
-            } else {\r
-                if (cp == '#') inComment = true;\r
-                else if (!whiteSpace.contains(cp)) break;\r
-            }\r
-        }\r
-        // record the last index in case we have to backup\r
-        lastIndex = index;\r
-        \r
-        if (cp == '[') {\r
-            ParsePosition pos = new ParsePosition(index-1);\r
-            unicodeSet = new UnicodeSet(source,pos,symbolTable);\r
-            index = pos.getIndex();\r
-            return lastValue = UNICODESET;\r
-        }\r
-        // get syntax character\r
-        if (syntax.contains(cp)) return lastValue = cp;\r
-        \r
-        // get number, if there is one\r
-        if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {\r
-            number = UCharacter.getNumericValue(cp);\r
-            while (index < source.length()) {\r
-                cp = nextChar();\r
-                if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {\r
-                    index -= UTF16.getCharCount(cp); // BACKUP!\r
-                    break;\r
-                }\r
-                number *= 10;\r
-                number += UCharacter.getNumericValue(cp);\r
-            }\r
-            return lastValue =  NUMBER;\r
-        }\r
-        buffer.setLength(0);\r
-        int status = IN_STRING;\r
-        main:\r
-        while (true) {\r
-            switch (status) {\r
-                case AFTER_QUOTE: // check for double ''?\r
-                    if (cp == QUOTE) {\r
-                        UTF16.append(buffer, QUOTE);\r
-                        status = IN_QUOTE;\r
-                        break;\r
-                    }\r
-                    // OTHERWISE FALL THROUGH!!!\r
-                case IN_STRING: \r
-                    if (cp == QUOTE) status = IN_QUOTE;\r
-                    else if (cp == BSLASH) status = AFTER_BSLASH;\r
-                    else if (non_string.contains(cp)) {\r
-                        index -= UTF16.getCharCount(cp); // BACKUP!\r
-                        break main;\r
-                    } else UTF16.append(buffer,cp);\r
-                    break;\r
-                case IN_QUOTE:\r
-                    if (cp == QUOTE) status = AFTER_QUOTE;\r
-                    else UTF16.append(buffer,cp);\r
-                    break;\r
-                case AFTER_BSLASH:\r
-                    switch(cp) {\r
-                        case 'n': cp = '\n'; break;\r
-                        case 'r': cp = '\r'; break;\r
-                        case 't': cp = '\t'; break;\r
-                    }\r
-                    UTF16.append(buffer,cp);\r
-                    status = IN_STRING;\r
-                    break;\r
-                default: throw new IllegalArgumentException("Internal Error");\r
-            }\r
-            if (index >= source.length()) break;\r
-            cp = nextChar();\r
-        }\r
-        if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;\r
-        return lastValue =  STRING;\r
-    }\r
-    \r
-    public String getString() {\r
-        return buffer.toString();\r
-    }\r
-    \r
-    public String toString() {\r
-        return source.substring(0,index) + "$$$" + source.substring(index);\r
-    }\r
-    \r
-    public long getNumber() {\r
-        return number;\r
-    }\r
-    \r
-    public UnicodeSet getUnicodeSet() {\r
-        return unicodeSet;\r
-    }\r
-    \r
-    private int nextChar() {\r
-        int cp = UTF16.charAt(source,index);\r
-        index += UTF16.getCharCount(cp);\r
-        return cp;\r
-    }\r
-    public int getIndex() {\r
-        return index;\r
-    }\r
-    public String getSource() {\r
-        return source;\r
-    }\r
-    public UnicodeSet getSyntax() {\r
-        return syntax;\r
-    }\r
-    public UnicodeSet getWhiteSpace() {\r
-        return whiteSpace;\r
-    }\r
-    public void setSyntax(UnicodeSet set) {\r
-        syntax = set;\r
-        fixSets();\r
-    }\r
-    public void setWhiteSpace(UnicodeSet set) {\r
-        whiteSpace = set;\r
-        fixSets();\r
-    }\r
-    \r
-    public Set getLookedUpItems() {\r
-        return symbolTable.itemsLookedUp;\r
-    }\r
-    \r
-    public void addSymbol(String var, String value, int start, int limit) {\r
-        // the limit is after the ';', so remove it\r
-        --limit;\r
-        char[] body = new char[limit - start];\r
-        value.getChars(start, limit, body, 0);\r
-        symbolTable.add(var, body);\r
-    }\r
-    \r
-    public class TokenSymbolTable implements SymbolTable {\r
-        Map contents = new HashMap();\r
-        Set itemsLookedUp = new HashSet();\r
-            \r
-        public void add(String var, char[] body) {\r
-            // start from 1 to avoid the $\r
-            contents.put(var.substring(1), body);\r
-        }\r
-            \r
-        /* (non-Javadoc)\r
-         * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)\r
-         */\r
-        public char[] lookup(String s) {\r
-            itemsLookedUp.add('$' + s);\r
-            return (char[])contents.get(s);\r
-        }\r
-    \r
-        /* (non-Javadoc)\r
-         * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)\r
-         */\r
-        public UnicodeMatcher lookupMatcher(int ch) {\r
-            // TODO Auto-generated method stub\r
-            return null;\r
-        }\r
-    \r
-        /* (non-Javadoc)\r
-         * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)\r
-         */\r
-        public String parseReference(String text, ParsePosition pos, int limit) {\r
-            int cp;\r
-            int start = pos.getIndex();\r
-            int i;\r
-            for (i = start; i < limit; i += UTF16.getCharCount(cp)) {\r
-                cp = UTF16.charAt(text, i);\r
-                if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {\r
-                    break;\r
-                }\r
-            }\r
-            pos.setIndex(i);\r
-            return text.substring(start,i);\r
-        }\r
-        \r
-    }\r
-}\r
-\r
-//#endif\r
+//##header J2SE15
+//#if defined(FOUNDATION10) || defined(J2SE13)
+//#else
+/*
+ *******************************************************************************
+ * Copyright (C) 2002-2009, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.dev.test.util;
+
+import java.text.ParsePosition;
+
+import com.ibm.icu.text.*;
+import com.ibm.icu.lang.*;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.Map;
+
+public class Tokenizer {
+    // Text being tokenized; assigned by setSource() and read by next().
+    protected String source;
+    
+    // Accumulates the text of the current string token; exposed via getString().
+    protected StringBuffer buffer = new StringBuffer();
+    // Value of the most recently parsed NUMBER token; exposed via getNumber().
+    protected long number;
+    // Most recently parsed UnicodeSet token; exposed via getUnicodeSet().
+    protected UnicodeSet unicodeSet = null;
+    // Current read offset (in UTF-16 code units) into source.
+    protected int index;
+    // True between a call to backup() and the following call to next().
+    boolean backedup = false;
+    // Offset recorded by next() once leading whitespace/comments are skipped;
+    // backup() rewinds index to this value.
+    protected int lastIndex = -1;
+    // Offset saved by backup() so that the following next() can restore it.
+    protected int nextIndex;
+    // Last value returned by next(); replayed by next() after a backup().
+    int lastValue = BACKEDUP_TOO_FAR;
+    // Symbol table handed to the UnicodeSet constructor when parsing '[' tokens.
+    TokenSymbolTable symbolTable = new TokenSymbolTable();
+
+    // Quoting characters recognized inside string tokens.
+    private static final char
+        QUOTE = '\'',
+        BSLASH = '\\';
+    // The two quoting characters as a set; they are kept out of the syntax
+    // and whitespace sets (see fixSets()).
+    private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
+    // Default set of characters skipped between tokens.
+    private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +
+        "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +
+        "]");
+    // Default set of characters returned by next() as single-character tokens;
+    // the quoting characters and '$' are excluded.
+    private static final UnicodeSet SYNTAX = new UnicodeSet("[" +
+        "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +
+        "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +
+        "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +
+        "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +
+        "\\u3001\\u3003\\u3008-\\u3020\\u3030" +
+        "\\uFD3E\\uFD3F\\uFE45\\uFE46" +
+        "]").removeAll(QUOTERS).remove('$');
+    // Characters that end a '#' comment in next().
+    private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
+    //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
+    // Union of the default whitespace and syntax sets; an unquoted character
+    // from this set terminates a string token in next().
+    private static final UnicodeSet NON_STRING = new UnicodeSet()
+        .addAll(WHITESPACE)
+        .addAll(SYNTAX);
+           
+    // Per-instance sets, initially the shared defaults; replaced through
+    // setWhiteSpace()/setSyntax(), which call fixSets() to rebuild non_string.
+    protected UnicodeSet whiteSpace = WHITESPACE;
+    protected UnicodeSet syntax = SYNTAX;
+    private UnicodeSet non_string = NON_STRING;
+
+    /**
+     * Re-establishes the relationships between the per-instance sets after
+     * either of them has been replaced: the syntax set must contain neither
+     * quoting characters nor whitespace, the whitespace set must contain no
+     * quoting characters, and non_string is the union of the two.
+     * Sets are cloned before modification so the caller's set is not altered.
+     */
+    private void fixSets() {
+        boolean syntaxNeedsTrim = syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace);
+        if (syntaxNeedsTrim) {
+            UnicodeSet trimmedSyntax = (UnicodeSet) syntax.clone();
+            trimmedSyntax.removeAll(QUOTERS);
+            trimmedSyntax.removeAll(whiteSpace);
+            syntax = trimmedSyntax;
+        }
+        if (whiteSpace.containsSome(QUOTERS)) {
+            UnicodeSet trimmedSpace = (UnicodeSet) whiteSpace.clone();
+            trimmedSpace.removeAll(QUOTERS);
+            whiteSpace = trimmedSpace;
+        }
+        UnicodeSet union = new UnicodeSet(syntax);
+        union.addAll(whiteSpace);
+        non_string = union;
+    }
+    
+    /**
+     * Sets the text to tokenize and rewinds the read position to its start.
+     *
+     * @param source the text to tokenize
+     * @return this tokenizer, to allow call chaining
+     */
+    public Tokenizer setSource(String source) {
+        this.index = 0;
+        this.source = source;
+        return this;
+    }
+    
+    /**
+     * Moves the read position to the given offset in the source text.
+     * The offset is stored as given, without validation.
+     *
+     * @param index the new read offset
+     * @return this tokenizer, to allow call chaining
+     */
+    public Tokenizer setIndex(int index) {
+        this.index = index;
+        return this;
+    }
+    
+    /**
+     * Token types returned by next(). All are negative; a non-negative return
+     * value from next() is the code point of a syntax character.
+     */
+    public static final int 
+        DONE = -1, 
+        NUMBER = -2, 
+        STRING = -3, 
+        UNICODESET = -4, 
+        UNTERMINATED_QUOTE = -5,
+        BACKEDUP_TOO_FAR = -6;
+        
+    // Internal states of the string-token state machine in next().
+    // next() relies on the numeric order: states greater than IN_STRING at
+    // end of input are reported as UNTERMINATED_QUOTE.
+    private static final int
+        //FIRST = 0,
+        //IN_NUMBER = 1,
+        //IN_SPACE = 2,
+        AFTER_QUOTE = 3,    // warning: order is important for switch statement
+        IN_STRING = 4, 
+        AFTER_BSLASH = 5, 
+        IN_QUOTE = 6;
+   
+    /**
+     * Formats a token type returned by next() as a short debugging string,
+     * including the associated value (string, number or set) where there is one.
+     * The result is wrapped in "@" markers when the token was replayed after a
+     * backup, and in "*" markers otherwise.
+     *
+     * @param type the token type, as returned by next()
+     * @param backedupBefore whether the tokenizer was in the backed-up state
+     *        before the next() call that produced {@code type}
+     * @return the formatted description
+     */
+    public String toString(int type, boolean backedupBefore) {
+        // Use the caller-supplied flag rather than the backedup field: next()
+        // always leaves the field false, so reading it here after a next()
+        // call could never produce the "@" marker.
+        String s = backedupBefore ? "@" : "*";
+        switch(type) {
+            case DONE: 
+                return s+"Done"+s;
+            case BACKEDUP_TOO_FAR:
+                return s+"Illegal Backup"+s;
+            case UNTERMINATED_QUOTE: 
+                return s+"Unterminated Quote=" + getString() + s;
+            case STRING:
+                return s+"s=" + getString() + s;
+            case NUMBER:
+                return s+"n=" + getNumber() + s;
+            case UNICODESET:
+                // NOTE(review): shares the "n=" label with NUMBER; kept as-is
+                // because the label is part of the emitted text.
+                return s+"n=" + getUnicodeSet() + s;           
+            default:
+                // Any other value is the code point of a syntax character.
+                return s+"c=" + usf.getName(type,true) + s;
+        }
+    }
+    
+    // Shared formatter; toString(int, boolean) calls its getName() to describe
+    // a syntax-character token.
+    private static final BagFormatter usf = new BagFormatter();
+    
+    /**
+     * Pushes the most recent token back so that the following call to next()
+     * returns it again. Only one level of backup is supported.
+     *
+     * @throws IllegalArgumentException if a backup is already pending
+     */
+    public void backup() {
+        if (backedup) {
+            throw new IllegalArgumentException("backup too far");
+        }
+        // Remember where we were so next() can jump forward again.
+        nextIndex = index;
+        index = lastIndex;
+        backedup = true;
+    }
+    
+    /*
+    public int next2() {
+        boolean backedupBefore = backedup;
+        int result = next();
+        System.out.println(toString(result, backedupBefore));
+        return result;
+    }    
+    */
+    
+    /**
+     * Reads the next token from the source text.
+     * <p>
+     * If backup() was called since the previous token, that token's type is
+     * returned again and the read position is restored. Otherwise leading
+     * whitespace and '#' comments are skipped and one token is parsed.
+     *
+     * @return DONE at end of input; UNICODESET for a set starting with '[';
+     *         NUMBER for a run of decimal digits; STRING for a (possibly
+     *         quoted or escaped) string; UNTERMINATED_QUOTE if input ended
+     *         inside a quote or right after a backslash; otherwise the code
+     *         point of a syntax character
+     */
+    public int next() {
+        if (backedup) {
+            // Replay the token that was pushed back by backup().
+            backedup = false;
+            index = nextIndex;
+            return lastValue;
+        }
+        int cp = 0;
+        boolean inComment = false;
+        // clean off any leading whitespace or comments
+        while (true) {
+            if (index >= source.length()) return lastValue = DONE;
+            cp = nextChar();
+            if (inComment) {
+                // A comment runs until the next newline character.
+                if (NEWLINE.contains(cp)) inComment = false;
+            } else {
+                if (cp == '#') inComment = true;
+                else if (!whiteSpace.contains(cp)) break;
+            }
+        }
+        // record the last index in case we have to backup
+        // (index is already past the first code point of the token here)
+        lastIndex = index;
+        
+        if (cp == '[') {
+            // Start parsing at index-1 so the set parser sees the '[' itself.
+            ParsePosition pos = new ParsePosition(index-1);
+            unicodeSet = new UnicodeSet(source,pos,symbolTable);
+            index = pos.getIndex();
+            return lastValue = UNICODESET;
+        }
+        // get syntax character
+        if (syntax.contains(cp)) return lastValue = cp;
+        
+        // get number, if there is one
+        if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
+            number = UCharacter.getNumericValue(cp);
+            // Accumulate further decimal digits; no overflow check is made.
+            while (index < source.length()) {
+                cp = nextChar();
+                if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
+                    index -= UTF16.getCharCount(cp); // BACKUP!
+                    break;
+                }
+                number *= 10;
+                number += UCharacter.getNumericValue(cp);
+            }
+            return lastValue =  NUMBER;
+        }
+        // Anything else starts a string token, collected into buffer.
+        buffer.setLength(0);
+        int status = IN_STRING;
+        main:
+        while (true) {
+            switch (status) {
+                case AFTER_QUOTE: // check for double ''?
+                    if (cp == QUOTE) {
+                        UTF16.append(buffer, QUOTE);
+                        status = IN_QUOTE;
+                        break;
+                    }
+                    // OTHERWISE FALL THROUGH!!!
+                case IN_STRING: 
+                    if (cp == QUOTE) status = IN_QUOTE;
+                    else if (cp == BSLASH) status = AFTER_BSLASH;
+                    else if (non_string.contains(cp)) {
+                        // Unquoted whitespace or syntax ends the string;
+                        // leave that character for the next call.
+                        index -= UTF16.getCharCount(cp); // BACKUP!
+                        break main;
+                    } else UTF16.append(buffer,cp);
+                    break;
+                case IN_QUOTE:
+                    if (cp == QUOTE) status = AFTER_QUOTE;
+                    else UTF16.append(buffer,cp);
+                    break;
+                case AFTER_BSLASH:
+                    // Translate the n, r and t escapes; any other escaped
+                    // character is appended literally.
+                    switch(cp) {
+                        case 'n': cp = '\n'; break;
+                        case 'r': cp = '\r'; break;
+                        case 't': cp = '\t'; break;
+                    }
+                    UTF16.append(buffer,cp);
+                    status = IN_STRING;
+                    break;
+                default: throw new IllegalArgumentException("Internal Error");
+            }
+            if (index >= source.length()) break;
+            cp = nextChar();
+        }
+        // AFTER_BSLASH and IN_QUOTE are the states greater than IN_STRING:
+        // input ended right after a backslash or inside a quote.
+        if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;
+        return lastValue =  STRING;
+    }
+    
+    /**
+     * Returns the text collected for the most recent string token.
+     *
+     * @return the current contents of the token buffer
+     */
+    public String getString() {
+        String text = buffer.toString();
+        return text;
+    }
+    
+    /**
+     * Returns the source text with the marker "$$$" inserted at the current
+     * read position.
+     *
+     * @return the source text with the position marker
+     */
+    public String toString() {
+        String consumed = source.substring(0, index);
+        String remaining = source.substring(index);
+        return consumed + "$$$" + remaining;
+    }
+    
+    /**
+     * Returns the value of the most recently parsed NUMBER token.
+     *
+     * @return the stored number
+     */
+    public long getNumber() {
+        return this.number;
+    }
+    
+    /**
+     * Returns the set parsed for the most recent UNICODESET token.
+     *
+     * @return the stored set, or null if none has been parsed yet
+     */
+    public UnicodeSet getUnicodeSet() {
+        return this.unicodeSet;
+    }
+    
+    /**
+     * Reads the code point at the current position and advances past it.
+     *
+     * @return the code point that was read
+     */
+    private int nextChar() {
+        final int codePoint = UTF16.charAt(source, index);
+        // Advance by one or two code units depending on the code point.
+        index = index + UTF16.getCharCount(codePoint);
+        return codePoint;
+    }
+    /**
+     * Returns the current read offset in the source text.
+     *
+     * @return the current index
+     */
+    public int getIndex() {
+        return this.index;
+    }
+    /**
+     * Returns the text being tokenized.
+     *
+     * @return the source text
+     */
+    public String getSource() {
+        return this.source;
+    }
+    /**
+     * Returns the set of characters treated as single-character syntax tokens.
+     *
+     * @return the current syntax set
+     */
+    public UnicodeSet getSyntax() {
+        return this.syntax;
+    }
+    /**
+     * Returns the set of characters skipped between tokens.
+     *
+     * @return the current whitespace set
+     */
+    public UnicodeSet getWhiteSpace() {
+        return this.whiteSpace;
+    }
+    /**
+     * Replaces the syntax set, then normalizes the sets via fixSets().
+     *
+     * @param set the new syntax set
+     */
+    public void setSyntax(UnicodeSet set) {
+        this.syntax = set;
+        this.fixSets();
+    }
+    /**
+     * Replaces the whitespace set, then normalizes the sets via fixSets().
+     *
+     * @param set the new whitespace set
+     */
+    public void setWhiteSpace(UnicodeSet set) {
+        this.whiteSpace = set;
+        this.fixSets();
+    }
+    
+    /**
+     * Returns the names recorded by the symbol table's lookup() method.
+     * The returned set is the symbol table's own set, not a copy.
+     *
+     * @return the set of looked-up names
+     */
+    public Set getLookedUpItems() {
+        Set lookedUp = symbolTable.itemsLookedUp;
+        return lookedUp;
+    }
+    
+    /**
+     * Registers a variable in the symbol table. The variable's body is the
+     * range of {@code value} from {@code start} up to, but not including,
+     * {@code limit - 1}.
+     *
+     * @param var the variable name, passed on to the symbol table's add()
+     * @param value the text containing the variable's body
+     * @param start offset in {@code value} where the body begins
+     * @param limit offset just past the terminating ';' of the body
+     */
+    public void addSymbol(String var, String value, int start, int limit) {
+        // limit points after the ';', so the body ends one position earlier
+        int bodyEnd = limit - 1;
+        char[] chars = new char[bodyEnd - start];
+        value.getChars(start, bodyEnd, chars, 0);
+        symbolTable.add(var, chars);
+    }
+    
+    /**
+     * Symbol table supplied to the UnicodeSet parser. It maps variable names
+     * (stored without the leading '$') to their replacement text and records
+     * every name that was looked up.
+     */
+    public class TokenSymbolTable implements SymbolTable {
+        // variable name (without '$') -> replacement text as char[]
+        Map contents = new HashMap();
+        // every name passed to lookup(), stored with a leading '$'
+        Set itemsLookedUp = new HashSet();
+            
+        /**
+         * Adds a variable definition.
+         *
+         * @param var the variable name including its first character, which
+         *        is dropped before the name is stored
+         * @param body the replacement text
+         */
+        public void add(String var, char[] body) {
+            // start from 1 to avoid the $
+            contents.put(var.substring(1), body);
+        }
+            
+        /**
+         * Looks up a variable and records that it was requested.
+         *
+         * @param s the variable name without the leading '$'
+         * @return the replacement text, or null if the name is not defined
+         * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
+         */
+        public char[] lookup(String s) {
+            itemsLookedUp.add('$' + s);
+            return (char[])contents.get(s);
+        }
+    
+        /**
+         * Matcher lookup is not supported by this table.
+         *
+         * @param ch the code point of the matcher reference
+         * @return always null
+         * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
+         */
+        public UnicodeMatcher lookupMatcher(int ch) {
+            return null;
+        }
+    
+        /**
+         * Parses a variable name starting at {@code pos}: the longest run of
+         * Unicode identifier-part characters before {@code limit}.
+         *
+         * @param text the text being parsed
+         * @param pos on entry, the offset at which the name starts; on a
+         *        successful parse, advanced past the name
+         * @param limit offset at which parsing must stop
+         * @return the parsed name, or null (with {@code pos} unchanged) if no
+         *         name characters are found at {@code pos}
+         * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
+         */
+        public String parseReference(String text, ParsePosition pos, int limit) {
+            int start = pos.getIndex();
+            int i = start;
+            while (i < limit) {
+                int cp = UTF16.charAt(text, i);
+                if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
+                    break;
+                }
+                i += UTF16.getCharCount(cp);
+            }
+            if (i == start) {
+                // The SymbolTable contract asks for null, not an empty name,
+                // when no reference is present; an empty name would be looked
+                // up as if it were a real variable.
+                return null;
+            }
+            pos.setIndex(i);
+            return text.substring(start,i);
+        }
+        
+    }
+}
+
+//#endif