/* ******************************************************************************* * Copyright (C) 2002-2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.util; import java.text.ParsePosition; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.SymbolTable; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeMatcher; import com.ibm.icu.text.UnicodeSet; public class Tokenizer { protected String source; protected StringBuffer buffer = new StringBuffer(); protected long number; protected UnicodeSet unicodeSet = null; protected int index; boolean backedup = false; protected int lastIndex = -1; protected int nextIndex; int lastValue = BACKEDUP_TOO_FAR; TokenSymbolTable symbolTable = new TokenSymbolTable(); private static final char QUOTE = '\'', BSLASH = '\\'; private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH); private static final UnicodeSet WHITESPACE = new UnicodeSet("[" + "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + "]"); private static final UnicodeSet SYNTAX = new UnicodeSet("[" + "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" + "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" + "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" + "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" + "\\u3001\\u3003\\u3008-\\u3020\\u3030" + "\\uFD3E\\uFD3F\\uFE45\\uFE46" + "]").removeAll(QUOTERS).remove('$'); private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"); //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]"); private static final UnicodeSet NON_STRING = new UnicodeSet() .addAll(WHITESPACE) .addAll(SYNTAX); protected UnicodeSet whiteSpace = WHITESPACE; protected UnicodeSet syntax = SYNTAX; private UnicodeSet non_string = NON_STRING; private void fixSets() { if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) { syntax = ((UnicodeSet)syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace); } if (whiteSpace.containsSome(QUOTERS)) { whiteSpace = ((UnicodeSet)whiteSpace.clone()).removeAll(QUOTERS); } non_string = new UnicodeSet(syntax) .addAll(whiteSpace); } public Tokenizer setSource(String source) { this.source = source; this.index = 0; return this; // for chaining } public Tokenizer setIndex(int index) { this.index = index; return this; // for chaining } public static final int DONE = -1, NUMBER = -2, STRING = -3, UNICODESET = -4, UNTERMINATED_QUOTE = -5, BACKEDUP_TOO_FAR = -6; private static final int //FIRST = 0, //IN_NUMBER = 1, //IN_SPACE = 2, AFTER_QUOTE = 3, // warning: order is important for switch statement IN_STRING = 4, AFTER_BSLASH = 5, IN_QUOTE = 6; public String toString(int type, boolean backedupBefore) { String s = backedup ? "@" : "*"; switch(type) { case DONE: return s+"Done"+s; case BACKEDUP_TOO_FAR: return s+"Illegal Backup"+s; case UNTERMINATED_QUOTE: return s+"Unterminated Quote=" + getString() + s; case STRING: return s+"s=" + getString() + s; case NUMBER: return s+"n=" + getNumber() + s; case UNICODESET: return s+"n=" + getUnicodeSet() + s; default: return s+"c=" + usf.getName(type,true) + s; } } private static final BagFormatter usf = new BagFormatter(); public void backup() { if (backedup) throw new IllegalArgumentException("backup too far"); backedup = true; nextIndex = index; index = lastIndex; } /* public int next2() { boolean backedupBefore = backedup; int result = next(); System.out.println(toString(result, backedupBefore)); return result; } */ public int next() { if (backedup) { backedup = false; index = nextIndex; return lastValue; } int cp = 0; boolean inComment = false; // clean off any leading whitespace or comments while (true) { if (index >= source.length()) return lastValue = DONE; cp = nextChar(); if (inComment) { if (NEWLINE.contains(cp)) inComment = false; } else { if (cp == '#') inComment = true; else if (!whiteSpace.contains(cp)) break; } } // record the last index in case we have to backup lastIndex = index; if (cp == '[') { ParsePosition pos = new ParsePosition(index-1); unicodeSet = new UnicodeSet(source,pos,symbolTable); index = pos.getIndex(); return lastValue = UNICODESET; } // get syntax character if (syntax.contains(cp)) return lastValue = cp; // get number, if there is one if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) { number = UCharacter.getNumericValue(cp); while (index < source.length()) { cp = nextChar(); if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { index -= UTF16.getCharCount(cp); // BACKUP! break; } number *= 10; number += UCharacter.getNumericValue(cp); } return lastValue = NUMBER; } buffer.setLength(0); int status = IN_STRING; main: while (true) { switch (status) { case AFTER_QUOTE: // check for double ''? if (cp == QUOTE) { UTF16.append(buffer, QUOTE); status = IN_QUOTE; break; } // OTHERWISE FALL THROUGH!!! case IN_STRING: if (cp == QUOTE) status = IN_QUOTE; else if (cp == BSLASH) status = AFTER_BSLASH; else if (non_string.contains(cp)) { index -= UTF16.getCharCount(cp); // BACKUP! break main; } else UTF16.append(buffer,cp); break; case IN_QUOTE: if (cp == QUOTE) status = AFTER_QUOTE; else UTF16.append(buffer,cp); break; case AFTER_BSLASH: switch(cp) { case 'n': cp = '\n'; break; case 'r': cp = '\r'; break; case 't': cp = '\t'; break; } UTF16.append(buffer,cp); status = IN_STRING; break; default: throw new IllegalArgumentException("Internal Error"); } if (index >= source.length()) break; cp = nextChar(); } if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE; return lastValue = STRING; } public String getString() { return buffer.toString(); } public String toString() { return source.substring(0,index) + "$$$" + source.substring(index); } public long getNumber() { return number; } public UnicodeSet getUnicodeSet() { return unicodeSet; } private int nextChar() { int cp = UTF16.charAt(source,index); index += UTF16.getCharCount(cp); return cp; } public int getIndex() { return index; } public String getSource() { return source; } public UnicodeSet getSyntax() { return syntax; } public UnicodeSet getWhiteSpace() { return whiteSpace; } public void setSyntax(UnicodeSet set) { syntax = set; fixSets(); } public void setWhiteSpace(UnicodeSet set) { whiteSpace = set; fixSets(); } public Set getLookedUpItems() { return symbolTable.itemsLookedUp; } public void addSymbol(String var, String value, int start, int limit) { // the limit is after the ';', so remove it --limit; char[] body = new char[limit - start]; value.getChars(start, limit, body, 0); symbolTable.add(var, body); } public class TokenSymbolTable implements SymbolTable { Map contents = new HashMap(); Set itemsLookedUp = new HashSet(); public void add(String var, char[] body) { // start from 1 to avoid the $ contents.put(var.substring(1), body); } /* (non-Javadoc) * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String) */ public char[] lookup(String s) { itemsLookedUp.add('$' + s); return (char[])contents.get(s); } /* (non-Javadoc) * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int) */ public UnicodeMatcher lookupMatcher(int ch) { // TODO Auto-generated method stub return null; } /* (non-Javadoc) * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int) */ public String parseReference(String text, ParsePosition pos, int limit) { int cp; int start = pos.getIndex(); int i; for (i = start; i < limit; i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(text, i); if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) { break; } } pos.setIndex(i); return text.substring(start,i); } } }