2 //#if defined(FOUNDATION10) || defined(J2SE13)
5 *******************************************************************************
6 * Copyright (C) 2002-2009, International Business Machines Corporation and *
7 * others. All Rights Reserved. *
8 *******************************************************************************
10 package com.ibm.icu.dev.test.util;
12 import java.text.ParsePosition;
14 import com.ibm.icu.text.*;
15 import com.ibm.icu.lang.*;
17 import java.util.HashMap;
18 import java.util.HashSet;
// Splits a source string into tokens: syntax characters, numbers, strings and
// UnicodeSets (see the token-type constants and the scanning code below).
22 public class Tokenizer {
// Text being tokenized.
23 protected String source;
// Characters of the current string token; exposed through getString().
25 protected StringBuffer buffer = new StringBuffer();
// Value accumulated from the most recent run of decimal digits.
26 protected long number;
// Result of the most recent UnicodeSet parse; null until one has been parsed.
27 protected UnicodeSet unicodeSet = null;
// Checked by backup(): a second backup while this is set is rejected.
29 boolean backedup = false;
// Index bookkeeping for backing up -- presumably the positions before and
// after the last token; the assignments are not visible in this excerpt.
30 protected int lastIndex = -1;
31 protected int nextIndex;
// Token type most recently returned by the scanner; starts as BACKEDUP_TOO_FAR.
32 int lastValue = BACKEDUP_TOO_FAR;
// Variable definitions handed to the UnicodeSet parser; filled by addSymbol().
33 TokenSymbolTable symbolTable = new TokenSymbolTable();
// Character constants for quoting. QUOTE and BSLASH are referenced throughout
// the class; their initialisers are not visible in this excerpt.
35 private static final char
// The two quoting characters; they are always excluded from syntax and white space.
38 private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
// Default white-space set (the pattern continues past what is visible here).
39 private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +
40 "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +
// Default syntax set: the listed punctuation/symbol ranges, minus the quoting
// characters and '$'.
42 private static final UnicodeSet SYNTAX = new UnicodeSet("[" +
43 "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +
44 "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +
45 "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +
46 "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +
47 "\\u3001\\u3003\\u3008-\\u3020\\u3030" +
48 "\\uFD3E\\uFD3F\\uFE45\\uFE46" +
49 "]").removeAll(QUOTERS).remove('$');
// Characters that end a '#' comment during scanning.
50 private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
51 //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
// Default for non_string, the set whose members end an unquoted string token.
// The rest of this initialiser is not visible in this excerpt.
52 private static final UnicodeSet NON_STRING = new UnicodeSet()
// Per-instance sets, initialised to the shared defaults above. fixSets()
// replaces them with modified clones rather than mutating the defaults.
56 protected UnicodeSet whiteSpace = WHITESPACE;
57 protected UnicodeSet syntax = SYNTAX;
58 private UnicodeSet non_string = NON_STRING;
// Re-establishes the invariants between the per-instance sets: syntax contains
// neither quoting characters nor white space, and whiteSpace contains no
// quoting characters. non_string is then rebuilt starting from syntax.
60 private void fixSets() {
61 if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {
// Work on a clone so the set object currently referenced is left unmodified.
62 syntax = ((UnicodeSet)syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);
64 if (whiteSpace.containsSome(QUOTERS)) {
// Same clone-then-remove approach for the white-space set.
65 whiteSpace = ((UnicodeSet)whiteSpace.clone()).removeAll(QUOTERS);
// Starts from a copy of syntax; the remainder of this expression is not
// visible in this excerpt.
67 non_string = new UnicodeSet(syntax)
/**
 * Sets the text to tokenize. The assignments themselves are not visible in
 * this excerpt -- NOTE(review): confirm whether the index is also reset.
 *
 * @param source the text to tokenize
 * @return this tokenizer, so that calls can be chained
 */
71 public Tokenizer setSource(String source) {
74 return this; // for chaining
/**
 * Sets the position in the source; presumably where the next token scan
 * starts (the assignment is not visible in this excerpt).
 *
 * @param index offset into the source, in UTF-16 code units -- TODO confirm
 * @return this tokenizer, so that calls can be chained
 */
77 public Tokenizer setIndex(int index) {
79 return this; // for chaining
// Token type codes stored in lastValue and returned by the scanner. Only the
// last two members of the list are visible in this excerpt; DONE, NUMBER,
// STRING and UNICODESET are referenced elsewhere in the class.
82 public static final int
87 UNTERMINATED_QUOTE = -5,
88 BACKEDUP_TOO_FAR = -6;
// Internal states of the string scanner (IN_STRING, IN_QUOTE, AFTER_BSLASH
// and AFTER_QUOTE are referenced by the scanning code). The end-of-scan test
// "status > IN_STRING" depends on their numeric order.
90 private static final int
94 AFTER_QUOTE = 3, // warning: order is important for switch statement
/**
 * Formats a token type, together with the value currently held by the
 * tokenizer, as a short debugging string. The text is wrapped in a marker
 * character on both sides: "@" when the backedup field is set, "*" otherwise.
 *
 * NOTE(review): the backedupBefore parameter is not used in the visible
 * lines; the marker is chosen from the backedup field instead. Confirm
 * whether the parameter was meant to be used here.
 *
 * @param type token type code, or a code point for a syntax character
 * @param backedupBefore backup state captured by the caller before scanning
 * @return the debugging description of the token
 */
99 public String toString(int type, boolean backedupBefore) {
100 String s = backedup ? "@" : "*";
104 case BACKEDUP_TOO_FAR:
105 return s+"Illegal Backup"+s;
106 case UNTERMINATED_QUOTE:
107 return s+"Unterminated Quote=" + getString() + s;
// String-valued token: show the buffer contents.
109 return s+"s=" + getString() + s;
// Numeric token: show the accumulated number.
111 return s+"n=" + getNumber() + s;
// UnicodeSet token. Note that it uses the same "n=" label as the numeric case.
113 return s+"n=" + getUnicodeSet() + s;
// Any other type is treated as a code point and shown by name.
115 return s+"c=" + usf.getName(type,true) + s;
// Shared formatter used by toString(int, boolean) to obtain a name for a code point.
119 private static final BagFormatter usf = new BagFormatter();
/**
 * Backs the tokenizer up by one token. Only a single level of backup is
 * supported: calling this while already backed up fails. The state changes
 * after the guard are not visible in this excerpt.
 *
 * @throws IllegalArgumentException if the tokenizer is already backed up
 */
121 public void backup() {
122 if (backedup) throw new IllegalArgumentException("backup too far");
// Fragment of a debugging wrapper (its header is not visible in this excerpt):
// capture the backup state first, because scanning may change it.
130 boolean backedupBefore = backedup;
// Print the token description; "result" is assigned on a line not shown here.
132 System.out.println(toString(result, backedupBefore));
// Body of the token scanner (the method header is not visible in this
// excerpt). Every exit records the returned token type in lastValue.
// inComment is true while skipping the remainder of a '#' comment.
144 boolean inComment = false;
145 // clean off any leading whitespace or comments
// Running off the end of the source while skipping yields DONE.
147 if (index >= source.length()) return lastValue = DONE;
// A newline character ends any comment in progress.
150 if (NEWLINE.contains(cp)) inComment = false;
// '#' opens a comment; any other character outside whiteSpace ends the skipping.
152 if (cp == '#') inComment = true;
153 else if (!whiteSpace.contains(cp)) break;
156 // record the last index in case we have to back up
// Parse a UnicodeSet in place, starting one position before index --
// presumably because the opening character was already consumed (the
// condition guarding this branch is not visible here). symbolTable resolves
// any variable references inside the pattern.
160 ParsePosition pos = new ParsePosition(index-1);
161 unicodeSet = new UnicodeSet(source,pos,symbolTable);
// Resume scanning where the UnicodeSet parser stopped.
162 index = pos.getIndex();
163 return lastValue = UNICODESET;
165 // get syntax character
// A syntax character is returned as its own code point value.
166 if (syntax.contains(cp)) return lastValue = cp;
168 // get number, if there is one
169 if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
// Seed the value with the first digit, then keep consuming decimal digits.
170 number = UCharacter.getNumericValue(cp);
171 while (index < source.length()) {
173 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
// Not a digit: un-read it so the next token starts at this character.
174 index -= UTF16.getCharCount(cp); // BACKUP!
// Add the digit to the running value. NOTE(review): any scaling of the
// previous value happens on a line not visible in this excerpt.
178 number += UCharacter.getNumericValue(cp);
180 return lastValue = NUMBER;
// Everything else is scanned as a string; status tracks the quoting state.
183 int status = IN_STRING;
187 case AFTER_QUOTE: // check for double ''?
// Appends a literal QUOTE to the buffer (the condition selecting this path
// is not visible here).
189 UTF16.append(buffer, QUOTE);
193 // OTHERWISE FALL THROUGH!!!
// Presumably the unquoted (IN_STRING) case: QUOTE opens a quoted run, BSLASH
// starts an escape, a non_string character ends the token and is un-read,
// and anything else is appended to the buffer.
195 if (cp == QUOTE) status = IN_QUOTE;
196 else if (cp == BSLASH) status = AFTER_BSLASH;
197 else if (non_string.contains(cp)) {
198 index -= UTF16.getCharCount(cp); // BACKUP!
200 } else UTF16.append(buffer,cp);
// Presumably the quoted (IN_QUOTE) case: QUOTE moves to AFTER_QUOTE, and
// every other character is taken literally.
203 if (cp == QUOTE) status = AFTER_QUOTE;
204 else UTF16.append(buffer,cp);
// Escape translation: n, r and t become newline, carriage return and tab.
208 case 'n': cp = '\n'; break;
209 case 'r': cp = '\r'; break;
210 case 't': cp = '\t'; break;
// Append the escaped character, translated or not.
212 UTF16.append(buffer,cp);
// Any other status value indicates a logic error in the scanner itself.
215 default: throw new IllegalArgumentException("Internal Error");
// Stop once the source is exhausted.
217 if (index >= source.length()) break;
// Ending in a state numerically greater than IN_STRING is reported as
// UNTERMINATED_QUOTE; otherwise the buffer holds a complete STRING.
220 if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;
221 return lastValue = STRING;
/**
 * Returns the current contents of the string buffer, i.e. the characters
 * accumulated for the most recent string token.
 *
 * @return a new String holding the buffer contents
 */
224 public String getString() {
225 return buffer.toString();
/**
 * Returns the source text with the marker "$$$" inserted at the current
 * index, showing how far scanning has progressed.
 *
 * @return the source with the current position marked
 */
228 public String toString() {
229 return source.substring(0,index) + "$$$" + source.substring(index);
/**
 * Accessor for the numeric token value; presumably returns the number field
 * (the body is not visible in this excerpt).
 */
232 public long getNumber() {
/**
 * Accessor for the UnicodeSet token value; presumably returns the unicodeSet
 * field (the body is not visible in this excerpt).
 */
236 public UnicodeSet getUnicodeSet() {
// Reads the code point at the current index and advances index past it, by
// as many chars as that code point occupies (so supplementary characters are
// consumed whole). The return statement is not visible in this excerpt;
// presumably the code point read is returned.
240 private int nextChar() {
241 int cp = UTF16.charAt(source,index);
242 index += UTF16.getCharCount(cp);
/**
 * Accessor for the scan position; presumably returns the index field (the
 * body is not visible in this excerpt).
 */
245 public int getIndex() {
/**
 * Accessor for the text being tokenized; presumably returns the source field
 * (the body is not visible in this excerpt).
 */
248 public String getSource() {
/**
 * Accessor for the syntax-character set; presumably returns the syntax field
 * (the body is not visible in this excerpt).
 */
251 public UnicodeSet getSyntax() {
/**
 * Accessor for the white-space set; presumably returns the whiteSpace field
 * (the body is not visible in this excerpt).
 */
254 public UnicodeSet getWhiteSpace() {
/**
 * Replaces the syntax-character set. The body is not visible in this excerpt;
 * presumably it stores the set and calls fixSets() -- TODO confirm.
 *
 * @param set the new syntax-character set
 */
257 public void setSyntax(UnicodeSet set) {
/**
 * Replaces the white-space set. The body is not visible in this excerpt;
 * presumably it stores the set and calls fixSets() -- TODO confirm.
 *
 * @param set the new white-space set
 */
261 public void setWhiteSpace(UnicodeSet set) {
/**
 * Returns the set of variable names that have been looked up in the symbol
 * table so far. Each entry is the name prefixed with '$'. This is the symbol
 * table's own set, not a copy.
 *
 * @return the live set of looked-up names
 */
266 public Set getLookedUpItems() {
267 return symbolTable.itemsLookedUp;
/**
 * Defines a variable in the symbol table. The variable's body is the range
 * [start, limit) of value, copied into a fresh char array.
 *
 * @param var variable name; the symbol table drops its first character (the '$')
 * @param value text containing the variable's body
 * @param start index in value of the first body character
 * @param limit index in value just past the body; see the note in the method
 *     about the trailing ';'
 */
270 public void addSymbol(String var, String value, int start, int limit) {
// NOTE(review): the adjustment this comment describes is on a line not
// visible in this excerpt.
271 // the limit is after the ';', so remove it
273 char[] body = new char[limit - start];
274 value.getChars(start, limit, body, 0);
275 symbolTable.add(var, body);
// Symbol table handed to the UnicodeSet parser so that patterns can refer to
// variables defined through addSymbol().
278 public class TokenSymbolTable implements SymbolTable {
// Variable name (without the leading '$') mapped to its body as a char array.
279 Map contents = new HashMap();
// Every name passed to lookup(), stored with a '$' prefix.
280 Set itemsLookedUp = new HashSet();
// Registers a variable. var is expected to start with '$'; the first
// character is dropped to form the key. body is stored as given, not copied.
282 public void add(String var, char[] body) {
283 // start from 1 to avoid the $
284 contents.put(var.substring(1), body);
288 * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
// Looks up a variable by name (without the '$'). Every request is recorded in
// itemsLookedUp, whether or not the variable exists. Returns the stored body,
// or null when the name is not defined.
290 public char[] lookup(String s) {
291 itemsLookedUp.add('$' + s);
292 return (char[])contents.get(s);
296 * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
// Unimplemented generated stub; its return statement is not visible in this excerpt.
298 public UnicodeMatcher lookupMatcher(int ch) {
299 // TODO Auto-generated method stub
304 * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
// Parses a variable name out of text. Starting at pos, it scans whole code
// points up to limit and stops at the first one that is not a Unicode
// identifier part. It returns the scanned name, which is empty if the very
// first character does not qualify. The loop exit and any update of pos are
// on lines not visible in this excerpt.
306 public String parseReference(String text, ParsePosition pos, int limit) {
308 int start = pos.getIndex();
// Step by the char count of each code point so surrogate pairs are not split.
310 for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
311 cp = UTF16.charAt(text, i);
312 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
317 return text.substring(start,i);