2 *******************************************************************************
\r
3 * Copyright (C) 2002-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.util;
\r
9 import java.text.ParsePosition;
\r
10 import java.util.HashMap;
\r
11 import java.util.HashSet;
\r
12 import java.util.Map;
\r
13 import java.util.Set;
\r
15 import com.ibm.icu.lang.UCharacter;
\r
16 import com.ibm.icu.text.SymbolTable;
\r
17 import com.ibm.icu.text.UTF16;
\r
18 import com.ibm.icu.text.UnicodeMatcher;
\r
19 import com.ibm.icu.text.UnicodeSet;
\r
// Tokenizer over a String. next() scans `source` starting at `index` and
// reports each token as an int code: a syntax code point is returned as
// itself, other tokens as one of the token-type constants (NUMBER, STRING,
// UNICODESET, ...). The token's value is left in `number`, `buffer` or
// `unicodeSet` respectively.
// NOTE(review): this listing carries the original line number as a prefix on
// every line and omits some original lines (the numbers skip) -- confirm
// details against the pristine source file.
21 public class Tokenizer {
\r
// Text being tokenized; assigned by setSource().
22 protected String source;
\r
// Text of the current string token; next() clears it before scanning a
// string and getString() returns its contents.
24 protected StringBuffer buffer = new StringBuffer();
\r
// Value assigned by next() while scanning a run of decimal digits.
25 protected long number;
\r
// Set parsed by next() when it returns UNICODESET.
26 protected UnicodeSet unicodeSet = null;
\r
// Current read offset into `source`, in UTF-16 code units; nextChar()
// advances it by the char count of the code point it reads.
27 protected int index;
\r
// backup() throws if this flag is already set.
28 boolean backedup = false;
\r
// NOTE(review): no use of lastIndex is visible in this listing; presumably
// the offset next() records so that backup() can return to it -- confirm.
29 protected int lastIndex = -1;
\r
// NOTE(review): no use of nextIndex is visible in this listing.
30 protected int nextIndex;
\r
// Code most recently returned by next(); every visible return in next()
// assigns it.
31 int lastValue = BACKEDUP_TOO_FAR;
\r
// Variable table handed to the UnicodeSet parser in next(); also records
// which names were looked up (see getLookedUpItems()).
32 TokenSymbolTable symbolTable = new TokenSymbolTable();
\r
// NOTE(review): the declarators of this char declaration (original lines
// 35-36) are not shown; QUOTE and BSLASH are used below and in next() and are
// presumably declared here -- confirm.
34 private static final char
\r
// The two quoting characters, QUOTE and BSLASH.
37 private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
\r
// Default whitespace set (initial value of the `whiteSpace` field).
// NOTE(review): the closing part of this pattern (original line 40) is not shown.
38 private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +
\r
39 "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +
\r
// Default syntax set (initial value of the `syntax` field): the listed
// punctuation/symbol ranges with the QUOTERS and '$' taken out.
41 private static final UnicodeSet SYNTAX = new UnicodeSet("[" +
\r
42 "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +
\r
43 "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +
\r
44 "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +
\r
45 "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +
\r
46 "\\u3001\\u3003\\u3008-\\u3020\\u3030" +
\r
47 "\\uFD3E\\uFD3F\\uFE45\\uFE46" +
\r
48 "]").removeAll(QUOTERS).remove('$');
\r
// Line terminators; next() clears its in-comment flag when it reads one.
49 private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
\r
50 //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
\r
// Default for the `non_string` field.
// NOTE(review): the rest of this initializer (original lines 52-54) is not
// shown; fixSets() builds the per-instance equivalent as syntax + whiteSpace.
51 private static final UnicodeSet NON_STRING = new UnicodeSet()
\r
// Per-instance character classes; each starts out referring to the
// corresponding static default (WHITESPACE, SYNTAX, NON_STRING).

// Code points next() skips before a token.
55 protected UnicodeSet whiteSpace = WHITESPACE;
\r
// Code points next() returns as single-character tokens.
56 protected UnicodeSet syntax = SYNTAX;
\r
// Code points that stop an unquoted string in next(): on meeting one, next()
// steps `index` back over it.
57 private UnicodeSet non_string = NON_STRING;
\r
// Re-normalizes the per-instance sets so they do not overlap:
//  - `syntax` loses the QUOTERS and anything in `whiteSpace`;
//  - `whiteSpace` loses the QUOTERS;
//  - `non_string` is rebuilt as a new set holding syntax plus whiteSpace.
// Each removal is done on a clone, so the set object the field referred to
// before the call is left unmodified.
// NOTE(review): no caller is visible in this listing; presumably invoked by
// setSyntax()/setWhiteSpace() -- confirm.
59 private void fixSets() {
\r
60 if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {
\r
61 syntax = ((UnicodeSet)syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);
\r
63 if (whiteSpace.containsSome(QUOTERS)) {
\r
64 whiteSpace = ((UnicodeSet)whiteSpace.clone()).removeAll(QUOTERS);
\r
66 non_string = new UnicodeSet(syntax)
\r
67 .addAll(whiteSpace);
\r
// Sets the text to tokenize.
// @param source the string that next() will scan
// @return this tokenizer, so calls can be chained
// NOTE(review): original line 72 is not shown in this listing; presumably it
// resets `index` -- confirm.
70 public Tokenizer setSource(String source) {
\r
71 this.source = source;
\r
73 return this; // for chaining
\r
// @param index offset into the source
// @return this tokenizer, so calls can be chained
// NOTE(review): the statement that uses `index` (original line 77) is not
// shown; presumably it assigns the `index` field -- confirm.
76 public Tokenizer setIndex(int index) {
\r
78 return this; // for chaining
\r
// Token-type codes produced by next() and accepted by toString(int, boolean).
// NOTE(review): the first entries (original lines 82-85) are not shown; DONE,
// NUMBER, STRING and UNICODESET are used elsewhere in this class and are
// presumably declared there -- confirm.
81 public static final int
\r
// Returned by next() when input ends while the scanner state is above IN_STRING.
86 UNTERMINATED_QUOTE = -5,
\r
// Initial value of lastValue; toString(int, boolean) renders it as "Illegal Backup".
87 BACKEDUP_TOO_FAR = -6;
\r
// Scanner states for the string-scanning part of next().
// NOTE(review): only AFTER_QUOTE is shown; IN_STRING, AFTER_BSLASH and
// IN_QUOTE are used by next() and presumably declared on the missing lines.
89 private static final int
\r
93 AFTER_QUOTE = 3, // warning: order is important for switch statement
\r
// Builds a debugging description of a token. The text is wrapped on both
// sides in a marker: "@" when the `backedup` field is set, "*" otherwise.
// @param type token code, as returned by next()
// @param backedupBefore NOTE(review): not referenced in any line shown here;
//        the marker is chosen from the field `backedup` instead -- confirm
//        which of the two was intended.
// @return the marker-wrapped description
// NOTE(review): the switch header and several case labels (original lines
// 100-102, 107, 109, 111, 113) are not shown in this listing.
98 public String toString(int type, boolean backedupBefore) {
\r
99 String s = backedup ? "@" : "*";
\r
103 case BACKEDUP_TOO_FAR:
\r
104 return s+"Illegal Backup"+s;
\r
105 case UNTERMINATED_QUOTE:
\r
106 return s+"Unterminated Quote=" + getString() + s;
\r
108 return s+"s=" + getString() + s;
\r
110 return s+"n=" + getNumber() + s;
\r
// NOTE(review): uses the same "n=" label as the number branch just before
// it -- confirm whether a distinct label was intended for the UnicodeSet.
112 return s+"n=" + getUnicodeSet() + s;
\r
// Describes `type` through the shared formatter's getName(type, true);
// presumably the default branch, for single code point tokens.
114 return s+"c=" + usf.getName(type,true) + s;
\r
// Shared formatter; toString(int, boolean) calls its getName(type, true) to
// describe a token code.
118 private static final BagFormatter usf = new BagFormatter();
\r
// Steps the tokenizer back; only one level is allowed.
// @throws IllegalArgumentException ("backup too far") if `backedup` is
//         already set when this is called
// NOTE(review): the statements after the guard (original lines 122-126) are
// not shown in this listing.
120 public void backup() {
\r
121 if (backedup) throw new IllegalArgumentException("backup too far");
\r
// Debugging variant of next(): reads the next token and prints its
// description (see toString(int, boolean)) to System.out. The value of
// `backedup` is captured before next() runs and passed as backedupBefore.
// NOTE(review): the return statement is not shown; presumably returns `result`.
128 public int next2() {
\r
129 boolean backedupBefore = backedup;
\r
130 int result = next();
\r
131 System.out.println(toString(result, backedupBefore));
\r
// Scans and returns the next token. Outcomes visible in this listing:
//   DONE               - end of input reached while skipping whitespace/comments
//   UNICODESET         - a UnicodeSet was parsed; value in `unicodeSet`
//   a syntax code point - returned as itself
//   NUMBER             - a run of decimal digits; value in `number`
//   STRING             - text in `buffer`
//   UNTERMINATED_QUOTE - input ended with the scanner state above IN_STRING
// Every visible return also stores the code in `lastValue`.
// @throws IllegalArgumentException ("Internal Error") from the default branch
//         of the string-state switch
// NOTE(review): many original lines of this method are absent from this
// listing (the embedded line numbers skip); the notes below describe only
// what is shown.
136 public int next() {
\r
143 boolean inComment = false;
\r
144 // clean off any leading whitespace or comments
\r
146 if (index >= source.length()) return lastValue = DONE;
\r
// A line terminator clears the comment flag, '#' sets it, and the first
// code point that is not whitespace ends the skipping.
// NOTE(review): the loop header and the branching around these three tests
// are not shown.
149 if (NEWLINE.contains(cp)) inComment = false;
\r
151 if (cp == '#') inComment = true;
\r
152 else if (!whiteSpace.contains(cp)) break;
\r
155 // record the last index in case we have to backup
\r
// UnicodeSet token: parsing starts one code unit before `index`, and `index`
// then moves to where the parser stopped. Variable references are resolved
// through symbolTable.
// NOTE(review): the condition guarding this branch (original line 158) is not shown.
159 ParsePosition pos = new ParsePosition(index-1);
\r
160 unicodeSet = new UnicodeSet(source,pos,symbolTable);
\r
161 index = pos.getIndex();
\r
162 return lastValue = UNICODESET;
\r
164 // get syntax character
\r
165 if (syntax.contains(cp)) return lastValue = cp;
\r
167 // get number, if there is one
\r
168 if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
\r
169 number = UCharacter.getNumericValue(cp);
\r
170 while (index < source.length()) {
\r
172 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
\r
// Un-read the non-digit so the next call sees it again.
173 index -= UTF16.getCharCount(cp); // BACKUP!
\r
// NOTE(review): original lines 174-176 are not shown; any scaling of `number`
// before this addition would be there -- confirm.
177 number += UCharacter.getNumericValue(cp);
\r
179 return lastValue = NUMBER;
\r
// String token: start with an empty buffer in the IN_STRING state.
181 buffer.setLength(0);
\r
182 int status = IN_STRING;
\r
186 case AFTER_QUOTE: // check for double ''?
\r
188 UTF16.append(buffer, QUOTE);
\r
192 // OTHERWISE FALL THROUGH!!!
\r
// Unquoted text: QUOTE and BSLASH switch state, a non_string code point is
// un-read, anything else is appended to the buffer.
194 if (cp == QUOTE) status = IN_QUOTE;
\r
195 else if (cp == BSLASH) status = AFTER_BSLASH;
\r
196 else if (non_string.contains(cp)) {
\r
197 index -= UTF16.getCharCount(cp); // BACKUP!
\r
199 } else UTF16.append(buffer,cp);
\r
// A QUOTE moves to AFTER_QUOTE; any other code point is appended.
// NOTE(review): the case label for these two lines is not shown; presumably IN_QUOTE.
202 if (cp == QUOTE) status = AFTER_QUOTE;
\r
203 else UTF16.append(buffer,cp);
\r
// Escapes: n, r and t are replaced by newline, carriage return and tab; the
// resulting code point is appended and the state returns to IN_STRING.
207 case 'n': cp = '\n'; break;
\r
208 case 'r': cp = '\r'; break;
\r
209 case 't': cp = '\t'; break;
\r
211 UTF16.append(buffer,cp);
\r
212 status = IN_STRING;
\r
214 default: throw new IllegalArgumentException("Internal Error");
\r
216 if (index >= source.length()) break;
\r
// Input ended in a state numbered above IN_STRING.
// NOTE(review): the values of the states other than AFTER_QUOTE are not
// shown in this listing.
219 if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;
\r
220 return lastValue = STRING;
\r
// @return the current contents of `buffer` as a String, i.e. the text that
//         next() accumulated for the most recent string token
223 public String getString() {
\r
224 return buffer.toString();
\r
// Shows the scan position for debugging.
// @return the source text with "$$$" inserted at the current `index`
227 public String toString() {
\r
228 return source.substring(0,index) + "$$$" + source.substring(index);
\r
// NOTE(review): body not shown in this listing; presumably returns the
// `number` field that next() fills for a NUMBER token -- confirm.
231 public long getNumber() {
\r
// NOTE(review): body not shown in this listing; presumably returns the
// `unicodeSet` field that next() fills for a UNICODESET token -- confirm.
235 public UnicodeSet getUnicodeSet() {
\r
// Reads the code point at `index` with UTF16.charAt and advances `index` by
// that code point's char count.
// NOTE(review): the return statement is not shown; presumably returns `cp`.
239 private int nextChar() {
\r
240 int cp = UTF16.charAt(source,index);
\r
241 index += UTF16.getCharCount(cp);
\r
// NOTE(review): body not shown in this listing; presumably returns the
// current `index` -- confirm.
244 public int getIndex() {
\r
// NOTE(review): body not shown in this listing; presumably returns the
// `source` string -- confirm.
247 public String getSource() {
\r
// NOTE(review): body not shown in this listing; presumably returns the
// `syntax` set -- confirm (and whether it is returned without copying).
250 public UnicodeSet getSyntax() {
\r
// NOTE(review): body not shown in this listing; presumably returns the
// `whiteSpace` set -- confirm (and whether it is returned without copying).
253 public UnicodeSet getWhiteSpace() {
\r
// NOTE(review): body not shown in this listing; presumably stores `set` in
// the `syntax` field and re-runs fixSets() -- confirm.
256 public void setSyntax(UnicodeSet set) {
\r
// NOTE(review): body not shown in this listing; presumably stores `set` in
// the `whiteSpace` field and re-runs fixSets() -- confirm.
260 public void setWhiteSpace(UnicodeSet set) {
\r
// @return the symbol table's own `itemsLookedUp` set (not a copy); lookup()
//         adds each requested name to it with '$' prepended
265 public Set getLookedUpItems() {
\r
266 return symbolTable.itemsLookedUp;
\r
// Defines a variable: copies value[start, limit) into a new char[] and
// registers it in symbolTable under `var`. TokenSymbolTable.add() stores the
// name without its first character (the '$').
// @param var   variable name, including the leading '$'
// @param value string containing the variable's body
// @param start offset of the first body character in `value`
// @param limit offset after the body
// NOTE(review): the comment below announces an adjustment of `limit`, but the
// statement doing it (original line 271) is not shown in this listing.
269 public void addSymbol(String var, String value, int start, int limit) {
\r
270 // the limit is after the ';', so remove it
\r
272 char[] body = new char[limit - start];
\r
273 value.getChars(start, limit, body, 0);
\r
274 symbolTable.add(var, body);
\r
// SymbolTable implementation backed by a HashMap, which additionally records
// every name that was looked up.
// NOTE(review): several original lines of this class (comment delimiters,
// closing braces, some statements) are not shown in this listing.
277 public class TokenSymbolTable implements SymbolTable {
\r
// Variable name (without the leading '$') -> char[] body.
278 Map contents = new HashMap();
\r
// Names passed to lookup(), each stored with '$' prepended.
279 Set itemsLookedUp = new HashSet();
\r
// Registers `body` under `var` minus its first character.
281 public void add(String var, char[] body) {
\r
282 // start from 1 to avoid the $
\r
283 contents.put(var.substring(1), body);
\r
287 * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
\r
// Records the request in itemsLookedUp, then returns the stored body, or
// null when `s` has no entry in `contents`.
289 public char[] lookup(String s) {
\r
290 itemsLookedUp.add('$' + s);
\r
291 return (char[])contents.get(s);
\r
295 * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
\r
// NOTE(review): body not shown in this listing; still marked as a generated stub.
297 public UnicodeMatcher lookupMatcher(int ch) {
\r
298 // TODO Auto-generated method stub
\r
303 * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
\r
// Scans `text` from pos.getIndex() towards `limit`, one code point at a
// time, testing each with isUnicodeIdentifierPart, and returns the substring
// from the start position to where the scan stopped.
// NOTE(review): what happens when a non-identifier code point is found, and
// whether `pos` is updated, is on lines not shown (original 312-315).
305 public String parseReference(String text, ParsePosition pos, int limit) {
\r
307 int start = pos.getIndex();
\r
309 for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
\r
310 cp = UTF16.charAt(text, i);
\r
311 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
\r
316 return text.substring(start,i);
\r