2 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
5 *******************************************************************************
\r
6 * Copyright (C) 2002-2009, International Business Machines Corporation and *
\r
7 * others. All Rights Reserved. *
\r
8 *******************************************************************************
\r
10 package com.ibm.icu.dev.test.util;
\r
12 import java.text.ParsePosition;
\r
14 import com.ibm.icu.text.*;
\r
15 import com.ibm.icu.lang.*;
\r
17 import java.util.HashMap;
\r
18 import java.util.HashSet;
\r
19 import java.util.Set;
\r
20 import java.util.Map;
\r
22 public class Tokenizer {
\r
// ---- Scanner state ----
// Text being tokenized; assigned by setSource().
23 protected String source;
\r
// Receives the characters of the current string token; getString() returns its contents.
25 protected StringBuffer buffer = new StringBuffer();
\r
// Value of the most recent numeric token; assigned in next().
26 protected long number;
\r
// Most recent UnicodeSet token; assigned in next() when a set is parsed from the source.
27 protected UnicodeSet unicodeSet = null;
\r
// Current read position in source, in UTF-16 code units; nextChar() advances it.
28 protected int index;
\r
// Backup flag; backup() throws when it is already true, and the debug toString reads it.
29 boolean backedup = false;
\r
// NOTE(review): the lines that read/write lastIndex and nextIndex are not visible in this
// excerpt; presumably they bracket the last token so backup() can rewind - confirm.
30 protected int lastIndex = -1;
\r
31 protected int nextIndex;
\r
// Type code of the most recent token; every return in next() assigns it first.
32 int lastValue = BACKEDUP_TOO_FAR;
\r
// Variable table handed to the UnicodeSet parser in next(); filled through addSymbol().
33 TokenSymbolTable symbolTable = new TokenSymbolTable();
\r
// ---- Shared default character classes ----
// NOTE(review): several declarations in this run are truncated in this excerpt (the char
// constants, the tail of WHITESPACE, and the continuation of NON_STRING are not visible).
35 private static final char
\r
// Characters that start quoting or escaping: QUOTE and BSLASH.
38 private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH);
\r
// Default whitespace pattern; the visible part lists U+0009-U+000D, U+0020, U+0085,
// U+200E, U+200F, U+2028 and U+2029.
39 private static final UnicodeSet WHITESPACE = new UnicodeSet("[" +
\r
40 "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" +
\r
// Default syntax characters: the listed ranges, minus everything in QUOTERS and minus '$'.
42 private static final UnicodeSet SYNTAX = new UnicodeSet("[" +
\r
43 "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" +
\r
44 "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" +
\r
45 "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" +
\r
46 "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" +
\r
47 "\\u3001\\u3003\\u3008-\\u3020\\u3030" +
\r
48 "\\uFD3E\\uFD3F\\uFE45\\uFE46" +
\r
49 "]").removeAll(QUOTERS).remove('$');
\r
// Line terminators (U+000A, U+000D, U+0085, U+2028, U+2029); next() uses this set to
// end a '#' comment.
50 private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]");
\r
51 //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]");
\r
// Default for the non_string field, which next() consults to end an unquoted string.
52 private static final UnicodeSet NON_STRING = new UnicodeSet()
\r
// Per-instance character classes, initialised to the shared static defaults.
// fixSets() swaps in modified clones/copies instead of mutating the sets these
// fields currently reference.
56 protected UnicodeSet whiteSpace = WHITESPACE;
\r
57 protected UnicodeSet syntax = SYNTAX;
\r
58 private UnicodeSet non_string = NON_STRING;
\r
// Re-establishes the relationships between the per-instance sets:
// syntax must not overlap QUOTERS or whiteSpace, whiteSpace must not overlap QUOTERS,
// and non_string is rebuilt from syntax plus whiteSpace.
60 private void fixSets() {
\r
61 if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) {
\r
// Work on a clone so the set previously referenced by `syntax` is left untouched.
62 syntax = ((UnicodeSet)syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace);
\r
64 if (whiteSpace.containsSome(QUOTERS)) {
\r
// Same clone-then-remove approach for the whitespace set.
65 whiteSpace = ((UnicodeSet)whiteSpace.clone()).removeAll(QUOTERS);
\r
// NOTE(review): this chained expression continues past the lines visible in this
// excerpt; further additions to non_string may follow - confirm against the full file.
67 non_string = new UnicodeSet(syntax)
\r
68 .addAll(whiteSpace);
\r
// Sets the text to tokenize and returns this tokenizer so calls can be chained.
// NOTE(review): one statement between the assignment and the return is not visible in
// this excerpt; presumably it resets the read position - confirm.
71 public Tokenizer setSource(String source) {
\r
72 this.source = source;
\r
74 return this; // for chaining
\r
// Returns this tokenizer so calls can be chained.
// NOTE(review): the statement that stores `index` is not visible in this excerpt.
77 public Tokenizer setIndex(int index) {
\r
79 return this; // for chaining
\r
// Token type codes stored in lastValue and returned by next(). Only the last two names
// of this list are visible in this excerpt; both are negative, which keeps them distinct
// from the code point values next() returns for syntax characters.
82 public static final int
\r
87 UNTERMINATED_QUOTE = -5,
\r
88 BACKEDUP_TOO_FAR = -6;
\r
// Internal states of the string-scanning loop in next(). The numeric order matters:
// next() tests `status > IN_STRING` to detect an unterminated quote.
90 private static final int
\r
94 AFTER_QUOTE = 3, // warning: order is important for switch statement
\r
// Builds a debug description of a token type code, wrapped on both sides in a marker
// string ("@" or "*").
// NOTE(review): the marker is chosen from the `backedup` field; the `backedupBefore`
// parameter is not read in any line visible here, although next2() captures that flag
// before calling next() and passes it in. Presumably the parameter was meant to select
// the marker - confirm and fix in the full file.
// NOTE(review): the switch header and most case labels are not visible in this excerpt.
99 public String toString(int type, boolean backedupBefore) {
\r
100 String s = backedup ? "@" : "*";
\r
104 case BACKEDUP_TOO_FAR:
\r
105 return s+"Illegal Backup"+s;
\r
106 case UNTERMINATED_QUOTE:
\r
107 return s+"Unterminated Quote=" + getString() + s;
\r
// String payload.
109 return s+"s=" + getString() + s;
\r
// Numeric payload.
111 return s+"n=" + getNumber() + s;
\r
// UnicodeSet payload. NOTE(review): uses the same "n=" prefix as the numeric case above.
113 return s+"n=" + getUnicodeSet() + s;
\r
// Any other type value is passed to the formatter to obtain a name for it.
115 return s+"c=" + usf.getName(type,true) + s;
\r
// Shared formatter; toString(int, boolean) calls its getName(type, true) for type values
// that have no dedicated description.
119 private static final BagFormatter usf = new BagFormatter();
\r
// Rejects a backup request when the tokenizer is already in the backed-up state by
// throwing IllegalArgumentException("backup too far").
// NOTE(review): the statements that perform the backup are not visible in this excerpt.
121 public void backup() {
\r
122 if (backedup) throw new IllegalArgumentException("backup too far");
\r
// Debugging variant of next(): remembers the backed-up flag as it was before the call,
// fetches the next token, and prints a description of it to System.out.
// NOTE(review): the return statement is not visible in this excerpt.
129 public int next2() {
\r
130 boolean backedupBefore = backedup;
\r
131 int result = next();
\r
132 System.out.println(toString(result, backedupBefore));
\r
// Scans the next token from source starting at index and returns its type code, which is
// also stored in lastValue. Visible outcomes: DONE at end of input, UNICODESET, a syntax
// character's own code point, NUMBER, STRING, or UNTERMINATED_QUOTE.
// NOTE(review): many lines of this method (loop headers, the switch header, several case
// labels and closing braces) are not visible in this excerpt; the comments below describe
// only the statements that are.
137 public int next() {
\r
// True while skipping the remainder of a '#' comment.
144 boolean inComment = false;
\r
145 // clean off any leading whitespace or comments
\r
147 if (index >= source.length()) return lastValue = DONE;
\r
// A line terminator ends a comment.
150 if (NEWLINE.contains(cp)) inComment = false;
\r
// '#' starts a comment; the first character that is not whitespace ends the skipping.
152 if (cp == '#') inComment = true;
\r
153 else if (!whiteSpace.contains(cp)) break;
\r
156 // record the last index in case we have to backup
\r
// UnicodeSet token: parsing restarts at index-1, and index is then moved to wherever
// the UnicodeSet parser stopped. NOTE(review): the condition guarding this branch is
// not visible in this excerpt.
160 ParsePosition pos = new ParsePosition(index-1);
\r
161 unicodeSet = new UnicodeSet(source,pos,symbolTable);
\r
162 index = pos.getIndex();
\r
163 return lastValue = UNICODESET;
\r
165 // get syntax character
\r
// A syntax character is its own token type: the code point itself is returned.
166 if (syntax.contains(cp)) return lastValue = cp;
\r
168 // get number, if there is one
\r
169 if (UCharacter.getType(cp) == Character.DECIMAL_DIGIT_NUMBER) {
\r
170 number = UCharacter.getNumericValue(cp);
\r
171 while (index < source.length()) {
\r
173 if (UCharacter.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
\r
// Un-read the non-digit so it is seen again by the next call.
174 index -= UTF16.getCharCount(cp); // BACKUP!
\r
// NOTE(review): only the addition of the digit value is visible here; a scaling step
// (e.g. multiplying number by 10) is not visible in this excerpt - confirm it exists.
178 number += UCharacter.getNumericValue(cp);
\r
180 return lastValue = NUMBER;
\r
// Otherwise scan a string token: clear the buffer and run the quote/escape state machine.
182 buffer.setLength(0);
\r
183 int status = IN_STRING;
\r
187 case AFTER_QUOTE: // check for double ''?
\r
189 UTF16.append(buffer, QUOTE);
\r
193 // OTHERWISE FALL THROUGH!!!
\r
// A quote or backslash changes state; a character in non_string is un-read (ending the
// string); anything else is appended to the buffer.
195 if (cp == QUOTE) status = IN_QUOTE;
\r
196 else if (cp == BSLASH) status = AFTER_BSLASH;
\r
197 else if (non_string.contains(cp)) {
\r
198 index -= UTF16.getCharCount(cp); // BACKUP!
\r
200 } else UTF16.append(buffer,cp);
\r
// A quote moves to AFTER_QUOTE; any other character is appended as-is.
203 if (cp == QUOTE) status = AFTER_QUOTE;
\r
204 else UTF16.append(buffer,cp);
\r
// Escape letters n, r, t are replaced by newline, carriage return and tab.
208 case 'n': cp = '\n'; break;
\r
209 case 'r': cp = '\r'; break;
\r
210 case 't': cp = '\t'; break;
\r
// The (possibly translated) character is appended and scanning returns to IN_STRING.
212 UTF16.append(buffer,cp);
\r
213 status = IN_STRING;
\r
215 default: throw new IllegalArgumentException("Internal Error");
\r
217 if (index >= source.length()) break;
\r
// Ending in any state numerically above IN_STRING is reported as an unterminated quote.
220 if (status > IN_STRING) return lastValue = UNTERMINATED_QUOTE;
\r
221 return lastValue = STRING;
\r
// Returns the current contents of buffer (the text collected for the latest string token)
// as a String.
224 public String getString() {
\r
225 return buffer.toString();
\r
// Returns the source text with the marker "$$$" inserted at the current read position.
228 public String toString() {
\r
229 return source.substring(0,index) + "$$$" + source.substring(index);
\r
232 public long getNumber() {
\r
236 public UnicodeSet getUnicodeSet() {
\r
// Reads the code point at the current index and advances index past it by the number of
// UTF-16 code units that code point occupies.
// NOTE(review): the return statement is not visible in this excerpt.
240 private int nextChar() {
\r
241 int cp = UTF16.charAt(source,index);
\r
242 index += UTF16.getCharCount(cp);
\r
245 public int getIndex() {
\r
248 public String getSource() {
\r
251 public UnicodeSet getSyntax() {
\r
254 public UnicodeSet getWhiteSpace() {
\r
257 public void setSyntax(UnicodeSet set) {
\r
261 public void setWhiteSpace(UnicodeSet set) {
\r
// Returns the symbol table's own itemsLookedUp set (not a copy); the table's lookup()
// adds '$' + name to it on every call.
266 public Set getLookedUpItems() {
\r
267 return symbolTable.itemsLookedUp;
\r
// Registers a variable in the symbol table: copies the characters value[start, limit)
// into a fresh char[] and stores that array under `var`.
270 public void addSymbol(String var, String value, int start, int limit) {
\r
// NOTE(review): the statement that adjusts `limit` as described in the next comment is
// not visible in this excerpt.
271 // the limit is after the ';', so remove it
\r
273 char[] body = new char[limit - start];
\r
274 value.getChars(start, limit, body, 0);
\r
275 symbolTable.add(var, body);
\r
// SymbolTable implementation backing this tokenizer's variables. It stores variable
// bodies by name and records every name that is looked up.
// NOTE(review): closing braces and parts of the method bodies are not visible in this
// excerpt.
278 public class TokenSymbolTable implements SymbolTable {
\r
// Variable name (first character stripped, see add) -> char[] body.
279 Map contents = new HashMap();
\r
// Every name passed to lookup(), stored with a '$' prepended.
280 Set itemsLookedUp = new HashSet();
\r
// Stores `body` under `var` with its first character removed.
282 public void add(String var, char[] body) {
\r
283 // start from 1 to avoid the $
\r
284 contents.put(var.substring(1), body);
\r
288 * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
\r
// Records the lookup, then returns the stored body for `s`, or null when the map has no
// entry for it.
290 public char[] lookup(String s) {
\r
291 itemsLookedUp.add('$' + s);
\r
292 return (char[])contents.get(s);
\r
296 * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
\r
// Stub; its body is not visible in this excerpt.
298 public UnicodeMatcher lookupMatcher(int ch) {
\r
299 // TODO Auto-generated method stub
\r
304 * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
\r
// Scans `text` code point by code point from pos.getIndex() towards `limit`, testing each
// for being a Unicode identifier part, and returns text.substring(start, i).
// NOTE(review): the body of the `if` and any update of `pos` are not visible in this
// excerpt; presumably the loop stops at the first non-identifier character - confirm.
306 public String parseReference(String text, ParsePosition pos, int limit) {
\r
308 int start = pos.getIndex();
\r
310 for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
\r
311 cp = UTF16.charAt(text, i);
\r
312 if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
\r
317 return text.substring(start,i);
\r