3 *******************************************************************************
4 * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
5 * and others. All Rights Reserved. *
6 *******************************************************************************
8 package com.ibm.icu.impl;
10 import com.ibm.icu.text.UTF16;
11 import com.ibm.icu.text.UnicodeSet;
14 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
15 * The '' (two quotes) is treated as a single quote, inside or outside a quote.
17 * <li>Any ignorable characters are ignored in parsing.</li>
18 * <li>Any syntax characters are broken into separate tokens</li>
19 * <li>Quote characters can be specified: '...', "...", and \x </li>
20 * <li>Other characters are treated as literals</li>
23 public class PatternTokenizer {
24 // settings used in the interpretation of the pattern
// Tested per code point in next(); also folded into the set of characters that quoteLiteral() quotes.
25 private UnicodeSet ignorableCharacters = new UnicodeSet();
// Tested per code point in next() to recognize syntax tokens; also folded into the set that quoteLiteral() quotes.
26 private UnicodeSet syntaxCharacters = new UnicodeSet();
// Additional characters that quoteLiteral() folds into the set needing quotes.
27 private UnicodeSet extraQuotingCharacters = new UnicodeSet();
// Characters that quoteLiteral() always writes through appendEscaped() instead of copying or quoting.
28 private UnicodeSet escapeCharacters = new UnicodeSet();
// When true, quoteLiteral() adds BACK_SLASH to the set of characters needing quotes.
29 private boolean usingSlash = false;
// When true, quoteLiteral() adds SINGLE_QUOTE to the set needing quotes, and next() treats a single quote as the start of a quotation.
30 private boolean usingQuote = false;
32 // transient data, set when needed. Null it out for any changes in the above fields.
// Built lazily by quoteLiteral() from syntaxCharacters, ignorableCharacters and extraQuotingCharacters
// (plus backslash / single quote when enabled); the setters reset it to null.
33 private transient UnicodeSet needingQuoteCharacters = null;
35 // data about the current pattern being parsed. start gets moved as we go along.
// NOTE(review): `start` and `limit` are read and written by setPattern(), normalize() and next(),
// but their declarations are not visible in this excerpt — confirm against the full file.
// The pattern text: assigned by setPattern(String) and read by next().
38 private String pattern;
// Returns a clone of the ignorable-character set rather than the internal instance.
40 public UnicodeSet getIgnorableCharacters() {
41 return (UnicodeSet) ignorableCharacters.clone();
44 * Sets the characters to be ignored in parsing, e.g. new UnicodeSet("[:pattern_whitespace:]");
45 * @param ignorableCharacters
// Stores a clone of the given set as the ignorable characters and clears the cached
// needingQuoteCharacters so that quoteLiteral() rebuilds it on next use.
// The argument is dereferenced by clone() without a null check.
// NOTE(review): declared to return PatternTokenizer, but the return statement is not visible in this
// excerpt — presumably `return this` for call chaining; confirm.
48 public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
49 this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
50 needingQuoteCharacters = null;
// Returns a clone of the syntax-character set rather than the internal instance.
53 public UnicodeSet getSyntaxCharacters() {
54 return (UnicodeSet) syntaxCharacters.clone();
// Returns a clone of the extra-quoting-character set rather than the internal instance.
56 public UnicodeSet getExtraQuotingCharacters() {
57 return (UnicodeSet) extraQuotingCharacters.clone();
60 * Sets the characters to be interpreted as syntax characters in parsing, e.g. new UnicodeSet("[:pattern_syntax:]")
61 * @param syntaxCharacters
// Stores a clone of the given set as the syntax characters and clears the cached
// needingQuoteCharacters so that quoteLiteral() rebuilds it on next use.
// The argument is dereferenced by clone() without a null check.
// NOTE(review): the return statement is not visible in this excerpt — presumably `return this`; confirm.
64 public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
65 this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
66 needingQuoteCharacters = null;
70 * Sets the extra characters to be quoted in literals
71 * @param syntaxCharacters
// Stores a clone of the given set as the extra quoting characters and clears the cached
// needingQuoteCharacters so that quoteLiteral() rebuilds it on next use.
// NOTE(review): the parameter is named syntaxCharacters but is assigned to extraQuotingCharacters;
// the name is misleading (it is kept here because it is part of the visible signature).
// NOTE(review): the return statement is not visible in this excerpt — presumably `return this`; confirm.
74 public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
75 this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
76 needingQuoteCharacters = null;
// Returns a clone of the escape-character set rather than the internal instance.
80 public UnicodeSet getEscapeCharacters() {
81 return (UnicodeSet) escapeCharacters.clone();
84 * Set characters to be escaped in literals, in quoteLiteral and normalize, e.g. new UnicodeSet("[^\\u0020-\\u007E]");
85 * @param escapeCharacters
// Stores a clone of the given set as the characters that quoteLiteral() writes via appendEscaped().
// No reset of needingQuoteCharacters is visible here; the cached set built in quoteLiteral() is
// composed from the syntax, ignorable and extra-quoting sets, not from escapeCharacters.
// NOTE(review): the return statement is not visible in this excerpt — presumably `return this`; confirm.
88 public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
89 this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
// Accessor for the single-quote quoting setting.
// NOTE(review): the body is not visible in this excerpt — presumably returns the usingQuote field; confirm.
92 public boolean isUsingQuote() {
// Enables or disables single-quote quoting and clears the cached needingQuoteCharacters,
// because quoteLiteral() adds SINGLE_QUOTE to that set only when usingQuote is true.
// NOTE(review): the return statement is not visible in this excerpt — presumably `return this`; confirm.
95 public PatternTokenizer setUsingQuote(boolean usingQuote) {
96 this.usingQuote = usingQuote;
97 needingQuoteCharacters = null;
// Accessor for the backslash quoting setting.
// NOTE(review): the body is not visible in this excerpt — presumably returns the usingSlash field; confirm.
100 public boolean isUsingSlash() {
// Enables or disables backslash quoting and clears the cached needingQuoteCharacters,
// because quoteLiteral() adds BACK_SLASH to that set only when usingSlash is true.
// NOTE(review): the return statement is not visible in this excerpt — presumably `return this`; confirm.
103 public PatternTokenizer setUsingSlash(boolean usingSlash) {
104 this.usingSlash = usingSlash;
105 needingQuoteCharacters = null;
108 // public UnicodeSet getQuoteCharacters() {
109 // return (UnicodeSet) quoteCharacters.clone();
111 // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
112 // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
113 // needingQuoteCharacters = null;
// Accessor for the parse limit (next() returns DONE once start >= limit).
// NOTE(review): the body is not visible in this excerpt — presumably returns the limit field; confirm.
116 public int getLimit() {
// Mutator for the parse limit.
// NOTE(review): the body is not visible in this excerpt — presumably assigns the limit field and
// returns this; whether the value is range-checked against the pattern length cannot be told from here.
119 public PatternTokenizer setLimit(int limit) {
// Accessor for the current parse position (next() begins scanning at start).
// NOTE(review): the body is not visible in this excerpt — presumably returns the start field; confirm.
123 public int getStart() {
// Mutator for the current parse position.
// NOTE(review): the body is not visible in this excerpt — presumably assigns the start field and
// returns this; whether the value is range-checked cannot be told from here.
126 public PatternTokenizer setStart(int start) {
131 //#if defined(FOUNDATION10) || defined(J2SE13)
132 //## public PatternTokenizer setPattern(StringBuffer pattern) {
133 //## return setPattern(pattern.toString());
// Convenience overload: converts the CharSequence with toString() and delegates to setPattern(String).
// NOTE(review): a null CharSequence fails in toString() with NullPointerException before the
// String overload's null check (which throws IllegalArgumentException) can run.
136 public PatternTokenizer setPattern(CharSequence pattern) {
137 return setPattern(pattern.toString());
// Sets the pattern to be tokenized. A null pattern is rejected with IllegalArgumentException;
// otherwise limit is set to the pattern's length and the pattern is stored.
// NOTE(review): the exception text does not say that the pattern was null.
// NOTE(review): lines are elided in this excerpt — start is presumably reset here as well and
// `return this` is expected at the end; confirm against the full file.
141 public PatternTokenizer setPattern(String pattern) {
142 if (pattern == null) {
143 throw new IllegalArgumentException("Inconsistent arguments");
146 this.limit = pattern.length();
147 this.pattern = pattern;
151 public static final char SINGLE_QUOTE = '\'';
152 public static final char BACK_SLASH = '\\';
153 private static int NO_QUOTE = -1, IN_QUOTE = -2;
155 //#if defined(FOUNDATION10) || defined(J2SE13)
156 //## public String quoteLiteral(StringBuffer string) {
157 //## return quoteLiteral(string.toString());
// Convenience overload: converts the CharSequence with toString() and delegates to quoteLiteral(String).
// A null argument fails in toString() with NullPointerException.
160 public String quoteLiteral(CharSequence string) {
161 return quoteLiteral(string.toString());
166 * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
// Returns a quoted/escaped copy of `string`, built code point by code point
// (UTF16.charAt to read, UTF16.getCharCount to advance):
//  - code points in escapeCharacters are written through appendEscaped();
//  - code points in needingQuoteCharacters are quoted (backslash prefix, single quotes, or escape);
//  - every other code point is copied unchanged.
// quotedChar records whether an opening single quote is pending (IN_QUOTE) so that it can be
// closed before any unquoted or escaped output and at the end of the string.
// NOTE(review): several lines are elided in this excerpt (the declaration of cp, the conditions
// that choose between the backslash and single-quote styles, and the statements ending each
// branch); the comments below describe only what is visible.
170 public String quoteLiteral(String string) {
// Build the cached quoting set on first use; the setters reset it to null when a setting changes.
171 if (needingQuoteCharacters == null) {
172 needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
173 if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
174 if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
176 StringBuffer result = new StringBuffer();
177 int quotedChar = NO_QUOTE;
// The increment uses the width of the code point read in the loop body, so supplementary
// characters (two UTF-16 units) are consumed as one code point.
179 for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
180 cp = UTF16.charAt(string, i);
// Escaped characters: close a pending quote first, then write the escape.
181 if (escapeCharacters.contains(cp)) {
182 // we may have to fix up previous characters
183 if (quotedChar == IN_QUOTE) {
184 result.append(SINGLE_QUOTE);
185 quotedChar = NO_QUOTE;
187 appendEscaped(result, cp);
191 if (needingQuoteCharacters.contains(cp)) {
192 // if we have already started a quote
193 if (quotedChar == IN_QUOTE) {
194 UTF16.append(result, cp);
195 if (usingQuote && cp == SINGLE_QUOTE) { // double it
196 result.append(SINGLE_QUOTE);
200 // otherwise not already in quote
// Backslash style: prefix the character with a backslash.
202 result.append(BACK_SLASH);
203 UTF16.append(result, cp);
// Single-quote style: a literal quote is written as two quotes; any other character opens a quotation.
207 if (cp == SINGLE_QUOTE) { // double it and continue
208 result.append(SINGLE_QUOTE);
209 result.append(SINGLE_QUOTE);
212 result.append(SINGLE_QUOTE);
213 UTF16.append(result, cp);
214 quotedChar = IN_QUOTE;
217 // we have no choice but to use \\u or \\U
218 appendEscaped(result, cp);
221 // otherwise cp doesn't need quoting
222 // we may have to fix up previous characters
223 if (quotedChar == IN_QUOTE) {
224 result.append(SINGLE_QUOTE);
225 quotedChar = NO_QUOTE;
227 UTF16.append(result, cp);
230 // we may have to fix up previous characters
231 if (quotedChar == IN_QUOTE) {
232 result.append(SINGLE_QUOTE);
234 return result.toString();
// Appends cp to result as a hexadecimal escape: either backslash + 'u' followed by Utility.hex(cp,4),
// or backslash + 'U' followed by Utility.hex(cp,8).
// NOTE(review): the condition selecting between the two forms is not visible in this excerpt —
// presumably code points up to 0xFFFF take the 4-digit form; confirm.
237 private void appendEscaped(StringBuffer result, int cp) {
239 result.append("\\u").append(Utility.hex(cp,4));
241 result.append("\\U").append(Utility.hex(cp,8));
// Rebuilds the pattern text from its tokens: calls next() repeatedly, appends SYNTAX tokens as they
// are, passes every other token through quoteLiteral(), and returns the accumulated text when
// next() reports DONE.
// Because the test is `status != SYNTAX`, tokens reported as BROKEN_QUOTE or BROKEN_ESCAPE are
// re-quoted like literals rather than rejected.
// NOTE(review): the loop header, any per-iteration reset of `buffer`, and the statement before the
// return are elided in this excerpt; oldStart saves the entry position, presumably so that start
// can be restored before returning — confirm against the full file.
245 public String normalize() {
246 int oldStart = start;
247 StringBuffer result = new StringBuffer();
248 StringBuffer buffer = new StringBuffer();
251 int status = next(buffer);
252 if (status == DONE) {
254 return result.toString();
256 if (status != SYNTAX) {
257 result.append(quoteLiteral(buffer));
259 result.append(buffer);
264 public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
266 private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
268 public int next(StringBuffer buffer) {
269 if (start >= limit) return DONE;
270 int status = UNKNOWN;
271 int lastQuote = UNKNOWN;
272 int quoteStatus = NONE;
277 for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
278 cp = UTF16.charAt(pattern, i);
279 // if we are in a quote, then handle it.
280 switch (quoteStatus) {
295 UTF16.append(buffer, cp);
299 buffer.append(BACK_SLASH);
303 break; // fall through to NONE
308 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
309 hexValue -= '0'; break;
310 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
311 hexValue -= 'a' - 10; break;
312 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
313 hexValue -= 'A' - 10; break;
316 return BROKEN_ESCAPE;
321 UTF16.append(buffer, hexValue);
325 // see if we get another quote character
326 // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
327 if (cp == lastQuote) {
328 UTF16.append(buffer, cp);
329 quoteStatus = NORMAL_QUOTE;
333 break; // fall through to NONE
335 // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
336 if (cp == lastQuote) {
337 UTF16.append(buffer, cp);
338 quoteStatus = NONE; // get out of quote, with no trace remaining
341 // otherwise get into quote
342 UTF16.append(buffer, cp);
343 quoteStatus = NORMAL_QUOTE;
346 if (cp == lastQuote) {
347 quoteStatus = AFTER_QUOTE; // get out of quote
350 UTF16.append(buffer, cp);
354 if (ignorableCharacters.contains(cp)) {
357 // do syntax characters
358 if (syntaxCharacters.contains(cp)) {
359 if (status == UNKNOWN) {
360 UTF16.append(buffer, cp);
361 start = i + UTF16.getCharCount(cp);
363 } else { // LITERAL, so back up and break
368 // otherwise it is a literal; keep on going
370 if (cp == BACK_SLASH) {
371 quoteStatus = SLASH_START;
373 } else if (usingQuote && cp == SINGLE_QUOTE) {
375 quoteStatus = START_QUOTE;
379 UTF16.append(buffer, cp);
381 // handle final cleanup
383 switch (quoteStatus) {
385 status = BROKEN_ESCAPE;
389 status = BROKEN_ESCAPE;
391 buffer.append(BACK_SLASH);
394 case START_QUOTE: case NORMAL_QUOTE:
395 status = BROKEN_QUOTE;