2 *******************************************************************************
3 * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.impl;
9 import com.ibm.icu.text.UTF16;
10 import com.ibm.icu.text.UnicodeSet;
13 * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and simple syntax.
14 * The '' (two quotes) is treated as a single quote, inside or outside a quote
16 * <li>Any ignorable characters are ignored in parsing.</li>
17 * <li>Any syntax characters are broken into separate tokens</li>
18 * <li>Quote characters can be specified: '...', "...", and \x </li>
19 * <li>Other characters are treated as literals</li>
22 public class PatternTokenizer {
23 // settings used in the interpretation of the pattern
// Characters that next() tests for so they can be ignored while tokenizing; quoteLiteral() also quotes them.
24 private UnicodeSet ignorableCharacters = new UnicodeSet();
// Characters that next() emits as separate SYNTAX tokens; quoteLiteral() also quotes them.
25 private UnicodeSet syntaxCharacters = new UnicodeSet();
// Additional characters that quoteLiteral() adds to the set of characters needing quoting.
26 private UnicodeSet extraQuotingCharacters = new UnicodeSet();
// Characters that quoteLiteral() hands to appendEscaped() instead of quoting.
27 private UnicodeSet escapeCharacters = new UnicodeSet();
// When true, quoteLiteral() treats the backslash itself as a character needing quoting.
28 private boolean usingSlash = false;
// When true, next() starts a quote on a single quote and quoteLiteral() treats the single quote as needing quoting.
29 private boolean usingQuote = false;
31 // transient data, set when needed. Null it out for any changes in the above fields.
// Built lazily by quoteLiteral(String) from the syntax, ignorable and extra-quoting sets plus the slash/quote flags.
32 private transient UnicodeSet needingQuoteCharacters = null;
34 // data about the current pattern being parsed. start gets moved as we go along.
// NOTE(review): the start and limit declarations that belong here are not visible in this excerpt.
// The text being tokenized; next() reads code points from it.
37 private String pattern;
// Returns a clone of the ignorable-character set rather than the internal instance.
39 public UnicodeSet getIgnorableCharacters() {
40 return (UnicodeSet) ignorableCharacters.clone();
43 * Sets the characters to be ignored in parsing, e.g. new UnicodeSet("[:pattern_whitespace:]");
44 * @param ignorableCharacters Characters to be ignored.
45 * @return A PatternTokenizer object in which characters are specified as ignored characters.
47 public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
// Store a clone of the argument, not the caller's instance.
48 this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
// Invalidate the cached quoting set; quoteLiteral(String) rebuilds it when it is null.
49 needingQuoteCharacters = null;
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
// Returns a clone of the syntax-character set rather than the internal instance.
52 public UnicodeSet getSyntaxCharacters() {
53 return (UnicodeSet) syntaxCharacters.clone();
// Returns a clone of the extra-quoting-character set rather than the internal instance.
55 public UnicodeSet getExtraQuotingCharacters() {
56 return (UnicodeSet) extraQuotingCharacters.clone();
59 * Sets the characters to be interpreted as syntax characters in parsing, e.g. new UnicodeSet("[:pattern_syntax:]")
60 * @param syntaxCharacters Characters to be set as syntax characters.
61 * @return A PatternTokenizer object in which characters are specified as syntax characters.
63 public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
// Store a clone of the argument, not the caller's instance.
64 this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
// Invalidate the cached quoting set; quoteLiteral(String) rebuilds it when it is null.
65 needingQuoteCharacters = null;
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
69 * Sets the extra characters to be quoted in literals
70 * @param syntaxCharacters Characters to be set as extra quoting characters.
71 * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
// Note: the parameter is named syntaxCharacters, but its clone is stored in extraQuotingCharacters.
73 public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
74 this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
// Invalidate the cached quoting set; quoteLiteral(String) rebuilds it when it is null.
75 needingQuoteCharacters = null;
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
// Returns a clone of the escape-character set rather than the internal instance.
79 public UnicodeSet getEscapeCharacters() {
80 return (UnicodeSet) escapeCharacters.clone();
83 * Set characters to be escaped in literals, in quoteLiteral and normalize, e.g. new UnicodeSet("[^\\u0020-\\u007E]");
84 * @param escapeCharacters Characters to be set as escape characters.
85 * @return A PatternTokenizer object in which characters are specified as escape characters.
87 public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
// Store a clone of the argument. No reset of needingQuoteCharacters is visible here;
// quoteLiteral(String) builds that cache without consulting escapeCharacters.
88 this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
// NOTE(review): the body is not visible in this excerpt; presumably returns the usingQuote flag - confirm.
91 public boolean isUsingQuote() {
94 public PatternTokenizer setUsingQuote(boolean usingQuote) {
95 this.usingQuote = usingQuote;
// The cached quoting set contains SINGLE_QUOTE only when usingQuote is set, so it must be rebuilt.
96 needingQuoteCharacters = null;
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
// NOTE(review): the body is not visible in this excerpt; presumably returns the usingSlash flag - confirm.
99 public boolean isUsingSlash() {
102 public PatternTokenizer setUsingSlash(boolean usingSlash) {
103 this.usingSlash = usingSlash;
// The cached quoting set contains BACK_SLASH only when usingSlash is set, so it must be rebuilt.
104 needingQuoteCharacters = null;
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
107 // public UnicodeSet getQuoteCharacters() {
108 // return (UnicodeSet) quoteCharacters.clone();
110 // public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
111 // this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
112 // needingQuoteCharacters = null;
// NOTE(review): the body is not visible in this excerpt; presumably returns the limit at which next() reports DONE - confirm.
115 public int getLimit() {
// NOTE(review): the body is not visible in this excerpt; presumably stores limit and returns this - confirm.
118 public PatternTokenizer setLimit(int limit) {
// NOTE(review): the body is not visible in this excerpt; presumably returns the index from which next() resumes - confirm.
122 public int getStart() {
// NOTE(review): the body is not visible in this excerpt; presumably stores start and returns this - confirm.
125 public PatternTokenizer setStart(int start) {
130 public PatternTokenizer setPattern(CharSequence pattern) {
131 return setPattern(pattern.toString());
// Installs a new pattern string to tokenize.
134 public PatternTokenizer setPattern(String pattern) {
// A null pattern is rejected with IllegalArgumentException.
135 if (pattern == null) {
136 throw new IllegalArgumentException("Inconsistent arguments");
// NOTE(review): a statement between the null check and the limit assignment is not visible in this
// excerpt; presumably it resets start to the beginning of the new pattern - confirm.
// The limit becomes the full length of the new pattern.
139 this.limit = pattern.length();
140 this.pattern = pattern;
// NOTE(review): the return statement is not visible in this excerpt; presumably returns this for chaining - confirm.
144 public static final char SINGLE_QUOTE = '\'';
145 public static final char BACK_SLASH = '\\';
146 private static int NO_QUOTE = -1, IN_QUOTE = -2;
// Convenience overload: converts the sequence to a String and delegates to quoteLiteral(String).
148 public String quoteLiteral(CharSequence string) {
149 return quoteLiteral(string.toString());
153 * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
154 * @param string String passed to quote a literal string.
155 * @return The string with its syntax, quote, and ignorable characters quoted or escaped according to the current settings.
157 public String quoteLiteral(String string) {
// Lazily build the set of characters that must be quoted; the setters null it out when inputs change.
158 if (needingQuoteCharacters == null) {
159 needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
// The backslash and the single quote only need quoting when the corresponding mechanism is enabled.
160 if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
161 if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
163 StringBuffer result = new StringBuffer();
// quotedChar records whether a quoted run is currently open in result (IN_QUOTE) or not (NO_QUOTE).
164 int quotedChar = NO_QUOTE;
// Iterate by code point: the step is the number of UTF-16 units of the code point just read.
// NOTE(review): the declaration of cp is not visible in this excerpt.
166 for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
167 cp = UTF16.charAt(string, i);
// Case 1: the character must be escaped. Close any open quoted run first, then emit the hex escape.
168 if (escapeCharacters.contains(cp)) {
169 // we may have to fix up previous characters
170 if (quotedChar == IN_QUOTE) {
171 result.append(SINGLE_QUOTE);
172 quotedChar = NO_QUOTE;
174 appendEscaped(result, cp);
// NOTE(review): the statements that end this branch are not visible in this excerpt.
// Case 2: the character needs quoting.
178 if (needingQuoteCharacters.contains(cp)) {
179 // if we have already started a quote
// Inside an open quoted run the character is appended as-is; a single quote is written twice.
180 if (quotedChar == IN_QUOTE) {
181 UTF16.append(result, cp);
182 if (usingQuote && cp == SINGLE_QUOTE) { // double it
183 result.append(SINGLE_QUOTE);
187 // otherwise not already in quote
// NOTE(review): the guards around the next three alternatives are not visible in this excerpt;
// presumably backslash-prefixing applies when usingSlash is set and quoting when usingQuote is set - confirm.
// Alternative: prefix the character with a backslash.
189 result.append(BACK_SLASH);
190 UTF16.append(result, cp);
// Alternative: a lone single quote is written as two single quotes without opening a quoted run.
194 if (cp == SINGLE_QUOTE) { // double it and continue
195 result.append(SINGLE_QUOTE);
196 result.append(SINGLE_QUOTE);
// Alternative: open a quoted run with this character as its first member.
199 result.append(SINGLE_QUOTE);
200 UTF16.append(result, cp);
201 quotedChar = IN_QUOTE;
204 // we have no choice but to use \\u or \\U
205 appendEscaped(result, cp);
// Case 3: an ordinary character. Close any open quoted run, then append it unchanged.
208 // otherwise cp doesn't need quoting
209 // we may have to fix up previous characters
210 if (quotedChar == IN_QUOTE) {
211 result.append(SINGLE_QUOTE);
212 quotedChar = NO_QUOTE;
214 UTF16.append(result, cp);
// After the loop, close a quoted run that is still open at the end of the input.
217 // we may have to fix up previous characters
218 if (quotedChar == IN_QUOTE) {
219 result.append(SINGLE_QUOTE);
221 return result.toString();
// Appends cp to result as a hex escape: either \\u followed by 4 hex digits or \\U followed by 8.
224 private void appendEscaped(StringBuffer result, int cp) {
// NOTE(review): the condition selecting between the two forms is not visible in this excerpt;
// presumably the 4-digit form is used for code points that fit in 16 bits - confirm.
226 result.append("\\u").append(Utility.hex(cp,4));
228 result.append("\\U").append(Utility.hex(cp,8));
// Re-tokenizes the current pattern with next() and rebuilds it, re-quoting every non-syntax token.
232 public String normalize() {
// Remember the current start position.
// NOTE(review): the code that uses oldStart is not visible in this excerpt; presumably start is restored before returning - confirm.
233 int oldStart = start;
234 StringBuffer result = new StringBuffer();
235 StringBuffer buffer = new StringBuffer();
// NOTE(review): the loop header (and any per-iteration reset of buffer) is not visible in this excerpt.
238 int status = next(buffer);
// DONE means the pattern is exhausted, so the accumulated text is returned.
239 if (status == DONE) {
241 return result.toString();
// Anything other than a SYNTAX token is passed through quoteLiteral() before being appended.
243 if (status != SYNTAX) {
244 result.append(quoteLiteral(buffer));
// SYNTAX tokens are appended unchanged.
246 result.append(buffer);
// Status codes produced by next(): DONE when start has reached limit, SYNTAX for a syntax-character token,
// BROKEN_QUOTE / BROKEN_ESCAPE when the input ends inside a quote or escape (or an escape contains a bad digit).
// NOTE(review): the places where LITERAL is assigned are not visible in this excerpt.
251 public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
// Internal values of the quoteStatus local in next(), tracking quote and backslash-escape parsing state.
253 private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
255 public int next(StringBuffer buffer) {
256 if (start >= limit) return DONE;
257 int status = UNKNOWN;
258 int lastQuote = UNKNOWN;
259 int quoteStatus = NONE;
264 for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
265 cp = UTF16.charAt(pattern, i);
266 // if we are in a quote, then handle it.
267 switch (quoteStatus) {
282 UTF16.append(buffer, cp);
286 buffer.append(BACK_SLASH);
290 break; // fall through to NONE
295 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
296 hexValue -= '0'; break;
297 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
298 hexValue -= 'a' - 10; break;
299 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
300 hexValue -= 'A' - 10; break;
303 return BROKEN_ESCAPE;
308 UTF16.append(buffer, hexValue);
312 // see if we get another quote character
313 // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
314 if (cp == lastQuote) {
315 UTF16.append(buffer, cp);
316 quoteStatus = NORMAL_QUOTE;
320 break; // fall through to NONE
322 // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
323 if (cp == lastQuote) {
324 UTF16.append(buffer, cp);
325 quoteStatus = NONE; // get out of quote, with no trace remaining
328 // otherwise get into quote
329 UTF16.append(buffer, cp);
330 quoteStatus = NORMAL_QUOTE;
333 if (cp == lastQuote) {
334 quoteStatus = AFTER_QUOTE; // get out of quote
337 UTF16.append(buffer, cp);
341 if (ignorableCharacters.contains(cp)) {
344 // do syntax characters
345 if (syntaxCharacters.contains(cp)) {
346 if (status == UNKNOWN) {
347 UTF16.append(buffer, cp);
348 start = i + UTF16.getCharCount(cp);
350 } else { // LITERAL, so back up and break
355 // otherwise it is a literal; keep on going
357 if (cp == BACK_SLASH) {
358 quoteStatus = SLASH_START;
360 } else if (usingQuote && cp == SINGLE_QUOTE) {
362 quoteStatus = START_QUOTE;
366 UTF16.append(buffer, cp);
368 // handle final cleanup
370 switch (quoteStatus) {
372 status = BROKEN_ESCAPE;
376 status = BROKEN_ESCAPE;
378 buffer.append(BACK_SLASH);
381 case START_QUOTE: case NORMAL_QUOTE:
382 status = BROKEN_QUOTE;