]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/impl/PatternTokenizer.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / main / classes / core / src / com / ibm / icu / impl / PatternTokenizer.java
1 /*
2  *******************************************************************************
3  * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
4  * and others. All Rights Reserved.                                            *
5  *******************************************************************************
6  */
7 package com.ibm.icu.impl;
8
9 import com.ibm.icu.text.UTF16;
10 import com.ibm.icu.text.UnicodeSet;
11
12 /**
13  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
14  * The '' (two quotes) is treated as a single quote, inside or outside a quote
15  * <ul>
16  * <li>Any ignorable characters are ignored in parsing.</li>
17  * <li>Any syntax characters are broken into separate tokens</li>
18  * <li>Quote characters can be specified: '...', "...", and \x </li>
19  * <li>Other characters are treated as literals</li>
20  * </ul>
21  */
22 public class PatternTokenizer {
23     // settings used in the interpretation of the pattern
24     private UnicodeSet ignorableCharacters = new UnicodeSet();
25     private UnicodeSet syntaxCharacters = new UnicodeSet();
26     private UnicodeSet extraQuotingCharacters = new UnicodeSet();
27     private UnicodeSet escapeCharacters = new UnicodeSet();
28     private boolean usingSlash = false;
29     private boolean usingQuote = false;
30     
31     // transient data, set when needed. Null it out for any changes in the above fields.
32     private transient UnicodeSet needingQuoteCharacters = null;
33     
34     // data about the current pattern being parsed. start gets moved as we go along.
35     private int start;
36     private int limit;
37     private String pattern;
38     
39     public UnicodeSet getIgnorableCharacters() {
40         return (UnicodeSet) ignorableCharacters.clone();
41     }
42     /**
43      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
44      * @param ignorableCharacters Characters to be ignored.
45      * @return A PatternTokenizer object in which characters are specified as ignored characters.
46      */
47     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
48         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
49         needingQuoteCharacters = null;
50         return this;
51     }
52     public UnicodeSet getSyntaxCharacters() {
53         return (UnicodeSet) syntaxCharacters.clone();
54     }
55     public UnicodeSet getExtraQuotingCharacters() {
56         return (UnicodeSet) extraQuotingCharacters.clone();
57     }
58     /**
59      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
60      * @param syntaxCharacters Characters to be set as syntax characters.
61      * @return A PatternTokenizer object in which characters are specified as syntax characters.
62      */
63     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
64         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
65         needingQuoteCharacters = null;
66         return this;
67     }   
68     /**
69      *  Sets the extra characters to be quoted in literals
70      * @param syntaxCharacters Characters to be set as extra quoting characters.
71      * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
72      */
73     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
74         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
75         needingQuoteCharacters = null;
76         return this;
77     }   
78     
79     public UnicodeSet getEscapeCharacters() {
80         return (UnicodeSet) escapeCharacters.clone();
81     }
82     /**
83      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
84      * @param escapeCharacters Characters to be set as escape characters.
85      * @return A PatternTokenizer object in which characters are specified as escape characters.
86      */
87     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
88         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
89         return this;
90     }
91     public boolean isUsingQuote() {
92         return usingQuote;
93     }
94     public PatternTokenizer setUsingQuote(boolean usingQuote) {
95         this.usingQuote = usingQuote;
96         needingQuoteCharacters = null;
97         return this;
98     }
99     public boolean isUsingSlash() {
100         return usingSlash;
101     }
102     public PatternTokenizer setUsingSlash(boolean usingSlash) {
103         this.usingSlash = usingSlash;
104         needingQuoteCharacters = null;
105         return this;
106     }
107     //    public UnicodeSet getQuoteCharacters() {
108 //  return (UnicodeSet) quoteCharacters.clone();
109 //  }
110 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
111 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
112 //  needingQuoteCharacters = null;
113 //  return this;
114 //  }
115     public int getLimit() {
116         return limit;
117     }
118     public PatternTokenizer setLimit(int limit) {
119         this.limit = limit;
120         return this;
121     }
122     public int getStart() {
123         return start;
124     }
125     public PatternTokenizer setStart(int start) {
126         this.start = start;
127         return this;
128     }
129
130     public PatternTokenizer setPattern(CharSequence pattern) {
131         return setPattern(pattern.toString());
132     }
133
134     public PatternTokenizer setPattern(String pattern) {
135         if (pattern == null) {
136             throw new IllegalArgumentException("Inconsistent arguments");
137         }
138         this.start = 0;
139         this.limit = pattern.length();
140         this.pattern = pattern;
141         return this;
142     }
143
144     public static final char SINGLE_QUOTE = '\'';
145     public static final char BACK_SLASH = '\\';
146     private static int NO_QUOTE = -1, IN_QUOTE = -2;
147
148     public String quoteLiteral(CharSequence string) {
149         return quoteLiteral(string.toString());
150     }
151
152     /**
153      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
154      * @param string String passed to quote a literal string.
155      * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
156      */
157     public String quoteLiteral(String string) {
158         if (needingQuoteCharacters == null) {
159             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
160             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
161             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
162         }
163         StringBuffer result = new StringBuffer();
164         int quotedChar = NO_QUOTE;
165         int cp;
166         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
167             cp = UTF16.charAt(string, i);
168             if (escapeCharacters.contains(cp)) {
169                 // we may have to fix up previous characters
170                 if (quotedChar == IN_QUOTE) {
171                     result.append(SINGLE_QUOTE);
172                     quotedChar = NO_QUOTE;
173                 }
174                 appendEscaped(result, cp);
175                 continue;
176             }
177             
178             if (needingQuoteCharacters.contains(cp)) {
179                 // if we have already started a quote
180                 if (quotedChar == IN_QUOTE) {
181                     UTF16.append(result, cp);
182                     if (usingQuote && cp == SINGLE_QUOTE) { // double it
183                         result.append(SINGLE_QUOTE);
184                     }
185                     continue;
186                 }
187                 // otherwise not already in quote
188                 if (usingSlash) {
189                     result.append(BACK_SLASH);
190                     UTF16.append(result, cp);
191                     continue;
192                 }
193                 if (usingQuote) {
194                     if (cp == SINGLE_QUOTE) { // double it and continue
195                         result.append(SINGLE_QUOTE);
196                         result.append(SINGLE_QUOTE);
197                         continue;
198                     }
199                     result.append(SINGLE_QUOTE);
200                     UTF16.append(result, cp);
201                     quotedChar = IN_QUOTE;
202                     continue;
203                 }
204                 // we have no choice but to use \\u or \\U
205                 appendEscaped(result, cp);
206                 continue;
207             }
208             // otherwise cp doesn't need quoting
209             // we may have to fix up previous characters
210             if (quotedChar == IN_QUOTE) {
211                 result.append(SINGLE_QUOTE);
212                 quotedChar = NO_QUOTE;
213             }
214             UTF16.append(result, cp);
215         }
216         // all done. 
217         // we may have to fix up previous characters
218         if (quotedChar == IN_QUOTE) {
219             result.append(SINGLE_QUOTE);
220         }
221         return result.toString();
222     }
223
224     private void appendEscaped(StringBuffer result, int cp) {
225         if (cp <= 0xFFFF) {
226             result.append("\\u").append(Utility.hex(cp,4));
227         } else {
228             result.append("\\U").append(Utility.hex(cp,8));
229         }
230     }
231     
232     public String normalize() {
233         int oldStart = start;
234         StringBuffer result = new StringBuffer();
235         StringBuffer buffer = new StringBuffer();
236         while (true) {
237             buffer.setLength(0);
238             int status = next(buffer);
239             if (status == DONE) {
240                 start = oldStart;
241                 return result.toString();
242             }
243             if (status != SYNTAX) {
244                 result.append(quoteLiteral(buffer));
245             } else {
246                 result.append(buffer);
247             }
248         }
249     }
250     
251     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
252     
253     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
254     
255     public int next(StringBuffer buffer) {
256         if (start >= limit) return DONE;
257         int status = UNKNOWN;
258         int lastQuote = UNKNOWN;
259         int quoteStatus = NONE;
260         int hexCount = 0;
261         int hexValue = 0;
262         int cp;
263         main:
264             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
265                 cp = UTF16.charAt(pattern, i);
266                 // if we are in a quote, then handle it.
267                 switch (quoteStatus) {
268                 case SLASH_START:
269                     switch (cp) {
270                     case 'u':
271                         quoteStatus = HEX;
272                         hexCount = 4;
273                         hexValue = 0;
274                         continue main;
275                     case 'U': 
276                         quoteStatus = HEX;
277                         hexCount = 8;
278                         hexValue = 0;
279                         continue main;
280                     default:
281                         if (usingSlash) {
282                             UTF16.append(buffer, cp);
283                             quoteStatus = NONE;
284                             continue main;
285                         } else {
286                             buffer.append(BACK_SLASH);
287                             quoteStatus = NONE;
288                         }
289                     }
290                     break; // fall through to NONE
291                 case HEX:
292                     hexValue <<= 4;
293                     hexValue += cp;
294                     switch (cp) {
295                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
296                         hexValue -= '0'; break;
297                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
298                         hexValue -= 'a' - 10; break;
299                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
300                         hexValue -= 'A' - 10; break;
301                     default:
302                         start = i;
303                     return BROKEN_ESCAPE;
304                     }
305                     --hexCount;
306                     if (hexCount == 0) {
307                         quoteStatus = NONE;
308                         UTF16.append(buffer, hexValue);
309                     }
310                     continue main;
311                 case AFTER_QUOTE:
312                     // see if we get another quote character
313                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
314                     if (cp == lastQuote) {
315                         UTF16.append(buffer, cp);
316                         quoteStatus = NORMAL_QUOTE;
317                         continue main;
318                     }
319                     quoteStatus = NONE;
320                     break; // fall through to NONE
321                 case START_QUOTE:
322                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
323                     if (cp == lastQuote) {
324                         UTF16.append(buffer, cp);
325                         quoteStatus = NONE; // get out of quote, with no trace remaining
326                         continue;                            
327                     }
328                     // otherwise get into quote
329                     UTF16.append(buffer, cp);
330                     quoteStatus = NORMAL_QUOTE;
331                     continue main;
332                 case NORMAL_QUOTE: 
333                     if (cp == lastQuote) {
334                         quoteStatus = AFTER_QUOTE; // get out of quote
335                         continue main;
336                     }
337                     UTF16.append(buffer, cp);
338                     continue main;
339                 }
340                 
341                 if (ignorableCharacters.contains(cp)) {
342                     continue;
343                 }
344                 // do syntax characters
345                 if (syntaxCharacters.contains(cp)) {
346                     if (status == UNKNOWN) {
347                         UTF16.append(buffer, cp);
348                         start = i + UTF16.getCharCount(cp);
349                         return SYNTAX;
350                     } else { // LITERAL, so back up and break
351                         start = i;
352                         return status;
353                     }
354                 }
355                 // otherwise it is a literal; keep on going
356                 status = LITERAL;
357                 if (cp == BACK_SLASH) {
358                     quoteStatus = SLASH_START;
359                     continue;
360                 } else if (usingQuote && cp == SINGLE_QUOTE) {
361                     lastQuote = cp;
362                     quoteStatus = START_QUOTE;
363                     continue;
364                 }
365                 // normal literals
366                 UTF16.append(buffer, cp);
367             }
368         // handle final cleanup
369         start = limit;
370         switch (quoteStatus) {
371         case HEX:
372             status = BROKEN_ESCAPE;
373             break;
374         case SLASH_START:
375             if (usingSlash) {
376                 status = BROKEN_ESCAPE;
377             } else {
378                 buffer.append(BACK_SLASH);
379             }
380             break;
381         case START_QUOTE: case NORMAL_QUOTE:
382             status = BROKEN_QUOTE;
383             break;
384         }
385         return status;
386     }
387     
388     
389 }
390 //eof