]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/impl/PatternTokenizer.java
go
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / impl / PatternTokenizer.java
1 //##header J2SE15
2 /*
3  *******************************************************************************
4  * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
5  * and others. All Rights Reserved.                                            *
6  *******************************************************************************
7  */
8 package com.ibm.icu.impl;
9
10 import com.ibm.icu.text.UTF16;
11 import com.ibm.icu.text.UnicodeSet;
12
13 /**
14  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
15  * The '' (two quotes) is treated as a single quote, inside or outside a quote
16  * <ul>
17  * <li>Any ignorable characters are ignored in parsing.</li>
18  * <li>Any syntax characters are broken into separate tokens</li>
19  * <li>Quote characters can be specified: '...', "...", and \x </li>
20  * <li>Other characters are treated as literals</li>
21  * </ul>
22  */
23 public class PatternTokenizer {
24     // settings used in the interpretation of the pattern
25     private UnicodeSet ignorableCharacters = new UnicodeSet();
26     private UnicodeSet syntaxCharacters = new UnicodeSet();
27     private UnicodeSet extraQuotingCharacters = new UnicodeSet();
28     private UnicodeSet escapeCharacters = new UnicodeSet();
29     private boolean usingSlash = false;
30     private boolean usingQuote = false;
31     
32     // transient data, set when needed. Null it out for any changes in the above fields.
33     private transient UnicodeSet needingQuoteCharacters = null;
34     
35     // data about the current pattern being parsed. start gets moved as we go along.
36     private int start;
37     private int limit;
38     private String pattern;
39     
40     public UnicodeSet getIgnorableCharacters() {
41         return (UnicodeSet) ignorableCharacters.clone();
42     }
43     /**
44      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
45      * @param ignorableCharacters
46      * @return
47      */
48     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
49         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
50         needingQuoteCharacters = null;
51         return this;
52     }
53     public UnicodeSet getSyntaxCharacters() {
54         return (UnicodeSet) syntaxCharacters.clone();
55     }
56     public UnicodeSet getExtraQuotingCharacters() {
57         return (UnicodeSet) extraQuotingCharacters.clone();
58     }
59     /**
60      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
61      * @param syntaxCharacters
62      * @return
63      */
64     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
65         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
66         needingQuoteCharacters = null;
67         return this;
68     }   
69     /**
70      *  Sets the extra characters to be quoted in literals
71      * @param syntaxCharacters
72      * @return
73      */
74     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
75         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
76         needingQuoteCharacters = null;
77         return this;
78     }   
79     
80     public UnicodeSet getEscapeCharacters() {
81         return (UnicodeSet) escapeCharacters.clone();
82     }
83     /**
84      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
85      * @param escapeCharacters
86      * @return
87      */
88     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
89         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
90         return this;
91     }
92     public boolean isUsingQuote() {
93         return usingQuote;
94     }
95     public PatternTokenizer setUsingQuote(boolean usingQuote) {
96         this.usingQuote = usingQuote;
97         needingQuoteCharacters = null;
98         return this;
99     }
100     public boolean isUsingSlash() {
101         return usingSlash;
102     }
103     public PatternTokenizer setUsingSlash(boolean usingSlash) {
104         this.usingSlash = usingSlash;
105         needingQuoteCharacters = null;
106         return this;
107     }
108     //    public UnicodeSet getQuoteCharacters() {
109 //  return (UnicodeSet) quoteCharacters.clone();
110 //  }
111 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
112 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
113 //  needingQuoteCharacters = null;
114 //  return this;
115 //  }
116     public int getLimit() {
117         return limit;
118     }
119     public PatternTokenizer setLimit(int limit) {
120         this.limit = limit;
121         return this;
122     }
123     public int getStart() {
124         return start;
125     }
126     public PatternTokenizer setStart(int start) {
127         this.start = start;
128         return this;
129     }
130
131 //#if defined(FOUNDATION10) || defined(J2SE13)
132 //##    public PatternTokenizer setPattern(StringBuffer pattern) {
133 //##        return setPattern(pattern.toString());
134 //##    }
135 //#else 
136     public PatternTokenizer setPattern(CharSequence pattern) {
137         return setPattern(pattern.toString());
138     }
139 //#endif
140
141     public PatternTokenizer setPattern(String pattern) {
142         if (pattern == null) {
143             throw new IllegalArgumentException("Inconsistent arguments");
144         }
145         this.start = 0;
146         this.limit = pattern.length();
147         this.pattern = pattern;
148         return this;
149     }
150
151     public static final char SINGLE_QUOTE = '\'';
152     public static final char BACK_SLASH = '\\';
153     private static int NO_QUOTE = -1, IN_QUOTE = -2;
154
155 //#if defined(FOUNDATION10) || defined(J2SE13)
156 //##    public String quoteLiteral(StringBuffer string) {
157 //##        return quoteLiteral(string.toString());
158 //##    }
159 //#else
160     public String quoteLiteral(CharSequence string) {
161         return quoteLiteral(string.toString());
162     }
163 //#endif
164
165     /**
166      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
167      * @param string
168      * @return
169      */
170     public String quoteLiteral(String string) {
171         if (needingQuoteCharacters == null) {
172             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
173             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
174             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
175         }
176         StringBuffer result = new StringBuffer();
177         int quotedChar = NO_QUOTE;
178         int cp;
179         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
180             cp = UTF16.charAt(string, i);
181             if (escapeCharacters.contains(cp)) {
182                 // we may have to fix up previous characters
183                 if (quotedChar == IN_QUOTE) {
184                     result.append(SINGLE_QUOTE);
185                     quotedChar = NO_QUOTE;
186                 }
187                 appendEscaped(result, cp);
188                 continue;
189             }
190             
191             if (needingQuoteCharacters.contains(cp)) {
192                 // if we have already started a quote
193                 if (quotedChar == IN_QUOTE) {
194                     UTF16.append(result, cp);
195                     if (usingQuote && cp == SINGLE_QUOTE) { // double it
196                         result.append(SINGLE_QUOTE);
197                     }
198                     continue;
199                 }
200                 // otherwise not already in quote
201                 if (usingSlash) {
202                     result.append(BACK_SLASH);
203                     UTF16.append(result, cp);
204                     continue;
205                 }
206                 if (usingQuote) {
207                     if (cp == SINGLE_QUOTE) { // double it and continue
208                         result.append(SINGLE_QUOTE);
209                         result.append(SINGLE_QUOTE);
210                         continue;
211                     }
212                     result.append(SINGLE_QUOTE);
213                     UTF16.append(result, cp);
214                     quotedChar = IN_QUOTE;
215                     continue;
216                 }
217                 // we have no choice but to use \\u or \\U
218                 appendEscaped(result, cp);
219                 continue;
220             }
221             // otherwise cp doesn't need quoting
222             // we may have to fix up previous characters
223             if (quotedChar == IN_QUOTE) {
224                 result.append(SINGLE_QUOTE);
225                 quotedChar = NO_QUOTE;
226             }
227             UTF16.append(result, cp);
228         }
229         // all done. 
230         // we may have to fix up previous characters
231         if (quotedChar == IN_QUOTE) {
232             result.append(SINGLE_QUOTE);
233         }
234         return result.toString();
235     }
236
237     private void appendEscaped(StringBuffer result, int cp) {
238         if (cp <= 0xFFFF) {
239             result.append("\\u").append(Utility.hex(cp,4));
240         } else {
241             result.append("\\U").append(Utility.hex(cp,8));
242         }
243     }
244     
245     public String normalize() {
246         int oldStart = start;
247         StringBuffer result = new StringBuffer();
248         StringBuffer buffer = new StringBuffer();
249         while (true) {
250             buffer.setLength(0);
251             int status = next(buffer);
252             if (status == DONE) {
253                 start = oldStart;
254                 return result.toString();
255             }
256             if (status != SYNTAX) {
257                 result.append(quoteLiteral(buffer));
258             } else {
259                 result.append(buffer);
260             }
261         }
262     }
263     
264     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
265     
266     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
267     
268     public int next(StringBuffer buffer) {
269         if (start >= limit) return DONE;
270         int status = UNKNOWN;
271         int lastQuote = UNKNOWN;
272         int quoteStatus = NONE;
273         int hexCount = 0;
274         int hexValue = 0;
275         int cp;
276         main:
277             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
278                 cp = UTF16.charAt(pattern, i);
279                 // if we are in a quote, then handle it.
280                 switch (quoteStatus) {
281                 case SLASH_START:
282                     switch (cp) {
283                     case 'u':
284                         quoteStatus = HEX;
285                         hexCount = 4;
286                         hexValue = 0;
287                         continue main;
288                     case 'U': 
289                         quoteStatus = HEX;
290                         hexCount = 8;
291                         hexValue = 0;
292                         continue main;
293                     default:
294                         if (usingSlash) {
295                             UTF16.append(buffer, cp);
296                             quoteStatus = NONE;
297                             continue main;
298                         } else {
299                             buffer.append(BACK_SLASH);
300                             quoteStatus = NONE;
301                         }
302                     }
303                     break; // fall through to NONE
304                 case HEX:
305                     hexValue <<= 4;
306                     hexValue += cp;
307                     switch (cp) {
308                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
309                         hexValue -= '0'; break;
310                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
311                         hexValue -= 'a' - 10; break;
312                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
313                         hexValue -= 'A' - 10; break;
314                     default:
315                         start = i;
316                     return BROKEN_ESCAPE;
317                     }
318                     --hexCount;
319                     if (hexCount == 0) {
320                         quoteStatus = NONE;
321                         UTF16.append(buffer, hexValue);
322                     }
323                     continue main;
324                 case AFTER_QUOTE:
325                     // see if we get another quote character
326                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
327                     if (cp == lastQuote) {
328                         UTF16.append(buffer, cp);
329                         quoteStatus = NORMAL_QUOTE;
330                         continue main;
331                     }
332                     quoteStatus = NONE;
333                     break; // fall through to NONE
334                 case START_QUOTE:
335                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
336                     if (cp == lastQuote) {
337                         UTF16.append(buffer, cp);
338                         quoteStatus = NONE; // get out of quote, with no trace remaining
339                         continue;                            
340                     }
341                     // otherwise get into quote
342                     UTF16.append(buffer, cp);
343                     quoteStatus = NORMAL_QUOTE;
344                     continue main;
345                 case NORMAL_QUOTE: 
346                     if (cp == lastQuote) {
347                         quoteStatus = AFTER_QUOTE; // get out of quote
348                         continue main;
349                     }
350                     UTF16.append(buffer, cp);
351                     continue main;
352                 }
353                 
354                 if (ignorableCharacters.contains(cp)) {
355                     continue;
356                 }
357                 // do syntax characters
358                 if (syntaxCharacters.contains(cp)) {
359                     if (status == UNKNOWN) {
360                         UTF16.append(buffer, cp);
361                         start = i + UTF16.getCharCount(cp);
362                         return SYNTAX;
363                     } else { // LITERAL, so back up and break
364                         start = i;
365                         return status;
366                     }
367                 }
368                 // otherwise it is a literal; keep on going
369                 status = LITERAL;
370                 if (cp == BACK_SLASH) {
371                     quoteStatus = SLASH_START;
372                     continue;
373                 } else if (usingQuote && cp == SINGLE_QUOTE) {
374                     lastQuote = cp;
375                     quoteStatus = START_QUOTE;
376                     continue;
377                 }
378                 // normal literals
379                 UTF16.append(buffer, cp);
380             }
381         // handle final cleanup
382         start = limit;
383         switch (quoteStatus) {
384         case HEX:
385             status = BROKEN_ESCAPE;
386             break;
387         case SLASH_START:
388             if (usingSlash) {
389                 status = BROKEN_ESCAPE;
390             } else {
391                 buffer.append(BACK_SLASH);
392             }
393             break;
394         case START_QUOTE: case NORMAL_QUOTE:
395             status = BROKEN_QUOTE;
396             break;
397         }
398         return status;
399     }
400     
401     
402 }
403 //eof