]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/impl/PatternTokenizer.java
419e67b76e23535bb46a1b19b4097afabf70a561
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / impl / PatternTokenizer.java
//##header
/*
 *******************************************************************************
 * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
8 package com.ibm.icu.impl;\r
9 \r
10 import com.ibm.icu.text.UTF16;\r
11 import com.ibm.icu.text.UnicodeSet;\r
12 \r
13 /**\r
14  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.\r
15  * The '' (two quotes) is treated as a single quote, inside or outside a quote\r
16  * <ul>\r
17  * <li>Any ignorable characters are ignored in parsing.</li>\r
18  * <li>Any syntax characters are broken into separate tokens</li>\r
19  * <li>Quote characters can be specified: '...', "...", and \x </li>\r
20  * <li>Other characters are treated as literals</li>\r
21  * </ul>\r
22  */\r
23 public class PatternTokenizer {\r
24     // settings used in the interpretation of the pattern\r
25     private UnicodeSet ignorableCharacters = new UnicodeSet();\r
26     private UnicodeSet syntaxCharacters = new UnicodeSet();\r
27     private UnicodeSet extraQuotingCharacters = new UnicodeSet();\r
28     private UnicodeSet escapeCharacters = new UnicodeSet();\r
29     private boolean usingSlash = false;\r
30     private boolean usingQuote = false;\r
31     \r
32     // transient data, set when needed. Null it out for any changes in the above fields.\r
33     private transient UnicodeSet needingQuoteCharacters = null;\r
34     \r
35     // data about the current pattern being parsed. start gets moved as we go along.\r
36     private int start;\r
37     private int limit;\r
38     private String pattern;\r
39     \r
40     public UnicodeSet getIgnorableCharacters() {\r
41         return (UnicodeSet) ignorableCharacters.clone();\r
42     }\r
43     /**\r
44      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");\r
45      * @param ignorableCharacters\r
46      * @return\r
47      */\r
48     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {\r
49         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();\r
50         needingQuoteCharacters = null;\r
51         return this;\r
52     }\r
53     public UnicodeSet getSyntaxCharacters() {\r
54         return (UnicodeSet) syntaxCharacters.clone();\r
55     }\r
56     public UnicodeSet getExtraQuotingCharacters() {\r
57         return (UnicodeSet) extraQuotingCharacters.clone();\r
58     }\r
59     /**\r
60      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")\r
61      * @param syntaxCharacters\r
62      * @return\r
63      */\r
64     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {\r
65         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();\r
66         needingQuoteCharacters = null;\r
67         return this;\r
68     }   \r
69     /**\r
70      *  Sets the extra characters to be quoted in literals\r
71      * @param syntaxCharacters\r
72      * @return\r
73      */\r
74     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {\r
75         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();\r
76         needingQuoteCharacters = null;\r
77         return this;\r
78     }   \r
79     \r
80     public UnicodeSet getEscapeCharacters() {\r
81         return (UnicodeSet) escapeCharacters.clone();\r
82     }\r
83     /**\r
84      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");\r
85      * @param escapeCharacters\r
86      * @return\r
87      */\r
88     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {\r
89         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();\r
90         return this;\r
91     }\r
92     public boolean isUsingQuote() {\r
93         return usingQuote;\r
94     }\r
95     public PatternTokenizer setUsingQuote(boolean usingQuote) {\r
96         this.usingQuote = usingQuote;\r
97         needingQuoteCharacters = null;\r
98         return this;\r
99     }\r
100     public boolean isUsingSlash() {\r
101         return usingSlash;\r
102     }\r
103     public PatternTokenizer setUsingSlash(boolean usingSlash) {\r
104         this.usingSlash = usingSlash;\r
105         needingQuoteCharacters = null;\r
106         return this;\r
107     }\r
108     //    public UnicodeSet getQuoteCharacters() {\r
109 //  return (UnicodeSet) quoteCharacters.clone();\r
110 //  }\r
111 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {\r
112 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();\r
113 //  needingQuoteCharacters = null;\r
114 //  return this;\r
115 //  }\r
116     public int getLimit() {\r
117         return limit;\r
118     }\r
119     public PatternTokenizer setLimit(int limit) {\r
120         this.limit = limit;\r
121         return this;\r
122     }\r
123     public int getStart() {\r
124         return start;\r
125     }\r
126     public PatternTokenizer setStart(int start) {\r
127         this.start = start;\r
128         return this;\r
129     }\r
130 \r
131 //#if defined(FOUNDATION10) || defined(J2SE13)\r
132 //##    public PatternTokenizer setPattern(StringBuffer pattern) {\r
133 //##        return setPattern(pattern.toString());\r
134 //##    }\r
135 //#else \r
136     public PatternTokenizer setPattern(CharSequence pattern) {\r
137         return setPattern(pattern.toString());\r
138     }\r
139 //#endif\r
140 \r
141     public PatternTokenizer setPattern(String pattern) {\r
142         if (pattern == null) {\r
143             throw new IllegalArgumentException("Inconsistent arguments");\r
144         }\r
145         this.start = 0;\r
146         this.limit = pattern.length();\r
147         this.pattern = pattern;\r
148         return this;\r
149     }\r
150 \r
151     public static final char SINGLE_QUOTE = '\'';\r
152     public static final char BACK_SLASH = '\\';\r
153     private static int NO_QUOTE = -1, IN_QUOTE = -2;\r
154 \r
155 //#if defined(FOUNDATION10) || defined(J2SE13)\r
156 //##    public String quoteLiteral(StringBuffer string) {\r
157 //##        return quoteLiteral(string.toString());\r
158 //##    }\r
159 //#else\r
160     public String quoteLiteral(CharSequence string) {\r
161         return quoteLiteral(string.toString());\r
162     }\r
163 //#endif\r
164 \r
165     /**\r
166      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.\r
167      * @param string\r
168      * @return\r
169      */\r
170     public String quoteLiteral(String string) {\r
171         if (needingQuoteCharacters == null) {\r
172             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)\r
173             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);\r
174             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);\r
175         }\r
176         StringBuffer result = new StringBuffer();\r
177         int quotedChar = NO_QUOTE;\r
178         int cp;\r
179         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {\r
180             cp = UTF16.charAt(string, i);\r
181             if (escapeCharacters.contains(cp)) {\r
182                 // we may have to fix up previous characters\r
183                 if (quotedChar == IN_QUOTE) {\r
184                     result.append(SINGLE_QUOTE);\r
185                     quotedChar = NO_QUOTE;\r
186                 }\r
187                 appendEscaped(result, cp);\r
188                 continue;\r
189             }\r
190             \r
191             if (needingQuoteCharacters.contains(cp)) {\r
192                 // if we have already started a quote\r
193                 if (quotedChar == IN_QUOTE) {\r
194                     UTF16.append(result, cp);\r
195                     if (usingQuote && cp == SINGLE_QUOTE) { // double it\r
196                         result.append(SINGLE_QUOTE);\r
197                     }\r
198                     continue;\r
199                 }\r
200                 // otherwise not already in quote\r
201                 if (usingSlash) {\r
202                     result.append(BACK_SLASH);\r
203                     UTF16.append(result, cp);\r
204                     continue;\r
205                 }\r
206                 if (usingQuote) {\r
207                     if (cp == SINGLE_QUOTE) { // double it and continue\r
208                         result.append(SINGLE_QUOTE);\r
209                         result.append(SINGLE_QUOTE);\r
210                         continue;\r
211                     }\r
212                     result.append(SINGLE_QUOTE);\r
213                     UTF16.append(result, cp);\r
214                     quotedChar = IN_QUOTE;\r
215                     continue;\r
216                 }\r
217                 // we have no choice but to use \\u or \\U\r
218                 appendEscaped(result, cp);\r
219                 continue;\r
220             }\r
221             // otherwise cp doesn't need quoting\r
222             // we may have to fix up previous characters\r
223             if (quotedChar == IN_QUOTE) {\r
224                 result.append(SINGLE_QUOTE);\r
225                 quotedChar = NO_QUOTE;\r
226             }\r
227             UTF16.append(result, cp);\r
228         }\r
229         // all done. \r
230         // we may have to fix up previous characters\r
231         if (quotedChar == IN_QUOTE) {\r
232             result.append(SINGLE_QUOTE);\r
233         }\r
234         return result.toString();\r
235     }\r
236 \r
237     private void appendEscaped(StringBuffer result, int cp) {\r
238         if (cp <= 0xFFFF) {\r
239             result.append("\\u").append(Utility.hex(cp,4));\r
240         } else {\r
241             result.append("\\U").append(Utility.hex(cp,8));\r
242         }\r
243     }\r
244     \r
245     public String normalize() {\r
246         int oldStart = start;\r
247         StringBuffer result = new StringBuffer();\r
248         StringBuffer buffer = new StringBuffer();\r
249         while (true) {\r
250             buffer.setLength(0);\r
251             int status = next(buffer);\r
252             if (status == DONE) {\r
253                 start = oldStart;\r
254                 return result.toString();\r
255             }\r
256             if (status != SYNTAX) {\r
257                 result.append(quoteLiteral(buffer));\r
258             } else {\r
259                 result.append(buffer);\r
260             }\r
261         }\r
262     }\r
263     \r
264     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;\r
265     \r
266     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;\r
267     \r
268     public int next(StringBuffer buffer) {\r
269         if (start >= limit) return DONE;\r
270         int status = UNKNOWN;\r
271         int lastQuote = UNKNOWN;\r
272         int quoteStatus = NONE;\r
273         int hexCount = 0;\r
274         int hexValue = 0;\r
275         int cp;\r
276         main:\r
277             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {\r
278                 cp = UTF16.charAt(pattern, i);\r
279                 // if we are in a quote, then handle it.\r
280                 switch (quoteStatus) {\r
281                 case SLASH_START:\r
282                     switch (cp) {\r
283                     case 'u':\r
284                         quoteStatus = HEX;\r
285                         hexCount = 4;\r
286                         hexValue = 0;\r
287                         continue main;\r
288                     case 'U': \r
289                         quoteStatus = HEX;\r
290                         hexCount = 8;\r
291                         hexValue = 0;\r
292                         continue main;\r
293                     default:\r
294                         if (usingSlash) {\r
295                             UTF16.append(buffer, cp);\r
296                             quoteStatus = NONE;\r
297                             continue main;\r
298                         } else {\r
299                             buffer.append(BACK_SLASH);\r
300                             quoteStatus = NONE;\r
301                         }\r
302                     }\r
303                     break; // fall through to NONE\r
304                 case HEX:\r
305                     hexValue <<= 4;\r
306                     hexValue += cp;\r
307                     switch (cp) {\r
308                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':\r
309                         hexValue -= '0'; break;\r
310                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':\r
311                         hexValue -= 'a' - 10; break;\r
312                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':\r
313                         hexValue -= 'A' - 10; break;\r
314                     default:\r
315                         start = i;\r
316                     return BROKEN_ESCAPE;\r
317                     }\r
318                     --hexCount;\r
319                     if (hexCount == 0) {\r
320                         quoteStatus = NONE;\r
321                         UTF16.append(buffer, hexValue);\r
322                     }\r
323                     continue main;\r
324                 case AFTER_QUOTE:\r
325                     // see if we get another quote character\r
326                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote\r
327                     if (cp == lastQuote) {\r
328                         UTF16.append(buffer, cp);\r
329                         quoteStatus = NORMAL_QUOTE;\r
330                         continue main;\r
331                     }\r
332                     quoteStatus = NONE;\r
333                     break; // fall through to NONE\r
334                 case START_QUOTE:\r
335                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote\r
336                     if (cp == lastQuote) {\r
337                         UTF16.append(buffer, cp);\r
338                         quoteStatus = NONE; // get out of quote, with no trace remaining\r
339                         continue;                            \r
340                     }\r
341                     // otherwise get into quote\r
342                     UTF16.append(buffer, cp);\r
343                     quoteStatus = NORMAL_QUOTE;\r
344                     continue main;\r
345                 case NORMAL_QUOTE: \r
346                     if (cp == lastQuote) {\r
347                         quoteStatus = AFTER_QUOTE; // get out of quote\r
348                         continue main;\r
349                     }\r
350                     UTF16.append(buffer, cp);\r
351                     continue main;\r
352                 }\r
353                 \r
354                 if (ignorableCharacters.contains(cp)) {\r
355                     continue;\r
356                 }\r
357                 // do syntax characters\r
358                 if (syntaxCharacters.contains(cp)) {\r
359                     if (status == UNKNOWN) {\r
360                         UTF16.append(buffer, cp);\r
361                         start = i + UTF16.getCharCount(cp);\r
362                         return SYNTAX;\r
363                     } else { // LITERAL, so back up and break\r
364                         start = i;\r
365                         return status;\r
366                     }\r
367                 }\r
368                 // otherwise it is a literal; keep on going\r
369                 status = LITERAL;\r
370                 if (cp == BACK_SLASH) {\r
371                     quoteStatus = SLASH_START;\r
372                     continue;\r
373                 } else if (usingQuote && cp == SINGLE_QUOTE) {\r
374                     lastQuote = cp;\r
375                     quoteStatus = START_QUOTE;\r
376                     continue;\r
377                 }\r
378                 // normal literals\r
379                 UTF16.append(buffer, cp);\r
380             }\r
381         // handle final cleanup\r
382         start = limit;\r
383         switch (quoteStatus) {\r
384         case HEX:\r
385             status = BROKEN_ESCAPE;\r
386             break;\r
387         case SLASH_START:\r
388             if (usingSlash) {\r
389                 status = BROKEN_ESCAPE;\r
390             } else {\r
391                 buffer.append(BACK_SLASH);\r
392             }\r
393             break;\r
394         case START_QUOTE: case NORMAL_QUOTE:\r
395             status = BROKEN_QUOTE;\r
396             break;\r
397         }\r
398         return status;\r
399     }\r
400     \r
401     \r
402 }\r
//eof