]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/impl/PatternTokenizer.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / core / src / com / ibm / icu / impl / PatternTokenizer.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 2006-2009, Google, International Business Machines Corporation *\r
4  * and others. All Rights Reserved.                                            *\r
5  *******************************************************************************\r
6  */\r
7 package com.ibm.icu.impl;\r
8 \r
9 import com.ibm.icu.text.UTF16;\r
10 import com.ibm.icu.text.UnicodeSet;\r
11 \r
12 /**\r
13  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.\r
14  * The '' (two quotes) is treated as a single quote, inside or outside a quote\r
15  * <ul>\r
16  * <li>Any ignorable characters are ignored in parsing.</li>\r
17  * <li>Any syntax characters are broken into separate tokens</li>\r
18  * <li>Quote characters can be specified: '...', "...", and \x </li>\r
19  * <li>Other characters are treated as literals</li>\r
20  * </ul>\r
21  */\r
22 public class PatternTokenizer {\r
23     // settings used in the interpretation of the pattern\r
24     private UnicodeSet ignorableCharacters = new UnicodeSet();\r
25     private UnicodeSet syntaxCharacters = new UnicodeSet();\r
26     private UnicodeSet extraQuotingCharacters = new UnicodeSet();\r
27     private UnicodeSet escapeCharacters = new UnicodeSet();\r
28     private boolean usingSlash = false;\r
29     private boolean usingQuote = false;\r
30     \r
31     // transient data, set when needed. Null it out for any changes in the above fields.\r
32     private transient UnicodeSet needingQuoteCharacters = null;\r
33     \r
34     // data about the current pattern being parsed. start gets moved as we go along.\r
35     private int start;\r
36     private int limit;\r
37     private String pattern;\r
38     \r
39     public UnicodeSet getIgnorableCharacters() {\r
40         return (UnicodeSet) ignorableCharacters.clone();\r
41     }\r
42     /**\r
43      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");\r
44      * @param ignorableCharacters Characters to be ignored.\r
45      * @return A PatternTokenizer object in which characters are specified as ignored characters.\r
46      */\r
47     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {\r
48         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();\r
49         needingQuoteCharacters = null;\r
50         return this;\r
51     }\r
52     public UnicodeSet getSyntaxCharacters() {\r
53         return (UnicodeSet) syntaxCharacters.clone();\r
54     }\r
55     public UnicodeSet getExtraQuotingCharacters() {\r
56         return (UnicodeSet) extraQuotingCharacters.clone();\r
57     }\r
58     /**\r
59      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")\r
60      * @param syntaxCharacters Characters to be set as syntax characters.\r
61      * @return A PatternTokenizer object in which characters are specified as syntax characters.\r
62      */\r
63     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {\r
64         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();\r
65         needingQuoteCharacters = null;\r
66         return this;\r
67     }   \r
68     /**\r
69      *  Sets the extra characters to be quoted in literals\r
70      * @param syntaxCharacters Characters to be set as extra quoting characters.\r
71      * @return A PatternTokenizer object in which characters are specified as extra quoting characters.\r
72      */\r
73     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {\r
74         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();\r
75         needingQuoteCharacters = null;\r
76         return this;\r
77     }   \r
78     \r
79     public UnicodeSet getEscapeCharacters() {\r
80         return (UnicodeSet) escapeCharacters.clone();\r
81     }\r
82     /**\r
83      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");\r
84      * @param escapeCharacters Characters to be set as escape characters.\r
85      * @return A PatternTokenizer object in which characters are specified as escape characters.\r
86      */\r
87     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {\r
88         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();\r
89         return this;\r
90     }\r
91     public boolean isUsingQuote() {\r
92         return usingQuote;\r
93     }\r
94     public PatternTokenizer setUsingQuote(boolean usingQuote) {\r
95         this.usingQuote = usingQuote;\r
96         needingQuoteCharacters = null;\r
97         return this;\r
98     }\r
99     public boolean isUsingSlash() {\r
100         return usingSlash;\r
101     }\r
102     public PatternTokenizer setUsingSlash(boolean usingSlash) {\r
103         this.usingSlash = usingSlash;\r
104         needingQuoteCharacters = null;\r
105         return this;\r
106     }\r
107     //    public UnicodeSet getQuoteCharacters() {\r
108 //  return (UnicodeSet) quoteCharacters.clone();\r
109 //  }\r
110 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {\r
111 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();\r
112 //  needingQuoteCharacters = null;\r
113 //  return this;\r
114 //  }\r
115     public int getLimit() {\r
116         return limit;\r
117     }\r
118     public PatternTokenizer setLimit(int limit) {\r
119         this.limit = limit;\r
120         return this;\r
121     }\r
122     public int getStart() {\r
123         return start;\r
124     }\r
125     public PatternTokenizer setStart(int start) {\r
126         this.start = start;\r
127         return this;\r
128     }\r
129 \r
130     public PatternTokenizer setPattern(CharSequence pattern) {\r
131         return setPattern(pattern.toString());\r
132     }\r
133 \r
134     public PatternTokenizer setPattern(String pattern) {\r
135         if (pattern == null) {\r
136             throw new IllegalArgumentException("Inconsistent arguments");\r
137         }\r
138         this.start = 0;\r
139         this.limit = pattern.length();\r
140         this.pattern = pattern;\r
141         return this;\r
142     }\r
143 \r
144     public static final char SINGLE_QUOTE = '\'';\r
145     public static final char BACK_SLASH = '\\';\r
146     private static int NO_QUOTE = -1, IN_QUOTE = -2;\r
147 \r
148     public String quoteLiteral(CharSequence string) {\r
149         return quoteLiteral(string.toString());\r
150     }\r
151 \r
152     /**\r
153      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.\r
154      * @param string String passed to quote a literal string.\r
155      * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.\r
156      */\r
157     public String quoteLiteral(String string) {\r
158         if (needingQuoteCharacters == null) {\r
159             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)\r
160             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);\r
161             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);\r
162         }\r
163         StringBuffer result = new StringBuffer();\r
164         int quotedChar = NO_QUOTE;\r
165         int cp;\r
166         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {\r
167             cp = UTF16.charAt(string, i);\r
168             if (escapeCharacters.contains(cp)) {\r
169                 // we may have to fix up previous characters\r
170                 if (quotedChar == IN_QUOTE) {\r
171                     result.append(SINGLE_QUOTE);\r
172                     quotedChar = NO_QUOTE;\r
173                 }\r
174                 appendEscaped(result, cp);\r
175                 continue;\r
176             }\r
177             \r
178             if (needingQuoteCharacters.contains(cp)) {\r
179                 // if we have already started a quote\r
180                 if (quotedChar == IN_QUOTE) {\r
181                     UTF16.append(result, cp);\r
182                     if (usingQuote && cp == SINGLE_QUOTE) { // double it\r
183                         result.append(SINGLE_QUOTE);\r
184                     }\r
185                     continue;\r
186                 }\r
187                 // otherwise not already in quote\r
188                 if (usingSlash) {\r
189                     result.append(BACK_SLASH);\r
190                     UTF16.append(result, cp);\r
191                     continue;\r
192                 }\r
193                 if (usingQuote) {\r
194                     if (cp == SINGLE_QUOTE) { // double it and continue\r
195                         result.append(SINGLE_QUOTE);\r
196                         result.append(SINGLE_QUOTE);\r
197                         continue;\r
198                     }\r
199                     result.append(SINGLE_QUOTE);\r
200                     UTF16.append(result, cp);\r
201                     quotedChar = IN_QUOTE;\r
202                     continue;\r
203                 }\r
204                 // we have no choice but to use \\u or \\U\r
205                 appendEscaped(result, cp);\r
206                 continue;\r
207             }\r
208             // otherwise cp doesn't need quoting\r
209             // we may have to fix up previous characters\r
210             if (quotedChar == IN_QUOTE) {\r
211                 result.append(SINGLE_QUOTE);\r
212                 quotedChar = NO_QUOTE;\r
213             }\r
214             UTF16.append(result, cp);\r
215         }\r
216         // all done. \r
217         // we may have to fix up previous characters\r
218         if (quotedChar == IN_QUOTE) {\r
219             result.append(SINGLE_QUOTE);\r
220         }\r
221         return result.toString();\r
222     }\r
223 \r
224     private void appendEscaped(StringBuffer result, int cp) {\r
225         if (cp <= 0xFFFF) {\r
226             result.append("\\u").append(Utility.hex(cp,4));\r
227         } else {\r
228             result.append("\\U").append(Utility.hex(cp,8));\r
229         }\r
230     }\r
231     \r
232     public String normalize() {\r
233         int oldStart = start;\r
234         StringBuffer result = new StringBuffer();\r
235         StringBuffer buffer = new StringBuffer();\r
236         while (true) {\r
237             buffer.setLength(0);\r
238             int status = next(buffer);\r
239             if (status == DONE) {\r
240                 start = oldStart;\r
241                 return result.toString();\r
242             }\r
243             if (status != SYNTAX) {\r
244                 result.append(quoteLiteral(buffer));\r
245             } else {\r
246                 result.append(buffer);\r
247             }\r
248         }\r
249     }\r
250     \r
251     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;\r
252     \r
253     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;\r
254     \r
255     public int next(StringBuffer buffer) {\r
256         if (start >= limit) return DONE;\r
257         int status = UNKNOWN;\r
258         int lastQuote = UNKNOWN;\r
259         int quoteStatus = NONE;\r
260         int hexCount = 0;\r
261         int hexValue = 0;\r
262         int cp;\r
263         main:\r
264             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {\r
265                 cp = UTF16.charAt(pattern, i);\r
266                 // if we are in a quote, then handle it.\r
267                 switch (quoteStatus) {\r
268                 case SLASH_START:\r
269                     switch (cp) {\r
270                     case 'u':\r
271                         quoteStatus = HEX;\r
272                         hexCount = 4;\r
273                         hexValue = 0;\r
274                         continue main;\r
275                     case 'U': \r
276                         quoteStatus = HEX;\r
277                         hexCount = 8;\r
278                         hexValue = 0;\r
279                         continue main;\r
280                     default:\r
281                         if (usingSlash) {\r
282                             UTF16.append(buffer, cp);\r
283                             quoteStatus = NONE;\r
284                             continue main;\r
285                         } else {\r
286                             buffer.append(BACK_SLASH);\r
287                             quoteStatus = NONE;\r
288                         }\r
289                     }\r
290                     break; // fall through to NONE\r
291                 case HEX:\r
292                     hexValue <<= 4;\r
293                     hexValue += cp;\r
294                     switch (cp) {\r
295                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':\r
296                         hexValue -= '0'; break;\r
297                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':\r
298                         hexValue -= 'a' - 10; break;\r
299                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':\r
300                         hexValue -= 'A' - 10; break;\r
301                     default:\r
302                         start = i;\r
303                     return BROKEN_ESCAPE;\r
304                     }\r
305                     --hexCount;\r
306                     if (hexCount == 0) {\r
307                         quoteStatus = NONE;\r
308                         UTF16.append(buffer, hexValue);\r
309                     }\r
310                     continue main;\r
311                 case AFTER_QUOTE:\r
312                     // see if we get another quote character\r
313                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote\r
314                     if (cp == lastQuote) {\r
315                         UTF16.append(buffer, cp);\r
316                         quoteStatus = NORMAL_QUOTE;\r
317                         continue main;\r
318                     }\r
319                     quoteStatus = NONE;\r
320                     break; // fall through to NONE\r
321                 case START_QUOTE:\r
322                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote\r
323                     if (cp == lastQuote) {\r
324                         UTF16.append(buffer, cp);\r
325                         quoteStatus = NONE; // get out of quote, with no trace remaining\r
326                         continue;                            \r
327                     }\r
328                     // otherwise get into quote\r
329                     UTF16.append(buffer, cp);\r
330                     quoteStatus = NORMAL_QUOTE;\r
331                     continue main;\r
332                 case NORMAL_QUOTE: \r
333                     if (cp == lastQuote) {\r
334                         quoteStatus = AFTER_QUOTE; // get out of quote\r
335                         continue main;\r
336                     }\r
337                     UTF16.append(buffer, cp);\r
338                     continue main;\r
339                 }\r
340                 \r
341                 if (ignorableCharacters.contains(cp)) {\r
342                     continue;\r
343                 }\r
344                 // do syntax characters\r
345                 if (syntaxCharacters.contains(cp)) {\r
346                     if (status == UNKNOWN) {\r
347                         UTF16.append(buffer, cp);\r
348                         start = i + UTF16.getCharCount(cp);\r
349                         return SYNTAX;\r
350                     } else { // LITERAL, so back up and break\r
351                         start = i;\r
352                         return status;\r
353                     }\r
354                 }\r
355                 // otherwise it is a literal; keep on going\r
356                 status = LITERAL;\r
357                 if (cp == BACK_SLASH) {\r
358                     quoteStatus = SLASH_START;\r
359                     continue;\r
360                 } else if (usingQuote && cp == SINGLE_QUOTE) {\r
361                     lastQuote = cp;\r
362                     quoteStatus = START_QUOTE;\r
363                     continue;\r
364                 }\r
365                 // normal literals\r
366                 UTF16.append(buffer, cp);\r
367             }\r
368         // handle final cleanup\r
369         start = limit;\r
370         switch (quoteStatus) {\r
371         case HEX:\r
372             status = BROKEN_ESCAPE;\r
373             break;\r
374         case SLASH_START:\r
375             if (usingSlash) {\r
376                 status = BROKEN_ESCAPE;\r
377             } else {\r
378                 buffer.append(BACK_SLASH);\r
379             }\r
380             break;\r
381         case START_QUOTE: case NORMAL_QUOTE:\r
382             status = BROKEN_QUOTE;\r
383             break;\r
384         }\r
385         return status;\r
386     }\r
387     \r
388     \r
389 }\r
390 //eof\r