2 ********************************************************************************
\r
3 * Copyright (C) 2009-2010, Google, International Business Machines Corporation *
\r
4 * and others. All Rights Reserved. *
\r
5 ********************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.BufferedReader;
\r
10 import java.io.FileInputStream;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.io.InputStreamReader;
\r
14 import java.io.UnsupportedEncodingException;
\r
15 import java.text.ParsePosition;
\r
16 import java.util.Arrays;
\r
17 import java.util.Comparator;
\r
18 import java.util.Iterator;
\r
19 import java.util.LinkedHashSet;
\r
20 import java.util.List;
\r
21 import java.util.Map;
\r
22 import java.util.Set;
\r
23 import java.util.TreeMap;
\r
24 import java.util.regex.Pattern;
\r
26 import com.ibm.icu.text.StringTransform;
\r
27 import com.ibm.icu.text.UnicodeSet;
\r
28 import com.ibm.icu.util.Freezable;
\r
31 * Contains utilities to supplement the JDK Regex, since it doesn't handle
\r
36 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
\r
37 // Note: we don't currently have any state, but intend to in the future,
\r
38 // particularly for the regex style supported.
\r
41 * Adds full Unicode property support, with the latest version of Unicode,
\r
42 * to Java Regex, bringing it up to Level 1 (see
\r
43 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
\r
44 * regex pattern string and interpreting the character classes (\p{...},
\r
45 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
\r
46 * this utility, Java regex expressions can be updated to work with the
\r
47 * latest version of Unicode, and with all Unicode properties. Note that the
\r
48 * UnicodeSet syntax has not yet, however, been updated to be completely
\r
49 * consistent with Java regex, so be careful of the differences.
\r
50 * <p>Not thread-safe; create a separate copy for different threads.
\r
51 * <p>In the future, we may extend this to support other regex packages.
\r
53 * @regex A modified Java regex pattern, as in the input to
\r
54 * Pattern.compile(), except that all "character classes" are
\r
55 * processed as if they were UnicodeSet patterns. Example:
\r
56 * "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
\r
57 * @return A processed Java regex pattern, suitable for input to
\r
58 * Pattern.compile().
\r
60 public String transform(String regex) {
\r
61 StringBuilder result = new StringBuilder();
\r
62 UnicodeSet temp = new UnicodeSet();
\r
63 ParsePosition pos = new ParsePosition(0);
\r
64 int state = 0; // 1 = after \
\r
66 // We add each character unmodified to the output, unless we have a
\r
67 // UnicodeSet. Note that we don't worry about supplementary characters,
\r
68 // since none of the syntax uses them.
\r
70 for (int i = 0; i < regex.length(); ++i) {
\r
71 // look for UnicodeSets, allowing for quoting with \ and \Q
\r
72 char ch = regex.charAt(i);
\r
74 case 0: // we only care about \, and '['.
\r
76 if (UnicodeSet.resemblesPattern(regex, i)) {
\r
77 // should only happen with \p
\r
78 i = processSet(regex, i, result, temp, pos);
\r
82 } else if (ch == '[') {
\r
83 // if we have what looks like a UnicodeSet
\r
84 if (UnicodeSet.resemblesPattern(regex, i)) {
\r
85 i = processSet(regex, i, result, temp, pos);
\r
91 case 1: // we are after a \
\r
99 case 2: // we are in a \Q...
\r
105 case 3: // we are in at \Q...\
\r
114 return result.toString();
\r
118 * Convenience static function, using standard parameters.
\r
119 * @param regex as in process()
\r
120 * @return processed regex pattern, as in process()
\r
122 public static String fix(String regex) {
\r
123 return STANDARD.transform(regex);
\r
127 * Compile a regex string, after processing by fix(...).
\r
129 * @param regex Raw regex pattern, as in fix(...).
\r
132 public static Pattern compile(String regex) {
\r
133 return Pattern.compile(STANDARD.transform(regex));
\r
137 * Compile a regex string, after processing by fix(...).
\r
139 * @param regex Raw regex pattern, as in fix(...).
\r
142 public static Pattern compile(String regex, int options) {
\r
143 return Pattern.compile(STANDARD.transform(regex), options);
\r
147 * Compile a composed string from a set of BNF lines; see the List version for more information.
\r
149 * @param bnfLines Series of BNF lines.
\r
152 public String compileBnf(String bnfLines) {
\r
153 return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
\r
157 * Compile a composed string from a set of BNF lines, such as for composing a regex
\r
158 * expression. The lines can be in any order, but there must not be any
\r
159 * cycles. The result can be used as input for fix().
\r
163 * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
\r
164 * scheme = reserved+;
\r
165 * host = // reserved+;
\r
166 * query = [\\=reserved]+;
\r
167 * fragment = reserved+;
\r
168 * reserved = [[:ascii:][:alphabetic:]];
\r
171 * Caveats: at this point the parsing is simple; for example, # cannot be
\r
172 * quoted (use \\u0023); you can set it to null to disable.
\r
173 * The equality sign and a few others can be reset with
\r
176 * @param lines Series of lines that represent a BNF expression. The lines contain
\r
177 * a series of statements that of the form x=y;. A statement can take
\r
178 * multiple lines, but there can't be multiple statements on a line.
\r
179 * A hash quotes to the end of the line.
\r
182 public String compileBnf(List<String> lines) {
\r
183 Map<String, String> variables = getVariables(lines);
\r
184 Set<String> unused = new LinkedHashSet<String>(variables.keySet());
\r
185 // brute force replacement; do twice to allow for different order
\r
186 // later on can optimize
\r
187 for (int i = 0; i < 2; ++i) {
\r
188 for (Iterator<String> it = variables.keySet().iterator(); it.hasNext();) {
\r
189 String variable = it.next();
\r
190 String definition = variables.get(variable);
\r
191 for (Iterator<String> it2 = variables.keySet().iterator(); it2.hasNext();) {
\r
192 String variable2 = it2.next();
\r
193 if (variable.equals(variable2)) continue;
\r
194 String definition2 = variables.get(variable2);
\r
195 String altered2 = definition2.replace(variable, definition);
\r
196 if (!altered2.equals(definition2)) {
\r
197 unused.remove(variable);
\r
198 variables.put(variable2, altered2);
\r
201 log.append(variable2 + "=" + altered2 + ";");
\r
202 } catch (IOException e) {
\r
203 throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
\r
210 if (unused.size() != 1) {
\r
211 throw new IllegalArgumentException("Not a single root: " + unused);
\r
213 return variables.get(unused.iterator().next());
\r
216 public String getBnfCommentString() {
\r
217 return bnfCommentString;
\r
220 public void setBnfCommentString(String bnfCommentString) {
\r
221 this.bnfCommentString = bnfCommentString;
\r
224 public String getBnfVariableInfix() {
\r
225 return bnfVariableInfix;
\r
228 public void setBnfVariableInfix(String bnfVariableInfix) {
\r
229 this.bnfVariableInfix = bnfVariableInfix;
\r
232 public String getBnfLineSeparator() {
\r
233 return bnfLineSeparator;
\r
236 public void setBnfLineSeparator(String bnfLineSeparator) {
\r
237 this.bnfLineSeparator = bnfLineSeparator;
\r
241 * Utility for loading lines from a file.
\r
242 * @param result The result of the appended lines.
\r
243 * @param file The file to have an input stream.
\r
244 * @param encoding if null, then UTF-8
\r
245 * @return filled list
\r
246 * @throws IOException If there were problems opening the file for input stream.
\r
248 public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
\r
249 return appendLines(result, new FileInputStream(file), encoding);
\r
253 * Utility for loading lines from a UTF8 file.
\r
254 * @param result The result of the appended lines.
\r
255 * @param inputStream The input stream.
\r
256 * @param encoding if null, then UTF-8
\r
257 * @return filled list
\r
258 * @throws IOException If there were problems opening the input stream for reading.
\r
260 public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
\r
261 throws UnsupportedEncodingException, IOException {
\r
262 BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
\r
264 String line = in.readLine();
\r
265 if (line == null) break;
\r
274 * @see com.ibm.icu.util.Freezable#cloneAsThawed()
\r
276 public UnicodeRegex cloneAsThawed() {
\r
277 // TODO Auto-generated method stub
\r
279 return (UnicodeRegex)clone();
\r
280 } catch (CloneNotSupportedException e) {
\r
281 throw new IllegalArgumentException(); // should never happen
\r
286 * @see com.ibm.icu.util.Freezable#freeze()
\r
288 public UnicodeRegex freeze() {
\r
289 // no action needed now.
\r
294 * @see com.ibm.icu.util.Freezable#isFrozen()
\r
296 public boolean isFrozen() {
\r
297 // at this point, always true
\r
301 // ===== PRIVATES =====
\r
303 private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
\r
306 UnicodeSet x = temp.clear().applyPattern(regex, pos, null, 0);
\r
307 x.complement().complement(); // hack to fix toPattern
\r
308 result.append(x.toPattern(false));
\r
309 i = pos.getIndex() - 1; // allow for the loop increment
\r
311 } catch (Exception e) {
\r
312 throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
\r
316 private static UnicodeRegex STANDARD = new UnicodeRegex();
\r
317 private String bnfCommentString = "#";
\r
318 private String bnfVariableInfix = "=";
\r
319 private String bnfLineSeparator = "\n";
\r
320 private Appendable log = null;
\r
322 private Comparator<Object> LongestFirst = new Comparator<Object>() {
\r
323 public int compare(Object obj0, Object obj1) {
\r
324 String arg0 = obj0.toString();
\r
325 String arg1 = obj1.toString();
\r
326 int len0 = arg0.length();
\r
327 int len1 = arg1.length();
\r
328 if (len0 != len1) return len1 - len0;
\r
329 return arg0.compareTo(arg1);
\r
333 private Map<String, String> getVariables(List<String> lines) {
\r
334 Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
\r
335 String variable = null;
\r
336 StringBuffer definition = new StringBuffer();
\r
338 for (Iterator<String> it = lines.iterator(); it.hasNext();) {
\r
339 String line = it.next();
\r
341 // remove initial bom, comments
\r
342 if (line.length() == 0) continue;
\r
343 if (line.charAt(0) == '\uFEFF') line = line.substring(1);
\r
345 if (bnfCommentString != null) {
\r
346 int hashPos = line.indexOf(bnfCommentString);
\r
347 if (hashPos >= 0) line = line.substring(0, hashPos);
\r
349 String trimline = line.trim();
\r
350 if (trimline.length() == 0) continue;
\r
352 // String[] lineParts = line.split(";");
\r
353 String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
\r
354 if (linePart.trim().length() == 0) continue;
\r
355 boolean terminated = trimline.endsWith(";");
\r
357 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
\r
359 int equalsPos = linePart.indexOf(bnfVariableInfix);
\r
360 if (equalsPos >= 0) {
\r
361 if (variable != null) {
\r
362 throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
\r
364 variable = linePart.substring(0,equalsPos).trim();
\r
365 if (variables.containsKey(variable)) {
\r
366 throw new IllegalArgumentException("Duplicate variable definition in " + line);
\r
368 definition.append(linePart.substring(equalsPos+1).trim());
\r
369 } else { // no equals, so
\r
370 if (variable == null) {
\r
371 throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
\r
373 definition.append(bnfLineSeparator).append(linePart);
\r
375 // we are terminated if i is not at the end, or the line ends with a ;
\r
377 variables.put(variable, definition.toString());
\r
378 variable = null; // signal we have no variable
\r
379 definition.setLength(0);
\r
382 if (variable != null) {
\r
383 throw new IllegalArgumentException("Missing ';' at end");
\r