2 ********************************************************************************
3 * Copyright (C) 2009-2011, Google, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 ********************************************************************************
7 package com.ibm.icu.impl;
9 import java.io.BufferedReader;
10 import java.io.FileInputStream;
11 import java.io.IOException;
12 import java.io.InputStream;
13 import java.io.InputStreamReader;
14 import java.io.UnsupportedEncodingException;
15 import java.text.ParsePosition;
16 import java.util.Arrays;
17 import java.util.Comparator;
18 import java.util.LinkedHashSet;
19 import java.util.List;
21 import java.util.Map.Entry;
23 import java.util.TreeMap;
24 import java.util.regex.Pattern;
26 import com.ibm.icu.text.StringTransform;
27 import com.ibm.icu.text.SymbolTable;
28 import com.ibm.icu.text.UnicodeSet;
29 import com.ibm.icu.util.Freezable;
32 * Contains utilities to supplement the JDK Regex, since it doesn't handle
37 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
38 // Note: we don't currently have any state, but intend to in the future,
39 // particularly for the regex style supported.
41 private SymbolTable symbolTable;
44 * Get the symbol table for internal processing
47 public SymbolTable getSymbolTable() {
52 * Set the symbol table for internal processing
55 public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
56 this.symbolTable = symbolTable;
61 * Adds full Unicode property support, with the latest version of Unicode,
62 * to Java Regex, bringing it up to Level 1 (see
63 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
64 * regex pattern string and interpreting the character classes (\p{...},
65 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
66 * this utility, Java regex expressions can be updated to work with the
67 * latest version of Unicode, and with all Unicode properties. Note that the
68 * UnicodeSet syntax has not yet, however, been updated to be completely
69 * consistent with Java regex, so be careful of the differences.
70 * <p>Not thread-safe; create a separate copy for different threads.
71 * <p>In the future, we may extend this to support other regex packages.
73 * @param regex A modified Java regex pattern, as in the input to
74 * Pattern.compile(), except that all "character classes" are
75 * processed as if they were UnicodeSet patterns. Example:
76 * "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
77 * @return A processed Java regex pattern, suitable for input to
// Scans regex one char at a time. Wherever UnicodeSet.resemblesPattern reports a
// UnicodeSet at the current index, processSet appends the rewritten set to result
// and its return value becomes the new scan index.
80 public String transform(String regex) {
81 StringBuilder result = new StringBuilder();
// temp and pos are scratch objects handed to every processSet call.
82 UnicodeSet temp = new UnicodeSet();
83 ParsePosition pos = new ParsePosition(0);
84 int state = 0; // 0 = plain text, 1 = after \, 2 = inside \Q..., 3 = after \ inside \Q...
86 // We add each character unmodified to the output, unless we have a
87 // UnicodeSet. Note that we don't worry about supplementary characters,
88 // since none of the syntax uses them.
90 for (int i = 0; i < regex.length(); ++i) {
91 // look for UnicodeSets, allowing for quoting with \ and \Q
92 char ch = regex.charAt(i);
94 case 0: // we only care about \, and '['.
96 if (UnicodeSet.resemblesPattern(regex, i)) {
97 // should only happen with \p
98 i = processSet(regex, i, result, temp, pos);
102 } else if (ch == '[') {
103 // if we have what looks like a UnicodeSet
104 if (UnicodeSet.resemblesPattern(regex, i)) {
105 i = processSet(regex, i, result, temp, pos);
111 case 1: // we are after a \
119 case 2: // we are in a \Q...
125 case 3: // we are at a \ inside \Q...
134 return result.toString();
138 * Convenience static function, using standard parameters.
139 * @param regex as in process()
140 * @return processed regex pattern, as in process()
142 public static String fix(String regex) {
143 return STANDARD.transform(regex);
147 * Compile a regex string, after processing by fix(...).
149 * @param regex Raw regex pattern, as in fix(...).
152 public static Pattern compile(String regex) {
153 return Pattern.compile(STANDARD.transform(regex));
157 * Compile a regex string, after processing by fix(...).
159 * @param regex Raw regex pattern, as in fix(...).
162 public static Pattern compile(String regex, int options) {
163 return Pattern.compile(STANDARD.transform(regex), options);
167 * Compile a composed string from a set of BNF lines; see the List version for more information.
169 * @param bnfLines Series of BNF lines.
172 public String compileBnf(String bnfLines) {
173 return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
177 * Compile a composed string from a set of BNF lines, such as for composing a regex
178 * expression. The lines can be in any order, but there must not be any
179 * cycles. The result can be used as input for fix().
183 * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
184 * scheme = reserved+;
185 * host = // reserved+;
186 * query = [\\=reserved]+;
187 * fragment = reserved+;
188 * reserved = [[:ascii:][:alphabetic:]];
191 * Caveats: at this point the parsing is simple; for example, the comment marker # cannot be
192 * quoted (use \\u0023 instead); set the comment string to null to disable comments.
193 * The equality sign and a few others can be reset with
196 * @param lines Series of lines that represent a BNF expression. The lines contain
197 * a series of statements of the form x=y;. A statement can take
198 * multiple lines, but there can't be multiple statements on a line.
199 * A hash quotes to the end of the line.
// Expands variable references by plain text substitution: each variable's name is
// replaced by its definition inside the other definitions. Returns the definition
// of the single variable that was never substituted into another one (the root).
// Throws IllegalArgumentException when the number of such variables is not one.
202 public String compileBnf(List<String> lines) {
203 Map<String, String> variables = getVariables(lines);
// Every name starts as a root candidate; a name is removed once it has been
// found inside another definition.
204 Set<String> unused = new LinkedHashSet<String>(variables.keySet());
205 // brute force replacement; do twice to allow for different order
206 // later on can optimize
207 for (int i = 0; i < 2; ++i) {
208 for (Entry<String, String> entry : variables.entrySet()) {
209 String variable = entry.getKey(),
210 definition = entry.getValue();
212 for (Entry<String, String> entry2 : variables.entrySet()) {
213 String variable2 = entry2.getKey(),
214 definition2 = entry2.getValue();
215 if (variable.equals(variable2)) {
// String.replace substitutes every literal occurrence of the name; it is not a
// regex match and does not check word boundaries.
218 String altered2 = definition2.replace(variable, definition);
219 if (!altered2.equals(definition2)) {
220 unused.remove(variable);
// The key already exists, so this only overwrites the stored definition.
221 variables.put(variable2, altered2);
// Record the rewritten statement on the log sink.
224 log.append(variable2 + "=" + altered2 + ";");
225 } catch (IOException e) {
// Appendable.append can throw IOException; rethrow unchecked with the cause attached.
226 throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
// Exactly one name must remain unsubstituted: the root expression.
233 if (unused.size() != 1) {
234 throw new IllegalArgumentException("Not a single root: " + unused);
236 return variables.get(unused.iterator().next());
239 public String getBnfCommentString() {
240 return bnfCommentString;
243 public void setBnfCommentString(String bnfCommentString) {
244 this.bnfCommentString = bnfCommentString;
247 public String getBnfVariableInfix() {
248 return bnfVariableInfix;
251 public void setBnfVariableInfix(String bnfVariableInfix) {
252 this.bnfVariableInfix = bnfVariableInfix;
255 public String getBnfLineSeparator() {
256 return bnfLineSeparator;
259 public void setBnfLineSeparator(String bnfLineSeparator) {
260 this.bnfLineSeparator = bnfLineSeparator;
264 * Utility for loading lines from a file.
265 * @param result The result of the appended lines.
266 * @param file The file to have an input stream.
267 * @param encoding if null, then UTF-8
268 * @return filled list
269 * @throws IOException If there were problems opening the file for input stream.
271 public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
272 return appendLines(result, new FileInputStream(file), encoding);
276 * Utility for loading lines from an input stream.
277 * @param result The result of the appended lines.
278 * @param inputStream The input stream.
279 * @param encoding if null, then UTF-8
280 * @return filled list
281 * @throws IOException If there were problems opening the input stream for reading.
// Reads text lines from inputStream through a BufferedReader.
// NOTE(review): no close() is visible in this method; presumably the caller owns
// inputStream and closes it. Confirm.
283 public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
284 throws UnsupportedEncodingException, IOException {
// Decode with the requested charset, falling back to UTF-8 when encoding is null.
285 BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
287 String line = in.readLine();
288 if (line == null) break; // readLine() returns null at end of stream
297 * @see com.ibm.icu.util.Freezable#cloneAsThawed()
299 public UnicodeRegex cloneAsThawed() {
300 // TODO Auto-generated method stub
302 return (UnicodeRegex)clone();
303 } catch (CloneNotSupportedException e) {
304 throw new IllegalArgumentException(); // should never happen
309 * @see com.ibm.icu.util.Freezable#freeze()
311 public UnicodeRegex freeze() {
312 // no action needed now.
317 * @see com.ibm.icu.util.Freezable#isFrozen()
319 public boolean isFrozen() {
320 // at this point, always true
324 // ===== PRIVATES =====
// Parses one UnicodeSet pattern out of regex using pos, and appends the pattern
// text of the resulting set to result. Any exception raised on the way is rethrown
// as an IllegalArgumentException that names the whole regex and carries the
// original exception as its cause.
326 private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
// temp is cleared and reused, so no new UnicodeSet is allocated per call.
329 UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
// Complementing twice leaves the set contents unchanged; per the note below it is
// a workaround for toPattern (presumably it discards the remembered source
// pattern. Confirm against UnicodeSet).
330 x.complement().complement(); // hack to fix toPattern
331 result.append(x.toPattern(false));
// pos now sits just past the parsed set.
332 i = pos.getIndex() - 1; // allow for the loop increment
334 } catch (Exception e) {
335 throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
339 private static UnicodeRegex STANDARD = new UnicodeRegex();
340 private String bnfCommentString = "#";
341 private String bnfVariableInfix = "=";
342 private String bnfLineSeparator = "\n";
343 private Appendable log = null;
345 private Comparator<Object> LongestFirst = new Comparator<Object>() {
346 public int compare(Object obj0, Object obj1) {
347 String arg0 = obj0.toString();
348 String arg1 = obj1.toString();
349 int len0 = arg0.length();
350 int len1 = arg1.length();
351 if (len0 != len1) return len1 - len0;
352 return arg0.compareTo(arg1);
356 private Map<String, String> getVariables(List<String> lines) {
357 Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
358 String variable = null;
359 StringBuffer definition = new StringBuffer();
361 for (String line : lines) {
363 // remove initial bom, comments
364 if (line.length() == 0) continue;
365 if (line.charAt(0) == '\uFEFF') line = line.substring(1);
367 if (bnfCommentString != null) {
368 int hashPos = line.indexOf(bnfCommentString);
369 if (hashPos >= 0) line = line.substring(0, hashPos);
371 String trimline = line.trim();
372 if (trimline.length() == 0) continue;
374 // String[] lineParts = line.split(";");
375 String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
376 if (linePart.trim().length() == 0) continue;
377 boolean terminated = trimline.endsWith(";");
379 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
381 int equalsPos = linePart.indexOf(bnfVariableInfix);
382 if (equalsPos >= 0) {
383 if (variable != null) {
384 throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
386 variable = linePart.substring(0,equalsPos).trim();
387 if (variables.containsKey(variable)) {
388 throw new IllegalArgumentException("Duplicate variable definition in " + line);
390 definition.append(linePart.substring(equalsPos+1).trim());
391 } else { // no equals, so
392 if (variable == null) {
393 throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
395 definition.append(bnfLineSeparator).append(linePart);
397 // we are terminated if i is not at the end, or the line ends with a ;
399 variables.put(variable, definition.toString());
400 variable = null; // signal we have no variable
401 definition.setLength(0);
404 if (variable != null) {
405 throw new IllegalArgumentException("Missing ';' at end");