2 //#if defined(FOUNDATION10) || defined(J2SE13) || defined(J2SE14)
5 *******************************************************************************
6 * Copyright (C) 2009, Google, International Business Machines Corporation and *
7 * others. All Rights Reserved. *
8 *******************************************************************************
10 package com.ibm.icu.impl;
12 import java.io.BufferedReader;
13 import java.io.FileInputStream;
14 import java.io.IOException;
15 import java.io.InputStream;
16 import java.io.InputStreamReader;
17 import java.io.UnsupportedEncodingException;
18 import java.text.ParsePosition;
19 import java.util.Arrays;
20 import java.util.Comparator;
21 import java.util.Iterator;
22 import java.util.LinkedHashSet;
23 import java.util.List;
26 import java.util.TreeMap;
27 import java.util.regex.Pattern;
29 import com.ibm.icu.text.StringTransform;
30 import com.ibm.icu.text.UnicodeSet;
31 import com.ibm.icu.util.Freezable;
34 * Contains utilities to supplement the JDK Regex, since it doesn't handle
39 public class UnicodeRegex implements Cloneable, Freezable, StringTransform {
40 // Note: we don't currently have any state, but intend to in the future,
41 // particularly for the regex style supported.
44 * Adds full Unicode property support, with the latest version of Unicode,
45 * to Java Regex, bringing it up to Level 1 (see
46 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
47 * regex pattern string and interpreting the character classes (\p{...},
48 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
49 * this utility, Java regex expressions can be updated to work with the
50 * latest version of Unicode, and with all Unicode properties. Note that the
51 * UnicodeSet syntax has not yet, however, been updated to be completely
52 * consistent with Java regex, so be careful of the differences.
53 * <p>Not thread-safe; create a separate copy for different threads.
54 * <p>In the future, we may extend this to support other regex packages.
56 * @param regex A modified Java regex pattern, as in the input to
57 * Pattern.compile(), except that all "character classes" are
58 * processed as if they were UnicodeSet patterns. Example:
59 * "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
60 * @return A processed Java regex pattern, suitable for input to
// Scans the regex one char at a time. Where UnicodeSet.resemblesPattern reports a
// set at the current index (after a backslash-style start or at '['), the set is
// handed to processSet, which appends its replacement text and returns the index
// to resume from. All working objects below are locals created on each call.
63 public String transform(String regex) {
64 StringBuffer result = new StringBuffer();
// scratch set and parse position, passed to processSet for every set that is found
65 UnicodeSet temp = new UnicodeSet();
66 ParsePosition pos = new ParsePosition(0);
67 int state = 0; // 0 = normal, 1 = after \, 2 = inside \Q..., 3 = inside \Q... after a \
69 // We add each character unmodified to the output, unless we have a
70 // UnicodeSet. Note that we don't worry about supplementary characters,
71 // since none of the syntax uses them.
73 for (int i = 0; i < regex.length(); ++i) {
74 // look for UnicodeSets, allowing for quoting with \ and \Q
75 char ch = regex.charAt(i);
77 case 0: // we only care about \, and '['.
79 if (UnicodeSet.resemblesPattern(regex, i)) {
80 // should only happen with \p
// processSet returns the index of the last char it consumed, so the loop's ++i moves past the set
81 i = processSet(regex, i, result, temp, pos);
85 } else if (ch == '[') {
86 // if we have what looks like a UnicodeSet
87 if (UnicodeSet.resemblesPattern(regex, i)) {
88 i = processSet(regex, i, result, temp, pos);
94 case 1: // we are after a \
102 case 2: // we are in a \Q...
108 case 3: // we are inside \Q... and have just seen a \
117 return result.toString();
121 * Convenience static function, using standard parameters.
122 * @param regex as in process()
123 * @return processed regex pattern, as in process()
125 public static String fix(String regex) {
126 return STANDARD.transform(regex);
130 * Compile a regex string, after processing by fix(...).
133 * Raw regex pattern, as in fix(...).
136 public static Pattern compile(String regex) {
137 return Pattern.compile(STANDARD.transform(regex));
141 * Compile a regex string, after processing by fix(...).
144 * Raw regex pattern, as in fix(...).
147 public static Pattern compile(String regex, int options) {
148 return Pattern.compile(STANDARD.transform(regex), options);
152 * Compile a composed string from a set of BNF lines; see the List version for more information.
154 * @param bnfLines Series of BNF lines.
157 public String compileBnf(String bnfLines) {
158 return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
162 * Compile a composed string from a set of BNF lines, such as for composing a regex
163 * expression. The lines can be in any order, but there must not be any
164 * cycles. The result can be used as input for fix().
168 * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
169 * scheme = reserved+;
170 * host = // reserved+;
171 * query = [\\=reserved]+;
172 * fragment = reserved+;
173 * reserved = [[:ascii:][:alphabetic:]];
176 * Caveats: at this point the parsing is simple; for example, # cannot be
177 * quoted (use \\u0023); you can set the comment string to null to disable comments.
178 * The equality sign and a few others can be reset with
182 * Series of lines that represent a BNF expression. The lines contain
183 * a series of statements of the form x=y;. A statement can take
184 * multiple lines, but there can't be multiple statements on a line.
185 * A hash (#) starts a comment that runs to the end of the line.
// Expands variable references by plain text substitution: each variable's name is
// replaced by its definition inside every other definition. A variable that gets
// substituted into another is removed from 'unused'; exactly one variable (the
// root) must remain, and its fully expanded definition is returned.
188 public String compileBnf(List lines) {
189 Map variables = getVariables(lines);
// every variable starts out as a root candidate; LinkedHashSet keeps the map's key order
190 Set unused = new LinkedHashSet(variables.keySet());
191 // brute force replacement; do twice to allow for different order
192 // later on can optimize
193 for (int i = 0; i < 2; ++i) {
194 for (Iterator it = variables.keySet().iterator(); it.hasNext();) {
195 String variable = (String) it.next();
// read at this point, so it already contains substitutions made earlier in the pass
196 String definition = (String) variables.get(variable);
197 for (Iterator it2 = variables.keySet().iterator(); it2.hasNext();) {
198 String variable2 = (String) it2.next();
199 if (variable.equals(variable2)) continue;
200 String definition2 = (String) variables.get(variable2);
// String.replace is a literal substring replacement, not token-aware: the name is
// replaced wherever it occurs in the other definition
201 String altered2 = definition2.replace(variable, definition);
202 if (!altered2.equals(definition2)) {
203 unused.remove(variable);
// put() on an existing key only replaces the value; for the TreeMap built in
// getVariables that is not a structural change, so both live key iterators stay valid
204 variables.put(variable2, altered2);
// NOTE(review): 'log' is initialised to null and nothing visible in this file assigns it;
// presumably this call is guarded by a null check - confirm
207 log.append(variable2 + "=" + altered2 + ";");
208 } catch (IOException e) {
// Appendable.append declares IOException; rethrown unchecked with the cause attached
209 throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
// anything other than exactly one never-substituted variable is rejected
216 if (unused.size() != 1) {
217 throw new IllegalArgumentException("Not a single root: " + unused);
219 return (String) variables.get(unused.iterator().next());
222 public String getBnfCommentString() {
223 return bnfCommentString;
226 public void setBnfCommentString(String bnfCommentString) {
227 this.bnfCommentString = bnfCommentString;
230 public String getBnfVariableInfix() {
231 return bnfVariableInfix;
234 public void setBnfVariableInfix(String bnfVariableInfix) {
235 this.bnfVariableInfix = bnfVariableInfix;
238 public String getBnfLineSeparator() {
239 return bnfLineSeparator;
242 public void setBnfLineSeparator(String bnfLineSeparator) {
243 this.bnfLineSeparator = bnfLineSeparator;
247 * Utility for loading lines from a file.
250 * @param encoding if null, then UTF-8
251 * @return filled list
252 * @throws IOException
254 public static List appendLines(List result, String file, String encoding) throws IOException {
255 return appendLines(result, new FileInputStream(file), encoding);
259 * Utility for loading lines from an input stream.
262 * @param encoding if null, then UTF-8
263 * @return filled list
264 * @throws IOException
// Reads the stream line by line through a BufferedReader.
266 public static List appendLines(List result, InputStream inputStream, String encoding)
267 throws UnsupportedEncodingException, IOException {
// bytes are decoded with the given encoding, falling back to UTF-8 when encoding is null
268 BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
270 String line = in.readLine();
// readLine() returns null once the end of the stream is reached
271 if (line == null) break;
280 * @see com.ibm.icu.util.Freezable#cloneAsThawed()
// Freezable.cloneAsThawed() implementation.
282 public Object cloneAsThawed() {
283 // TODO Auto-generated method stub
// The class declares Cloneable, so CloneNotSupportedException is not expected here.
// NOTE(review): the caught exception is discarded; passing it as the cause would make
// an unexpected failure easier to diagnose.
286 } catch (CloneNotSupportedException e) {
287 throw new IllegalArgumentException(); // should never happen
292 * @see com.ibm.icu.util.Freezable#freeze()
// Freezable.freeze() implementation.
// NOTE(review): the class has mutable settings (bnfCommentString, bnfVariableInfix,
// bnfLineSeparator) with public setters that do not check for a frozen state, so
// "no action needed" may no longer hold - confirm.
294 public Object freeze() {
295 // no action needed now.
300 * @see com.ibm.icu.util.Freezable#isFrozen()
// Freezable.isFrozen() implementation.
// NOTE(review): the comment below says the object always counts as frozen, yet the
// public BNF setters can still modify it - confirm this is intended.
302 public boolean isFrozen() {
303 // at this point, always true
307 // ===== PRIVATES =====
// Parses one UnicodeSet out of 'regex' into the scratch set 'temp', appends the set's
// pattern string to 'result', and returns the index the caller's loop should continue
// from. Any failure is rethrown as IllegalArgumentException naming the whole regex.
// NOTE(review): 'pos' is presumably positioned at i before applyPattern runs - confirm.
309 private int processSet(String regex, int i, StringBuffer result, UnicodeSet temp, ParsePosition pos) {
// temp is cleared first, so no contents carry over from a previous call
312 UnicodeSet x = temp.clear().applyPattern(regex, pos, null, 0);
// complementing twice leaves the contents unchanged; per the original comment it is
// only there to influence what toPattern produces
313 x.complement().complement(); // hack to fix toPattern
314 result.append(x.toPattern(false));
// pos now sits just past the parsed set; step back one so the caller's ++i lands there
315 i = pos.getIndex() - 1; // allow for the loop increment
317 } catch (Exception e) {
// the original exception is kept as the cause
318 throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
322 private static UnicodeRegex STANDARD = new UnicodeRegex();
323 private String bnfCommentString = "#";
324 private String bnfVariableInfix = "=";
325 private String bnfLineSeparator = "\n";
326 private Appendable log = null;
328 private Comparator LongestFirst = new Comparator () {
329 public int compare(Object obj0, Object obj1) {
330 String arg0 = obj0.toString();
331 String arg1 = obj1.toString();
332 int len0 = arg0.length();
333 int len1 = arg1.length();
334 if (len0 != len1) return len1 - len0;
335 return arg0.compareTo(arg1);
// Parses BNF lines into a map from variable name to definition text. A statement
// starts on a line containing the variable infix and may continue over following
// lines; continuation lines are joined with bnfLineSeparator.
// Throws IllegalArgumentException for a duplicate variable, a continuation line with
// no open statement, a new statement while one is still open, or an unterminated
// final statement.
339 private Map getVariables(List lines) {
// keys are sorted by LongestFirst, so iteration visits longer variable names first
340 Map variables = new TreeMap(LongestFirst);
// name of the statement currently being collected; null when none is open
341 String variable = null;
342 StringBuffer definition = new StringBuffer();
344 for (Iterator it = lines.iterator(); it.hasNext();) {
345 String line = (String)it.next();
347 // remove initial bom, comments
348 if (line.length() == 0) continue;
349 if (line.charAt(0) == '\uFEFF') line = line.substring(1);
351 if (bnfCommentString != null) {
// everything from the first occurrence of the comment string onwards is dropped
352 int hashPos = line.indexOf(bnfCommentString);
353 if (hashPos >= 0) line = line.substring(0, hashPos);
355 String trimline = line.trim();
356 if (trimline.length() == 0) continue;
358 // String[] lineParts = line.split(";");
359 String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
// linePart is the same string as line here, so this repeats the emptiness check above
360 if (linePart.trim().length() == 0) continue;
361 boolean terminated = trimline.endsWith(";");
// cuts the line at its last ';' (presumably only when 'terminated' is true - confirm)
363 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
365 int equalsPos = linePart.indexOf(bnfVariableInfix);
366 if (equalsPos >= 0) {
// NOTE(review): any line containing the infix is treated as the start of a statement,
// so a continuation line that contains the infix is reported as a missing ';'
367 if (variable != null) {
368 throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
370 variable = linePart.substring(0,equalsPos).trim();
371 if (variables.containsKey(variable)) {
372 throw new IllegalArgumentException("Duplicate variable definition in " + line);
// NOTE(review): skips exactly one char after equalsPos; if bnfVariableInfix is set to a
// longer string, the rest of the infix ends up in the definition
374 definition.append(linePart.substring(equalsPos+1).trim());
375 } else { // no equals, so
376 if (variable == null) {
377 throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
// continuation line: appended untrimmed, after the configured line separator
379 definition.append(bnfLineSeparator).append(linePart);
381 // we are terminated if i is not at the end, or the line ends with a ;
// stores the finished statement and resets the collector for the next one
383 variables.put(variable, definition.toString());
384 variable = null; // signal we have no variable
385 definition.setLength(0);
// a statement still open after the last line never saw its terminating ';'
388 if (variable != null) {
389 throw new IllegalArgumentException("Missing ';' at end");