jars/icu4j-4_2_1-src/src/com/ibm/icu/impl/UnicodeRegex.java

   1 //##header\r
   2 //#if defined(FOUNDATION10) || defined(J2SE13) || defined(J2SE14)\r
   3 //#else\r
   4 /*\r
   5  *******************************************************************************\r
   6  * Copyright (C) 2009, Google, International Business Machines Corporation and *\r
   7  * others. All Rights Reserved.                                                *\r
   8  *******************************************************************************\r
   9  */\r
  10 package com.ibm.icu.impl;\r
  11 \r
  12 import java.io.BufferedReader;\r
  13 import java.io.FileInputStream;\r
  14 import java.io.IOException;\r
  15 import java.io.InputStream;\r
  16 import java.io.InputStreamReader;\r
  17 import java.io.UnsupportedEncodingException;\r
  18 import java.text.ParsePosition;\r
  19 import java.util.Arrays;\r
  20 import java.util.Comparator;\r
  21 import java.util.Iterator;\r
  22 import java.util.LinkedHashSet;\r
  23 import java.util.List;\r
  24 import java.util.Map;\r
  25 import java.util.Set;\r
  26 import java.util.TreeMap;\r
  27 import java.util.regex.Pattern;\r
  28 \r
  29 import com.ibm.icu.text.StringTransform;\r
  30 import com.ibm.icu.text.UnicodeSet;\r
  31 import com.ibm.icu.util.Freezable;\r
  32 \r
  33 /**\r
  34  * Contains utilities to supplement the JDK Regex, since it doesn't handle\r
  35  * Unicode well.\r
  36  * \r
  37  * @author markdavis\r
  38  */\r
  39 public class UnicodeRegex implements Cloneable, Freezable, StringTransform {\r
  40     // Note: we don't currently have any state, but intend to in the future,\r
  41     // particularly for the regex style supported.\r
  42 \r
  43     /**\r
  44      * Adds full Unicode property support, with the latest version of Unicode,\r
  45      * to Java Regex, bringing it up to Level 1 (see\r
  46      * http://www.unicode.org/reports/tr18/). It does this by preprocessing the\r
  47      * regex pattern string and interpreting the character classes (\p{...},\r
  48      * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With\r
  49      * this utility, Java regex expressions can be updated to work with the\r
  50      * latest version of Unicode, and with all Unicode properties. Note that the\r
  51      * UnicodeSet syntax has not yet, however, been updated to be completely\r
  52      * consistent with Java regex, so be careful of the differences.\r
  53      * <p>Not thread-safe; create a separate copy for different threads.\r
  54      * <p>In the future, we may extend this to support other regex packages.\r
  55      * \r
  56      * @regex A modified Java regex pattern, as in the input to\r
  57      *        Pattern.compile(), except that all "character classes" are\r
  58      *        processed as if they were UnicodeSet patterns. Example:\r
  59      *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.\r
  60      * @return A processed Java regex pattern, suitable for input to\r
  61      *         Pattern.compile().\r
  62      */\r
  63     public String transform(String regex) {\r
  64         StringBuffer result = new StringBuffer();\r
  65         UnicodeSet temp = new UnicodeSet();\r
  66         ParsePosition pos = new ParsePosition(0);\r
  67         int state = 0; // 1 = after \\r
  68 \r
  69         // We add each character unmodified to the output, unless we have a\r
  70         // UnicodeSet. Note that we don't worry about supplementary characters,\r
  71         // since none of the syntax uses them.\r
  72 \r
  73         for (int i = 0; i < regex.length(); ++i) {\r
  74             // look for UnicodeSets, allowing for quoting with \ and \Q\r
  75             char ch = regex.charAt(i);\r
  76             switch (state) {\r
  77             case 0: // we only care about \, and '['.\r
  78                 if (ch == '\\') {\r
  79                     if (UnicodeSet.resemblesPattern(regex, i)) {\r
  80                         // should only happen with \p\r
  81                         i = processSet(regex, i, result, temp, pos);\r
  82                         continue;\r
  83                     }\r
  84                     state = 1;\r
  85                 } else if (ch == '[') {\r
  86                     // if we have what looks like a UnicodeSet\r
  87                     if (UnicodeSet.resemblesPattern(regex, i)) {\r
  88                         i = processSet(regex, i, result, temp, pos);\r
  89                         continue;\r
  90                     }\r
  91                 }\r
  92                 break;\r
  93 \r
  94             case 1: // we are after a \\r
  95                 if (ch == 'Q') {\r
  96                     state = 1;\r
  97                 } else {\r
  98                     state = 0;\r
  99                 }\r
 100                 break;\r
 101 \r
 102             case 2: // we are in a \Q...\r
 103                 if (ch == '\\') {\r
 104                     state = 3;\r
 105                 }\r
 106                 break;\r
 107 \r
 108             case 3: // we are in at \Q...\\r
 109                 if (ch == 'E') {\r
 110                     state = 0;\r
 111                 }\r
 112                 state = 2;\r
 113                 break;\r
 114             }\r
 115             result.append(ch);\r
 116         }\r
 117         return result.toString();\r
 118     }\r
 119 \r
 120     /**\r
 121      * Convenience static function, using standard parameters.\r
 122      * @param regex as in process()\r
 123      * @return processed regex pattern, as in process()\r
 124      */\r
 125     public static String fix(String regex) {\r
 126         return STANDARD.transform(regex);\r
 127     }\r
 128 \r
 129     /**\r
 130      * Compile a regex string, after processing by fix(...).\r
 131      * \r
 132      * @param regex\r
 133      *            Raw regex pattern, as in fix(...).\r
 134      * @return Pattern\r
 135      */\r
 136     public static Pattern compile(String regex) {\r
 137         return Pattern.compile(STANDARD.transform(regex));\r
 138     }\r
 139 \r
 140     /**\r
 141      * Compile a regex string, after processing by fix(...).\r
 142      * \r
 143      * @param regex\r
 144      *            Raw regex pattern, as in fix(...).\r
 145      * @return Pattern\r
 146      */\r
 147     public static Pattern compile(String regex, int options) {\r
 148         return Pattern.compile(STANDARD.transform(regex), options);\r
 149     }\r
 150 \r
 151     /**\r
 152      * Compile a composed string from a set of BNF lines; see the List version for more information.\r
 153      * \r
 154      * @param bnfLines Series of BNF lines.\r
 155      * @return Pattern\r
 156      */\r
 157     public String compileBnf(String bnfLines) {\r
 158         return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));\r
 159     }\r
 160 \r
 161     /**\r
 162      * Compile a composed string from a set of BNF lines, such as for composing a regex\r
 163      * expression. The lines can be in any order, but there must not be any\r
 164      * cycles. The result can be used as input for fix().\r
 165      * <p>\r
 166      * Example:\r
 167      * <pre>\r
 168      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\r
 169      * scheme = reserved+;\r
 170      * host = // reserved+;\r
 171      * query = [\\=reserved]+;\r
 172      * fragment = reserved+;\r
 173      * reserved = [[:ascii:][:alphabetic:]];\r
 174      * </pre>\r
 175      * <p>\r
 176      * Caveats: at this point the parsing is simple; for example, # cannot be\r
 177      * quoted (use \\u0023); you can set it to null to disable. \r
 178      * The equality sign and a few others can be reset with\r
 179      * setBnfX().\r
 180      * \r
 181      * @param bnfLines\r
 182      *          Series of lines that represent a BNF expression. The lines contain\r
 183      *          a series of statements that of the form x=y;. A statement can take\r
 184      *          multiple lines, but there can't be multiple statements on a line.\r
 185      *          A hash quotes to the end of the line.\r
 186      * @return Pattern\r
 187      */\r
 188     public String compileBnf(List lines) {\r
 189         Map variables = getVariables(lines);\r
 190         Set unused = new LinkedHashSet(variables.keySet());\r
 191         // brute force replacement; do twice to allow for different order\r
 192         // later on can optimize\r
 193         for (int i = 0; i < 2; ++i) {\r
 194             for (Iterator it = variables.keySet().iterator(); it.hasNext();) {\r
 195                 String variable = (String) it.next();\r
 196                 String definition = (String) variables.get(variable);\r
 197                 for (Iterator it2 = variables.keySet().iterator(); it2.hasNext();) {\r
 198                     String variable2 = (String) it2.next();\r
 199                     if (variable.equals(variable2)) continue;\r
 200                     String definition2 = (String) variables.get(variable2);\r
 201                     String altered2 = definition2.replace(variable, definition);\r
 202                     if (!altered2.equals(definition2)) {\r
 203                         unused.remove(variable);\r
 204                         variables.put(variable2, altered2);\r
 205                         if (log != null) {\r
 206                             try {\r
 207                                 log.append(variable2 + "=" + altered2 + ";");\r
 208                             } catch (IOException e) {\r
 209                                 throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);\r
 210                             }\r
 211                         }\r
 212                     }\r
 213                 }\r
 214             }\r
 215         }\r
 216         if (unused.size() != 1) {\r
 217             throw new IllegalArgumentException("Not a single root: " + unused);\r
 218         }\r
 219         return (String) variables.get(unused.iterator().next());\r
 220     }\r
 221 \r
 222     public String getBnfCommentString() {\r
 223         return bnfCommentString;\r
 224     }\r
 225 \r
 226     public void setBnfCommentString(String bnfCommentString) {\r
 227         this.bnfCommentString = bnfCommentString;\r
 228     }\r
 229 \r
 230     public String getBnfVariableInfix() {\r
 231         return bnfVariableInfix;\r
 232     }\r
 233 \r
 234     public void setBnfVariableInfix(String bnfVariableInfix) {\r
 235         this.bnfVariableInfix = bnfVariableInfix;\r
 236     }\r
 237 \r
 238     public String getBnfLineSeparator() {\r
 239         return bnfLineSeparator;\r
 240     }\r
 241 \r
 242     public void setBnfLineSeparator(String bnfLineSeparator) {\r
 243         this.bnfLineSeparator = bnfLineSeparator;\r
 244     }\r
 245 \r
 246     /**\r
 247      * Utility for loading lines from a file.\r
 248      * @param result\r
 249      * @param file\r
 250      * @param encoding if null, then UTF-8\r
 251      * @return filled list\r
 252      * @throws IOException\r
 253      */\r
 254     public static List appendLines(List result, String file, String encoding) throws IOException {\r
 255         return appendLines(result, new FileInputStream(file), encoding);\r
 256     }\r
 257 \r
 258     /**\r
 259      * Utility for loading lines from a UTF8 file.\r
 260      * @param result\r
 261      * @param inputStream\r
 262      * @param encoding if null, then UTF-8\r
 263      * @return filled list\r
 264      * @throws IOException\r
 265      */\r
 266     public static List appendLines(List result, InputStream inputStream, String encoding)\r
 267             throws UnsupportedEncodingException, IOException {\r
 268         BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));\r
 269         while (true) {\r
 270             String line = in.readLine();\r
 271             if (line == null) break;\r
 272             result.add(line);\r
 273         }\r
 274         return result;\r
 275     }\r
 276     \r
 277     \r
 278 \r
 279     /* (non-Javadoc)\r
 280      * @see com.ibm.icu.util.Freezable#cloneAsThawed()\r
 281      */\r
 282     public Object cloneAsThawed() {\r
 283         // TODO Auto-generated method stub\r
 284         try {\r
 285             return this.clone();\r
 286         } catch (CloneNotSupportedException e) {\r
 287             throw new IllegalArgumentException(); // should never happen\r
 288         }\r
 289     }\r
 290 \r
 291     /* (non-Javadoc)\r
 292      * @see com.ibm.icu.util.Freezable#freeze()\r
 293      */\r
 294     public Object freeze() {\r
 295         // no action needed now.\r
 296         return this;\r
 297     }\r
 298 \r
 299     /* (non-Javadoc)\r
 300      * @see com.ibm.icu.util.Freezable#isFrozen()\r
 301      */\r
 302     public boolean isFrozen() {\r
 303         // at this point, always true\r
 304         return true;\r
 305     }\r
 306 \r
 307     // ===== PRIVATES =====\r
 308 \r
 309     private int processSet(String regex, int i, StringBuffer result, UnicodeSet temp, ParsePosition pos) {\r
 310         try {\r
 311             pos.setIndex(i);\r
 312             UnicodeSet x = temp.clear().applyPattern(regex, pos, null, 0);\r
 313             x.complement().complement(); // hack to fix toPattern\r
 314             result.append(x.toPattern(false));\r
 315             i = pos.getIndex() - 1; // allow for the loop increment\r
 316             return i;\r
 317         } catch (Exception e) {\r
 318             throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);\r
 319         }\r
 320     }\r
 321 \r
 322     private static UnicodeRegex STANDARD = new UnicodeRegex();\r
 323     private String bnfCommentString = "#";\r
 324     private String bnfVariableInfix = "=";\r
 325     private String bnfLineSeparator = "\n";\r
 326     private Appendable log = null;\r
 327 \r
 328     private Comparator LongestFirst = new Comparator () {\r
 329         public int compare(Object obj0, Object obj1) {\r
 330             String arg0 = obj0.toString();\r
 331             String arg1 = obj1.toString();\r
 332             int len0 = arg0.length();\r
 333             int len1 = arg1.length();\r
 334             if (len0 != len1) return len1 - len0;\r
 335             return arg0.compareTo(arg1);\r
 336         }\r
 337     };\r
 338 \r
 339     private Map getVariables(List lines) {\r
 340         Map variables = new TreeMap(LongestFirst);\r
 341         String variable = null;\r
 342         StringBuffer definition = new StringBuffer();\r
 343         int count = 0;\r
 344         for (Iterator it = lines.iterator(); it.hasNext();) {\r
 345             String line = (String)it.next();\r
 346             ++count;\r
 347             // remove initial bom, comments\r
 348             if (line.length() == 0) continue;\r
 349             if (line.charAt(0) == '\uFEFF') line = line.substring(1);\r
 350 \r
 351             if (bnfCommentString != null) {\r
 352                 int hashPos = line.indexOf(bnfCommentString);\r
 353                 if (hashPos >= 0) line = line.substring(0, hashPos);\r
 354             }\r
 355             String trimline = line.trim();\r
 356             if (trimline.length() == 0) continue;\r
 357 \r
 358             // String[] lineParts = line.split(";");\r
 359             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");\r
 360             if (linePart.trim().length() == 0) continue;\r
 361             boolean terminated = trimline.endsWith(";");\r
 362             if (terminated) {\r
 363                 linePart = linePart.substring(0,linePart.lastIndexOf(';'));\r
 364             }\r
 365             int equalsPos = linePart.indexOf(bnfVariableInfix);\r
 366             if (equalsPos >= 0) {\r
 367                 if (variable != null) {\r
 368                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);\r
 369                 }\r
 370                 variable = linePart.substring(0,equalsPos).trim();\r
 371                 if (variables.containsKey(variable)) {\r
 372                     throw new IllegalArgumentException("Duplicate variable definition in " + line);\r
 373                 }\r
 374                 definition.append(linePart.substring(equalsPos+1).trim());\r
 375             } else { // no equals, so\r
 376                 if (variable == null) {\r
 377                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);\r
 378                 }\r
 379                 definition.append(bnfLineSeparator).append(linePart);\r
 380             }\r
 381             // we are terminated if i is not at the end, or the line ends with a ;\r
 382             if (terminated) {\r
 383                 variables.put(variable, definition.toString());\r
 384                 variable = null; // signal we have no variable\r
 385                 definition.setLength(0);\r
 386             }\r
 387         }\r
 388         if (variable != null) {\r
 389             throw new IllegalArgumentException("Missing ';' at end");\r
 390         }\r
 391         return variables;\r
 392     }\r
 393 }\r
 394 //#endif\r
 395 \r