2 //#if defined(FOUNDATION10) || defined(J2SE13) || defined(J2SE14)
\r
5 *******************************************************************************
\r
6 * Copyright (C) 2009, International Business Machines Corporation and *
\r
7 * others. All Rights Reserved. *
\r
8 *******************************************************************************
\r
10 package com.ibm.icu.dev.test.translit;
\r
12 import java.util.ArrayList;
\r
13 import java.util.List;
\r
14 import java.util.regex.Matcher;
\r
15 import java.util.regex.Pattern;
\r
17 import com.ibm.icu.dev.test.TestFmwk;
\r
18 import com.ibm.icu.impl.UnicodeRegex;
\r
19 import com.ibm.icu.lang.UCharacter;
\r
20 import com.ibm.icu.lang.UProperty;
\r
21 import com.ibm.icu.lang.UProperty.NameChoice;
\r
22 import com.ibm.icu.text.Transliterator;
\r
23 import com.ibm.icu.text.UTF16;
\r
24 import com.ibm.icu.text.UnicodeSet;
\r
29 public class RegexUtilitiesTest extends TestFmwk {
\r
31 public static void main(String[] args) throws Exception {
\r
32 new RegexUtilitiesTest().run(args);
\r
/**
 * Check basic construction.
 */
\r
38 public void TestConstruction() {
\r
39 String[][] tests = {
\r
42 {"[ba-z]", "[a-z]"},
\r
43 {"q[ba-z]", "q[a-z]"},
\r
44 {"[ba-z]q", "[a-z]q"},
\r
45 {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},
\r
46 {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},
\r
47 {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},
\r
48 {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},
\r
50 for (int i = 0; i < tests.length; ++i) {
\r
51 final String source = tests[i][0];
\r
52 String expected = tests[i].length == 1 ? source : tests[i][1];
\r
53 String actual = UnicodeRegex.fix(source);
\r
54 assertEquals(source, expected, actual);
\r
58 Transliterator hex = Transliterator.getInstance("hex");
\r
/**
 * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
 * character works.
 */
\r
64 public void TestCharacters() {
\r
65 UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
\r
66 boolean skip = getInclusion() < 10;
\r
67 for (int cp = 0; cp < 0x110000; ++cp) {
\r
68 if (cp > 0xFF && skip && (cp % 37 != 0)) {
\r
71 String cpString = UTF16.valueOf(cp);
\r
72 String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
\r
73 String pattern = null;
\r
74 final String rawPattern = "[" + s + s + "]";
\r
76 pattern = UnicodeRegex.fix(rawPattern);
\r
77 } catch (Exception e) {
\r
78 errln(e.getMessage());
\r
81 final String expected = "[" + s + "]";
\r
82 assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
\r
84 // verify that we can create a regex pattern and use as expected
\r
85 String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
\r
86 checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);
\r
88 // verify that the Pattern.compile works
\r
89 checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);
\r
/**
 * Check all integer Unicode properties to make sure they work.
 */
\r
96 public void TestUnicodeProperties() {
\r
97 final boolean skip = getInclusion() < 10;
\r
98 UnicodeSet temp = new UnicodeSet();
\r
99 for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
\r
100 if (skip && (propNum % 5 != 0)) {
\r
103 String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
\r
104 final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
\r
105 int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
\r
106 if (skip) { // only test first if not exhaustive
\r
107 intPropertyMaxValue = intPropertyMinValue;
\r
109 for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
\r
110 // hack for getting property value name
\r
111 String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
\r
112 if (valueName == null) {
\r
113 valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
\r
114 if (valueName == null) {
\r
115 valueName = Integer.toString(valueNum);
\r
118 temp.applyIntPropertyValue(propNum, valueNum);
\r
119 if (temp.size() == 0) {
\r
122 final String prefix = "a";
\r
123 final String suffix = "b";
\r
124 String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
\r
126 String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
\r
128 // posix style pattern
\r
129 String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
\r
130 String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
\r
131 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
\r
132 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
\r
134 // perl style pattern
\r
135 rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;
\r
136 rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;
\r
137 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
\r
138 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
\r
143 public void TestBnf() {
\r
144 UnicodeRegex regex = new UnicodeRegex();
\r
145 final String[][] tests = {
\r
158 "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +
\r
159 "scheme = reserved+;\n" +
\r
160 "host = // reserved+;\n" +
\r
161 "query = [\\=reserved]+;\n" +
\r
162 "fragment = reserved+;\n" +
\r
163 "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",
\r
164 "http://\u03B1\u03B2\u03B3?huh=hi#there"},
\r
169 for (int i = 0; i < tests.length; ++i) {
\r
170 String test = tests[i][0];
\r
171 final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception");
\r
174 if (test.endsWith(".txt")) {
\r
175 java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);
\r
176 List lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8");
\r
177 result = regex.compileBnf(lines);
\r
179 result = regex.compileBnf(test);
\r
181 if (expectException) {
\r
182 errln("Expected exception for " + test);
\r
185 result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff
\r
186 String resolved = regex.transform(result);
\r
188 Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");
\r
189 String checks = "";
\r
190 for (int j = 1; j < tests[i].length; ++j) {
\r
191 String check = tests[i][j];
\r
192 if (!m.reset(check).matches()) {
\r
193 checks = checks + "Fails " + check + "\n";
\r
195 for (int k = 1; k <= m.groupCount(); ++k) {
\r
196 checks += "(" + m.group(k) + ")";
\r
201 logln("Result: " + result + "\n" + checks + "\n" + test);
\r
202 } catch (Exception e) {
\r
203 if (!expectException) {
\r
204 errln(e.getClass().getName() + ": " + e.getMessage());
\r
/**
 * Utility for checking patterns
 */
\r
214 private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {
\r
215 Matcher matcher = pat.matcher(shouldMatch);
\r
216 assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());
\r
217 matcher.reset(shouldNotMatch);
\r
218 assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());
\r