gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_8_1_1/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
Added flags.
[Dictionary.git] / jars / icu4j-4_8_1_1 / main / tests / translit / src / com / ibm / icu / dev / test / translit / RegexUtilitiesTest.java
1 /*
2  *******************************************************************************
3  * Copyright (C) 2009, International Business Machines Corporation and         *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.dev.test.translit;
8
9 import java.util.ArrayList;
10 import java.util.List;
11 import java.util.regex.Matcher;
12 import java.util.regex.Pattern;
13
14 import com.ibm.icu.dev.test.TestFmwk;
15 import com.ibm.icu.impl.UnicodeRegex;
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.lang.UProperty;
18 import com.ibm.icu.lang.UProperty.NameChoice;
19 import com.ibm.icu.text.Transliterator;
20 import com.ibm.icu.text.UTF16;
21 import com.ibm.icu.text.UnicodeSet;
22
23 /**
24  * @author markdavis
25  */
26 public class RegexUtilitiesTest extends TestFmwk {
27
28     public static void main(String[] args) throws Exception {
29         new RegexUtilitiesTest().run(args);
30     }
31
32     /**
33      * Check basic construction.
34      */
35     public void TestConstruction() {
36         String[][] tests = {
37                 {"a"},
38                 {"a[a-z]b"},
39                 {"[ba-z]", "[a-z]"},
40                 {"q[ba-z]", "q[a-z]"},
41                 {"[ba-z]q", "[a-z]q"},
42                 {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},
43                 {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},
44                 {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},
45                 {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},
46         };
47         for (int i = 0; i < tests.length; ++i) {
48             final String source = tests[i][0];
49             String expected = tests[i].length == 1 ? source : tests[i][1];
50             String actual = UnicodeRegex.fix(source);
51             assertEquals(source, expected, actual);
52         } 
53     }
54
55     Transliterator hex = Transliterator.getInstance("hex");
56
57     /**
58      * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
59      * character works.
60      */
61     public void TestCharacters() {
62         UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
63         boolean skip = getInclusion() < 10;
64         for (int cp = 0; cp < 0x110000; ++cp) {
65             if (cp > 0xFF && skip && (cp % 37 != 0)) {
66                 continue;
67             }
68             String cpString = UTF16.valueOf(cp);
69             String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
70             String pattern = null;
71             final String rawPattern = "[" + s + s + "]";
72             try {
73                 pattern = UnicodeRegex.fix(rawPattern);
74             } catch (Exception e) {
75                 errln(e.getMessage());
76                 continue;
77             }
78             final String expected = "[" + s + "]";
79             assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
80
81             // verify that we can create a regex pattern and use as expected
82             String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
83             checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);
84
85             // verify that the Pattern.compile works
86             checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);
87         }
88     }
89
90     /**
91      * Check all integer Unicode properties to make sure they work.
92      */
93     public void TestUnicodeProperties() {
94         final boolean skip = getInclusion() < 10;
95         UnicodeSet temp = new UnicodeSet();
96         for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
97             if (skip && (propNum % 5 != 0)) {
98                 continue;
99             }
100             String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
101             final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
102             int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
103             if (skip) { // only test first if not exhaustive
104                 intPropertyMaxValue = intPropertyMinValue;
105             }
106             for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
107                 // hack for getting property value name
108                 String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
109                 if (valueName == null) {
110                     valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
111                     if (valueName == null) {
112                         valueName = Integer.toString(valueNum);
113                     }
114                 }
115                 temp.applyIntPropertyValue(propNum, valueNum);
116                 if (temp.size() == 0) {
117                     continue;
118                 }
119                 final String prefix = "a";
120                 final String suffix = "b";
121                 String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
122                 temp.complement();
123                 String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
124
125                 // posix style pattern
126                 String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
127                 String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
128                 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
129                 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
130
131                 // perl style pattern
132                 rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;
133                 rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;
134                 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
135                 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
136             }
137         }
138     }
139
140     public void TestBnf() {
141         UnicodeRegex regex = new UnicodeRegex();
142         final String[][] tests = {
143                 {
144                     "c = a wq;\n" +
145                     "a = xyz;\n" +
146                     "b = a a c;\n"
147                 },
148                 {
149                     "c = a b;\n" +
150                     "a = xyz;\n" +
151                     "b = a a c;\n",
152                     "Exception"
153                 },
154                 {
155                     "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +
156                     "scheme = reserved+;\n" +
157                     "host = // reserved+;\n" +
158                     "query = [\\=reserved]+;\n" +
159                     "fragment = reserved+;\n" +
160                     "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",
161                 "http://\u03B1\u03B2\u03B3?huh=hi#there"},
162                 {
163                     "langtagRegex.txt"
164                 }
165         };
166         for (int i = 0; i < tests.length; ++i) {
167             String test = tests[i][0];
168             final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception");
169             try {
170                 String result;
171                 if (test.endsWith(".txt")) {
172                     java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);
173                     List lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8");
174                     result = regex.compileBnf(lines);
175                 } else {
176                     result = regex.compileBnf(test);
177                 }
178                 if (expectException) {
179                     errln("Expected exception for " + test);
180                     continue;
181                 }
182                 result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff
183                 String resolved = regex.transform(result);
184                 logln(resolved);
185                 Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");
186                 String checks = "";
187                 for (int j = 1; j < tests[i].length; ++j) {
188                     String check = tests[i][j];
189                     if (!m.reset(check).matches()) {
190                         checks = checks + "Fails " + check + "\n";
191                     } else {
192                         for (int k = 1; k <= m.groupCount(); ++k) {
193                             checks += "(" + m.group(k) + ")";
194                         }
195                         checks += "\n";
196                     }
197                 }
198                 logln("Result: " + result + "\n" + checks + "\n" + test);
199             } catch (Exception e) {
200                 if (!expectException) {
201                     errln(e.getClass().getName() + ": " + e.getMessage());
202                 }
203                 continue;
204             }
205         }
206     }
207
208     /**
209      * Utility for checking patterns
210      */
211     private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {
212         Matcher matcher = pat.matcher(shouldMatch);
213         assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());
214         matcher.reset(shouldNotMatch);
215         assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());
216     }
217 }