]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/tests/translit/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / tests / translit / src / com / ibm / icu / dev / test / translit / RegexUtilitiesTest.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 2009, International Business Machines Corporation and         *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 package com.ibm.icu.dev.test.translit;\r
8 \r
9 import java.util.ArrayList;\r
10 import java.util.List;\r
11 import java.util.regex.Matcher;\r
12 import java.util.regex.Pattern;\r
13 \r
14 import com.ibm.icu.dev.test.TestFmwk;\r
15 import com.ibm.icu.impl.UnicodeRegex;\r
16 import com.ibm.icu.lang.UCharacter;\r
17 import com.ibm.icu.lang.UProperty;\r
18 import com.ibm.icu.lang.UProperty.NameChoice;\r
19 import com.ibm.icu.text.Transliterator;\r
20 import com.ibm.icu.text.UTF16;\r
21 import com.ibm.icu.text.UnicodeSet;\r
22 \r
23 /**\r
24  * @author markdavis\r
25  */\r
26 public class RegexUtilitiesTest extends TestFmwk {\r
27 \r
28     public static void main(String[] args) throws Exception {\r
29         new RegexUtilitiesTest().run(args);\r
30     }\r
31 \r
32     /**\r
33      * Check basic construction.\r
34      */\r
35     public void TestConstruction() {\r
36         String[][] tests = {\r
37                 {"a"},\r
38                 {"a[a-z]b"},\r
39                 {"[ba-z]", "[a-z]"},\r
40                 {"q[ba-z]", "q[a-z]"},\r
41                 {"[ba-z]q", "[a-z]q"},\r
42                 {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},\r
43                 {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},\r
44                 {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},\r
45                 {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},\r
46         };\r
47         for (int i = 0; i < tests.length; ++i) {\r
48             final String source = tests[i][0];\r
49             String expected = tests[i].length == 1 ? source : tests[i][1];\r
50             String actual = UnicodeRegex.fix(source);\r
51             assertEquals(source, expected, actual);\r
52         } \r
53     }\r
54 \r
55     Transliterator hex = Transliterator.getInstance("hex");\r
56 \r
57     /**\r
58      * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each\r
59      * character works.\r
60      */\r
61     public void TestCharacters() {\r
62         UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");\r
63         boolean skip = getInclusion() < 10;\r
64         for (int cp = 0; cp < 0x110000; ++cp) {\r
65             if (cp > 0xFF && skip && (cp % 37 != 0)) {\r
66                 continue;\r
67             }\r
68             String cpString = UTF16.valueOf(cp);\r
69             String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;\r
70             String pattern = null;\r
71             final String rawPattern = "[" + s + s + "]";\r
72             try {\r
73                 pattern = UnicodeRegex.fix(rawPattern);\r
74             } catch (Exception e) {\r
75                 errln(e.getMessage());\r
76                 continue;\r
77             }\r
78             final String expected = "[" + s + "]";\r
79             assertEquals("Doubled character works" + hex.transform(s), expected, pattern);\r
80 \r
81             // verify that we can create a regex pattern and use as expected\r
82             String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);\r
83             checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);\r
84 \r
85             // verify that the Pattern.compile works\r
86             checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);\r
87         }\r
88     }\r
89 \r
90     /**\r
91      * Check all integer Unicode properties to make sure they work.\r
92      */\r
93     public void TestUnicodeProperties() {\r
94         final boolean skip = getInclusion() < 10;\r
95         UnicodeSet temp = new UnicodeSet();\r
96         for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {\r
97             if (skip && (propNum % 5 != 0)) {\r
98                 continue;\r
99             }\r
100             String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);\r
101             final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);\r
102             int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);\r
103             if (skip) { // only test first if not exhaustive\r
104                 intPropertyMaxValue = intPropertyMinValue;\r
105             }\r
106             for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {\r
107                 // hack for getting property value name\r
108                 String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);\r
109                 if (valueName == null) {\r
110                     valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);\r
111                     if (valueName == null) {\r
112                         valueName = Integer.toString(valueNum);\r
113                     }\r
114                 }\r
115                 temp.applyIntPropertyValue(propNum, valueNum);\r
116                 if (temp.size() == 0) {\r
117                     continue;\r
118                 }\r
119                 final String prefix = "a";\r
120                 final String suffix = "b";\r
121                 String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;\r
122                 temp.complement();\r
123                 String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;\r
124 \r
125                 // posix style pattern\r
126                 String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;\r
127                 String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;\r
128                 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);\r
129                 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);\r
130 \r
131                 // perl style pattern\r
132                 rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;\r
133                 rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;\r
134                 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);\r
135                 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);\r
136             }\r
137         }\r
138     }\r
139 \r
140     public void TestBnf() {\r
141         UnicodeRegex regex = new UnicodeRegex();\r
142         final String[][] tests = {\r
143                 {\r
144                     "c = a wq;\n" +\r
145                     "a = xyz;\n" +\r
146                     "b = a a c;\n"\r
147                 },\r
148                 {\r
149                     "c = a b;\n" +\r
150                     "a = xyz;\n" +\r
151                     "b = a a c;\n",\r
152                     "Exception"\r
153                 },\r
154                 {\r
155                     "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +\r
156                     "scheme = reserved+;\n" +\r
157                     "host = // reserved+;\n" +\r
158                     "query = [\\=reserved]+;\n" +\r
159                     "fragment = reserved+;\n" +\r
160                     "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",\r
161                 "http://\u03B1\u03B2\u03B3?huh=hi#there"},\r
162                 {\r
163                     "langtagRegex.txt"\r
164                 }\r
165         };\r
166         for (int i = 0; i < tests.length; ++i) {\r
167             String test = tests[i][0];\r
168             final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception");\r
169             try {\r
170                 String result;\r
171                 if (test.endsWith(".txt")) {\r
172                     java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);\r
173                     List lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8");\r
174                     result = regex.compileBnf(lines);\r
175                 } else {\r
176                     result = regex.compileBnf(test);\r
177                 }\r
178                 if (expectException) {\r
179                     errln("Expected exception for " + test);\r
180                     continue;\r
181                 }\r
182                 result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff\r
183                 String resolved = regex.transform(result);\r
184                 logln(resolved);\r
185                 Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");\r
186                 String checks = "";\r
187                 for (int j = 1; j < tests[i].length; ++j) {\r
188                     String check = tests[i][j];\r
189                     if (!m.reset(check).matches()) {\r
190                         checks = checks + "Fails " + check + "\n";\r
191                     } else {\r
192                         for (int k = 1; k <= m.groupCount(); ++k) {\r
193                             checks += "(" + m.group(k) + ")";\r
194                         }\r
195                         checks += "\n";\r
196                     }\r
197                 }\r
198                 logln("Result: " + result + "\n" + checks + "\n" + test);\r
199             } catch (Exception e) {\r
200                 if (!expectException) {\r
201                     errln(e.getClass().getName() + ": " + e.getMessage());\r
202                 }\r
203                 continue;\r
204             }\r
205         }\r
206     }\r
207 \r
208     /**\r
209      * Utility for checking patterns\r
210      */\r
211     private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {\r
212         Matcher matcher = pat.matcher(shouldMatch);\r
213         assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());\r
214         matcher.reset(shouldNotMatch);\r
215         assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());\r
216     }\r
217 }\r