]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/translit/RegexUtilitiesTest.java
go
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / dev / test / translit / RegexUtilitiesTest.java
1 //##header J2SE15
2 //#if defined(FOUNDATION10) || defined(J2SE13) || defined(J2SE14)
3 //#else
4 /*
5  *******************************************************************************
6  * Copyright (C) 2009, International Business Machines Corporation and         *
7  * others. All Rights Reserved.                                                *
8  *******************************************************************************
9  */
10 package com.ibm.icu.dev.test.translit;
11
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16
17 import com.ibm.icu.dev.test.TestFmwk;
18 import com.ibm.icu.impl.UnicodeRegex;
19 import com.ibm.icu.lang.UCharacter;
20 import com.ibm.icu.lang.UProperty;
21 import com.ibm.icu.lang.UProperty.NameChoice;
22 import com.ibm.icu.text.Transliterator;
23 import com.ibm.icu.text.UTF16;
24 import com.ibm.icu.text.UnicodeSet;
25
26 /**
27  * @author markdavis
28  */
29 public class RegexUtilitiesTest extends TestFmwk {
30
31     public static void main(String[] args) throws Exception {
32         new RegexUtilitiesTest().run(args);
33     }
34
35     /**
36      * Check basic construction.
37      */
38     public void TestConstruction() {
39         String[][] tests = {
40                 {"a"},
41                 {"a[a-z]b"},
42                 {"[ba-z]", "[a-z]"},
43                 {"q[ba-z]", "q[a-z]"},
44                 {"[ba-z]q", "[a-z]q"},
45                 {"a\\p{joincontrol}b", "a[\u200C\u200D]b"},
46                 {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"},
47                 {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"},
48                 {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"},
49         };
50         for (int i = 0; i < tests.length; ++i) {
51             final String source = tests[i][0];
52             String expected = tests[i].length == 1 ? source : tests[i][1];
53             String actual = UnicodeRegex.fix(source);
54             assertEquals(source, expected, actual);
55         } 
56     }
57
58     Transliterator hex = Transliterator.getInstance("hex");
59
60     /**
61      * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each
62      * character works.
63      */
64     public void TestCharacters() {
65         UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]");
66         boolean skip = getInclusion() < 10;
67         for (int cp = 0; cp < 0x110000; ++cp) {
68             if (cp > 0xFF && skip && (cp % 37 != 0)) {
69                 continue;
70             }
71             String cpString = UTF16.valueOf(cp);
72             String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString;
73             String pattern = null;
74             final String rawPattern = "[" + s + s + "]";
75             try {
76                 pattern = UnicodeRegex.fix(rawPattern);
77             } catch (Exception e) {
78                 errln(e.getMessage());
79                 continue;
80             }
81             final String expected = "[" + s + "]";
82             assertEquals("Doubled character works" + hex.transform(s), expected, pattern);
83
84             // verify that we can create a regex pattern and use as expected
85             String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000);
86             checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch);
87
88             // verify that the Pattern.compile works
89             checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch);
90         }
91     }
92
93     /**
94      * Check all integer Unicode properties to make sure they work.
95      */
96     public void TestUnicodeProperties() {
97         final boolean skip = getInclusion() < 10;
98         UnicodeSet temp = new UnicodeSet();
99         for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) {
100             if (skip && (propNum % 5 != 0)) {
101                 continue;
102             }
103             String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG);
104             final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum);
105             int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum);
106             if (skip) { // only test first if not exhaustive
107                 intPropertyMaxValue = intPropertyMinValue;
108             }
109             for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) {
110                 // hack for getting property value name
111                 String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG);
112                 if (valueName == null) {
113                     valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT);
114                     if (valueName == null) {
115                         valueName = Integer.toString(valueNum);
116                     }
117                 }
118                 temp.applyIntPropertyValue(propNum, valueNum);
119                 if (temp.size() == 0) {
120                     continue;
121                 }
122                 final String prefix = "a";
123                 final String suffix = "b";
124                 String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
125                 temp.complement();
126                 String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix;
127
128                 // posix style pattern
129                 String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix;
130                 String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix;
131                 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
132                 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
133
134                 // perl style pattern
135                 rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix;
136                 rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix;
137                 checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch);
138                 checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch);
139             }
140         }
141     }
142
143     public void TestBnf() {
144         UnicodeRegex regex = new UnicodeRegex();
145         final String[][] tests = {
146                 {
147                     "c = a wq;\n" +
148                     "a = xyz;\n" +
149                     "b = a a c;\n"
150                 },
151                 {
152                     "c = a b;\n" +
153                     "a = xyz;\n" +
154                     "b = a a c;\n",
155                     "Exception"
156                 },
157                 {
158                     "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" +
159                     "scheme = reserved+;\n" +
160                     "host = // reserved+;\n" +
161                     "query = [\\=reserved]+;\n" +
162                     "fragment = reserved+;\n" +
163                     "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n",
164                 "http://\u03B1\u03B2\u03B3?huh=hi#there"},
165                 {
166                     "langtagRegex.txt"
167                 }
168         };
169         for (int i = 0; i < tests.length; ++i) {
170             String test = tests[i][0];
171             final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception");
172             try {
173                 String result;
174                 if (test.endsWith(".txt")) {
175                     java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test);
176                     List lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8");
177                     result = regex.compileBnf(lines);
178                 } else {
179                     result = regex.compileBnf(test);
180                 }
181                 if (expectException) {
182                     errln("Expected exception for " + test);
183                     continue;
184                 }
185                 result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff
186                 String resolved = regex.transform(result);
187                 logln(resolved);
188                 Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher("");
189                 String checks = "";
190                 for (int j = 1; j < tests[i].length; ++j) {
191                     String check = tests[i][j];
192                     if (!m.reset(check).matches()) {
193                         checks = checks + "Fails " + check + "\n";
194                     } else {
195                         for (int k = 1; k <= m.groupCount(); ++k) {
196                             checks += "(" + m.group(k) + ")";
197                         }
198                         checks += "\n";
199                     }
200                 }
201                 logln("Result: " + result + "\n" + checks + "\n" + test);
202             } catch (Exception e) {
203                 if (!expectException) {
204                     errln(e.getClass().getName() + ": " + e.getMessage());
205                 }
206                 continue;
207             }
208         }
209     }
210
211     /**
212      * Utility for checking patterns
213      */
214     private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) {
215         Matcher matcher = pat.matcher(shouldMatch);
216         assertTrue(matchTitle + " and " + shouldMatch, matcher.matches());
217         matcher.reset(shouldNotMatch);
218         assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches());
219     }
220 }
221 //#endif