/* ******************************************************************************* * Copyright (C) 2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.translit; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.UnicodeRegex; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UProperty.NameChoice; import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; /** * @author markdavis */ public class RegexUtilitiesTest extends TestFmwk { public static void main(String[] args) throws Exception { new RegexUtilitiesTest().run(args); } /** * Check basic construction. */ public void TestConstruction() { String[][] tests = { {"a"}, {"a[a-z]b"}, {"[ba-z]", "[a-z]"}, {"q[ba-z]", "q[a-z]"}, {"[ba-z]q", "[a-z]q"}, {"a\\p{joincontrol}b", "a[\u200C\u200D]b"}, {"a\\P{joincontrol}b", "a[^\u200C\u200D]b"}, {"a[[:whitespace:]&[:Zl:]]b", "a[\\\u2028]b"}, {"a [[:bc=cs:]&[:wspace:]] b", "a [\u00A0\u202F] b"}, }; for (int i = 0; i < tests.length; ++i) { final String source = tests[i][0]; String expected = tests[i].length == 1 ? source : tests[i][1]; String actual = UnicodeRegex.fix(source); assertEquals(source, expected, actual); } } Transliterator hex = Transliterator.getInstance("hex"); /** * Perform an exhaustive test on all Unicode characters to make sure that the UnicodeSet with each * character works. */ public void TestCharacters() { UnicodeSet requiresQuote = new UnicodeSet("[\\$\\&\\-\\:\\[\\\\\\]\\^\\{\\}[:pattern_whitespace:]]"); boolean skip = getInclusion() < 10; for (int cp = 0; cp < 0x110000; ++cp) { if (cp > 0xFF && skip && (cp % 37 != 0)) { continue; } String cpString = UTF16.valueOf(cp); String s = requiresQuote.contains(cp) ? "\\" + cpString : cpString; String pattern = null; final String rawPattern = "[" + s + s + "]"; try { pattern = UnicodeRegex.fix(rawPattern); } catch (Exception e) { errln(e.getMessage()); continue; } final String expected = "[" + s + "]"; assertEquals("Doubled character works" + hex.transform(s), expected, pattern); // verify that we can create a regex pattern and use as expected String shouldNotMatch = UTF16.valueOf((cp + 1) % 0x110000); checkCharPattern(Pattern.compile(pattern), pattern, cpString, shouldNotMatch); // verify that the Pattern.compile works checkCharPattern(UnicodeRegex.compile(rawPattern), pattern, cpString, shouldNotMatch); } } /** * Check all integer Unicode properties to make sure they work. */ public void TestUnicodeProperties() { final boolean skip = getInclusion() < 10; UnicodeSet temp = new UnicodeSet(); for (int propNum = UProperty.INT_START; propNum < UProperty.INT_LIMIT; ++propNum) { if (skip && (propNum % 5 != 0)) { continue; } String propName = UCharacter.getPropertyName(propNum, NameChoice.LONG); final int intPropertyMinValue = UCharacter.getIntPropertyMinValue(propNum); int intPropertyMaxValue = UCharacter.getIntPropertyMaxValue(propNum); if (skip) { // only test first if not exhaustive intPropertyMaxValue = intPropertyMinValue; } for (int valueNum = intPropertyMinValue; valueNum <= intPropertyMaxValue; ++valueNum) { // hack for getting property value name String valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.LONG); if (valueName == null) { valueName = UCharacter.getPropertyValueName(propNum, valueNum, NameChoice.SHORT); if (valueName == null) { valueName = Integer.toString(valueNum); } } temp.applyIntPropertyValue(propNum, valueNum); if (temp.size() == 0) { continue; } final String prefix = "a"; final String suffix = "b"; String shouldMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix; temp.complement(); String shouldNotMatch = prefix + UTF16.valueOf(temp.charAt(0)) + suffix; // posix style pattern String rawPattern = prefix + "[:" + propName + "=" + valueName + ":]" + suffix; String rawNegativePattern = prefix + "[:^" + propName + "=" + valueName + ":]" + suffix; checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch); checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch); // perl style pattern rawPattern = prefix + "\\p{" + propName + "=" + valueName + "}" + suffix; rawNegativePattern = prefix + "\\P{" + propName + "=" + valueName + "}" + suffix; checkCharPattern(UnicodeRegex.compile(rawPattern), rawPattern, shouldMatch, shouldNotMatch); checkCharPattern(UnicodeRegex.compile(rawNegativePattern), rawNegativePattern, shouldNotMatch, shouldMatch); } } } public void TestBnf() { UnicodeRegex regex = new UnicodeRegex(); final String[][] tests = { { "c = a wq;\n" + "a = xyz;\n" + "b = a a c;\n" }, { "c = a b;\n" + "a = xyz;\n" + "b = a a c;\n", "Exception" }, { "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" + "scheme = reserved+;\n" + "host = // reserved+;\n" + "query = [\\=reserved]+;\n" + "fragment = reserved+;\n" + "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n", "http://\u03B1\u03B2\u03B3?huh=hi#there"}, { "langtagRegex.txt" } }; for (int i = 0; i < tests.length; ++i) { String test = tests[i][0]; final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception"); try { String result; if (test.endsWith(".txt")) { java.io.InputStream is = RegexUtilitiesTest.class.getResourceAsStream(test); List lines = UnicodeRegex.appendLines(new ArrayList(), is, "UTF-8"); result = regex.compileBnf(lines); } else { result = regex.compileBnf(test); } if (expectException) { errln("Expected exception for " + test); continue; } result = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff String resolved = regex.transform(result); logln(resolved); Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher(""); String checks = ""; for (int j = 1; j < tests[i].length; ++j) { String check = tests[i][j]; if (!m.reset(check).matches()) { checks = checks + "Fails " + check + "\n"; } else { for (int k = 1; k <= m.groupCount(); ++k) { checks += "(" + m.group(k) + ")"; } checks += "\n"; } } logln("Result: " + result + "\n" + checks + "\n" + test); } catch (Exception e) { if (!expectException) { errln(e.getClass().getName() + ": " + e.getMessage()); } continue; } } } /** * Utility for checking patterns */ private void checkCharPattern(Pattern pat, String matchTitle, String shouldMatch, String shouldNotMatch) { Matcher matcher = pat.matcher(shouldMatch); assertTrue(matchTitle + " and " + shouldMatch, matcher.matches()); matcher.reset(shouldNotMatch); assertFalse(matchTitle + " and " + shouldNotMatch, matcher.matches()); } }