/* ******************************************************************************* * Copyright (C) 1996-2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.lang; import java.text.NumberFormat; import java.text.ParsePosition; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.SortedSetRelation; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; import com.ibm.icu.text.SymbolTable; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeMatcher; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.icu.text.UnicodeSet.ComparisonStyle; /** * @test * @summary General test of UnicodeSet */ public class UnicodeSetTest extends TestFmwk { static final String NOT = "%%%%"; public static void main(String[] args) throws Exception { new UnicodeSetTest().run(args); } private static final boolean isCccValue(int ccc) { switch (ccc) { case 0: case 1: case 7: case 8: case 9: case 200: case 202: case 216: case 218: case 220: case 222: case 224: case 226: case 228: case 230: case 232: case 233: case 234: case 240: return true; default: return false; } } public void TestPropertyAccess() { int count = 0; // test to see that all of the names work for (int propNum = UProperty.BINARY_START; propNum < UProperty.INT_LIMIT; ++propNum) { count++; //Skipping tests in the non-exhaustive mode to shorten the test time ticket#6475 
if(getInclusion()<=5 && count%5!=0){ continue; } if (propNum >= UProperty.BINARY_LIMIT && propNum < UProperty.INT_START) { // skip the gap propNum = UProperty.INT_START; } for (int nameChoice = UProperty.NameChoice.SHORT; nameChoice <= UProperty.NameChoice.LONG; ++nameChoice) { String propName; try { propName = UCharacter.getPropertyName(propNum, nameChoice); if (propName == null) { if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names throw new NullPointerException(); } } catch (RuntimeException e1) { errln("Can't get property name for: " + "Property (" + propNum + ")" + ", NameChoice: " + nameChoice + ", " + e1.getClass().getName()); continue; } logln("Property (" + propNum + "): " + propName); for (int valueNum = UCharacter.getIntPropertyMinValue(propNum); valueNum <= UCharacter.getIntPropertyMaxValue(propNum); ++valueNum) { String valueName; try { valueName = UCharacter.getPropertyValueName(propNum, valueNum, nameChoice); if (valueName == null) { if (nameChoice == UProperty.NameChoice.SHORT) continue; // allow non-existent short names if ((propNum == UProperty.CANONICAL_COMBINING_CLASS || propNum == UProperty.LEAD_CANONICAL_COMBINING_CLASS || propNum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS) && !isCccValue(valueNum)) { // Only a few of the canonical combining classes have names. // Otherwise they are just integer values. 
continue; } else { throw new NullPointerException(); } } } catch (RuntimeException e1) { errln("Can't get property value name for: " + "Property (" + propNum + "): " + propName + ", " + "Value (" + valueNum + ") " + ", NameChoice: " + nameChoice + ", " + e1.getClass().getName()); continue; } logln("Value (" + valueNum + "): " + valueName); UnicodeSet testSet; try { testSet = new UnicodeSet("[:" + propName + "=" + valueName + ":]"); } catch (RuntimeException e) { errln("Can't create UnicodeSet for: " + "Property (" + propNum + "): " + propName + ", " + "Value (" + valueNum + "): " + valueName + ", " + e.getClass().getName()); continue; } UnicodeSet collectedErrors = new UnicodeSet(); for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) { int value = UCharacter.getIntPropertyValue(it.codepoint, propNum); if (value != valueNum) { collectedErrors.add(it.codepoint); } } if (collectedErrors.size() != 0) { errln("Property Value Differs: " + "Property (" + propNum + "): " + propName + ", " + "Value (" + valueNum + "): " + valueName + ", " + "Differing values: " + collectedErrors.toPattern(true)); } } } } } /** * Test toPattern(). */ public void TestToPattern() throws Exception { // Test that toPattern() round trips with syntax characters // and whitespace. for (int i = 0; i < OTHER_TOPATTERN_TESTS.length; ++i) { checkPat(OTHER_TOPATTERN_TESTS[i], new UnicodeSet(OTHER_TOPATTERN_TESTS[i])); } for (int i = 0; i <= 0x10FFFF; ++i) { if ((i <= 0xFF && !UCharacter.isLetter(i)) || UCharacter.isWhitespace(i)) { // check various combinations to make sure they all work. if (i != 0 && !toPatternAux(i, i)) continue; if (!toPatternAux(0, i)) continue; if (!toPatternAux(i, 0xFFFF)) continue; } } // Test pattern behavior of multicharacter strings. 
UnicodeSet s = new UnicodeSet("[a-z {aa} {ab}]"); expectToPattern(s, "[a-z{aa}{ab}]", new String[] {"aa", "ab", NOT, "ac"}); s.add("ac"); expectToPattern(s, "[a-z{aa}{ab}{ac}]", new String[] {"aa", "ab", "ac", NOT, "xy"}); s.applyPattern("[a-z {\\{l} {r\\}}]"); expectToPattern(s, "[a-z{r\\}}{\\{l}]", new String[] {"{l", "r}", NOT, "xy"}); s.add("[]"); expectToPattern(s, "[a-z{\\[\\]}{r\\}}{\\{l}]", new String[] {"{l", "r}", "[]", NOT, "xy"}); s.applyPattern("[a-z {\u4E01\u4E02}{\\n\\r}]"); expectToPattern(s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", new String[] {"\u4E01\u4E02", "\n\r"}); s.clear(); s.add("abc"); s.add("abc"); expectToPattern(s, "[{abc}]", new String[] {"abc", NOT, "ab"}); // JB#3400: For 2 character ranges prefer [ab] to [a-b] s.clear(); s.add('a', 'b'); expectToPattern(s, "[ab]", null); // Cover applyPattern, applyPropertyAlias s.clear(); s.applyPattern("[ab ]", true); expectToPattern(s, "[ab]", new String[] {"a", NOT, "ab", " "}); s.clear(); s.applyPattern("[ab ]", false); expectToPattern(s, "[\\ ab]", new String[] {"a", "\u0020", NOT, "ab"}); s.clear(); s.applyPropertyAlias("nv", "0.5"); expectToPattern(s, "[\\u00BD\\u0D74\\u0F2A\\u2CFD\\uA831\\U00010141\\U00010175\\U00010176\\U00010E7B]", null); // Unicode 5.1 adds Malayalam 1/2 (\u0D74) // Unicode 5.2 adds U+A831 NORTH INDIC FRACTION ONE HALF and U+10E7B RUMI FRACTION ONE HALF s.clear(); s.applyPropertyAlias("gc", "Lu"); // TODO expectToPattern(s, what?) 
// RemoveAllStrings() s.clear(); s.applyPattern("[a-z{abc}{def}]"); expectToPattern(s, "[a-z{abc}{def}]", null); s.removeAllStrings(); expectToPattern(s, "[a-z]", null); } static String[] OTHER_TOPATTERN_TESTS = { "[[:latin:]&[:greek:]]", "[[:latin:]-[:greek:]]", "[:nonspacing mark:]" }; public boolean toPatternAux(int start, int end) { // use Integer.toString because Utility.hex doesn't handle ints String source = "0x" + Integer.toString(start,16).toUpperCase(); if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase(); UnicodeSet testSet = new UnicodeSet(); testSet.add(start, end); return checkPat(source, testSet); } boolean checkPat (String source, UnicodeSet testSet) { String pat = ""; try { // What we want to make sure of is that a pattern generated // by toPattern(), with or without escaped unprintables, can // be passed back into the UnicodeSet constructor. String pat0 = testSet.toPattern(true); if (!checkPat(source + " (escaped)", testSet, pat0)) return false; //String pat1 = unescapeLeniently(pat0); //if (!checkPat(source + " (in code)", testSet, pat1)) return false; String pat2 = testSet.toPattern(false); if (!checkPat(source, testSet, pat2)) return false; //String pat3 = unescapeLeniently(pat2); //if (!checkPat(source + " (in code)", testSet, pat3)) return false; //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3); logln(source + " => " + pat0 + ", " + pat2); } catch (Exception e) { errln("EXCEPTION in toPattern: " + source + " => " + pat); return false; } return true; } boolean checkPat (String source, UnicodeSet testSet, String pat) { UnicodeSet testSet2 = new UnicodeSet(pat); if (!testSet2.equals(testSet)) { errln("Fail toPattern: " + source + "; " + pat + " => " + testSet2.toPattern(false) + ", expected " + testSet.toPattern(false)); return false; } return true; } // NOTE: copied the following from Utility. 
There ought to be a version in there with a flag // that does the Java stuff public static int unescapeAt(String s, int[] offset16) { int c; int result = 0; int n = 0; int minDig = 0; int maxDig = 0; int bitsPerDigit = 4; int dig; int i; /* Check that offset is in range */ int offset = offset16[0]; int length = s.length(); if (offset < 0 || offset >= length) { return -1; } /* Fetch first UChar after '\\' */ c = UTF16.charAt(s, offset); offset += UTF16.getCharCount(c); /* Convert hexadecimal and octal escapes */ switch (c) { case 'u': minDig = maxDig = 4; break; /* case 'U': minDig = maxDig = 8; break; case 'x': minDig = 1; maxDig = 2; break; */ default: dig = UCharacter.digit(c, 8); if (dig >= 0) { minDig = 1; maxDig = 3; n = 1; /* Already have first octal digit */ bitsPerDigit = 3; result = dig; } break; } if (minDig != 0) { while (offset < length && n < maxDig) { // TEMPORARY // TODO: Restore the char32-based code when UCharacter.digit // is working (Bug 66). //c = UTF16.charAt(s, offset); //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); c = s.charAt(offset); dig = Character.digit((char)c, (bitsPerDigit == 3) ? 
8 : 16); if (dig < 0) { break; } result = (result << bitsPerDigit) | dig; //offset += UTF16.getCharCount(c); ++offset; ++n; } if (n < minDig) { return -1; } offset16[0] = offset; return result; } /* Convert C-style escapes in table */ for (i=0; i 0 && (filter & SortedSetRelation.A_AND_B) == 0) return false; // A - B size == A.size - A&B.size if (A.size() > ab.size() && (filter & SortedSetRelation.A_NOT_B) == 0) return false; // B - A size == B.size - A&B.size if (B.size() > ab.size() && (filter & SortedSetRelation.B_NOT_A) == 0) return false; return true; } void checkSetRelation(SortedSet a, SortedSet b, String message) { for (int i = 0; i < 8; ++i) { boolean hasRelation = SortedSetRelation.hasRelation(a, i, b); boolean dumbHasRelation = dumbHasRelation(a, i, b); logln(message + " " + hasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); if (hasRelation != dumbHasRelation) { errln("FAIL: " + message + " " + dumbHasRelation + ":\t" + a + "\t" + RELATION_NAME[i] + "\t" + b); } } logln(""); } /** * Test the [:Latin:] syntax. */ public void TestScriptSet() { expectContainment("[:Latin:]", "aA", CharsToUnicodeString("\\u0391\\u03B1")); expectContainment("[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA"); /* Jitterbug 1423 */ expectContainment("[[:Common:][:Inherited:]]", CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA"); } /** * Test the [:Latin:] syntax. 
*/ public void TestPropertySet() { String[] DATA = { // Pattern, Chars IN, Chars NOT in "[:Latin:]", "aA", "\u0391\u03B1", "[\\p{Greek}]", "\u0391\u03B1", "aA", "\\P{ GENERAL Category = upper case letter }", "abc", "ABC", // Combining class: @since ICU 2.2 // Check both symbolic and numeric "\\p{ccc=Nukta}", "\u0ABC", "abc", "\\p{Canonical Combining Class = 11}", "\u05B1", "\u05B2", "[:c c c = iota subscript :]", "\u0345", "xyz", // Bidi class: @since ICU 2.2 "\\p{bidiclass=lefttoright}", "abc", "\u0671\u0672", // Binary properties: @since ICU 2.2 "\\p{ideographic}", "\u4E0A", "x", "[:math=false:]", "q)*(", // )(and * were removed from math in Unicode 4.0.1 "+<>^", // JB#1767 \N{}, \p{ASCII} "[:Ascii:]", "abc\u0000\u007F", "\u0080\u4E00", "[\\N{ latin small letter a }[:name= latin small letter z:]]", "az", "qrs", // JB#2015 "[:any:]", "a\\U0010FFFF", "", "[:nv=0.5:]", "\u00BD\u0F2A", "\u00BC", // JB#2653: Age "[:Age=1.1:]", "\u03D6", // 1.1 "\u03D8\u03D9", // 3.2 "[:Age=3.1:]", "\\u1800\\u3400\\U0002f800", "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000", // JB#2350: Case_Sensitive "[:Case Sensitive:]", "A\u1FFC\\U00010410", ";\u00B4\\U00010500", // Regex compatibility test "[-b]", // leading '-' is literal "-b", "ac", "[^-b]", // leading '-' is literal "ac", "-b", "[b-]", // trailing '-' is literal "-b", "ac", "[^b-]", // trailing '-' is literal "ac", "-b", "[a-b-]", // trailing '-' is literal "ab-", "c=", "[[a-q]&[p-z]-]", // trailing '-' is literal "pq-", "or=", "[\\s|\\)|:|$|\\>]", // from regex tests "s|):$>", "\\abc", "[\uDC00cd]", // JB#2906: isolated trail at start "cd\uDC00", "ab\uD800\\U00010000", "[ab\uD800]", // JB#2906: isolated trail at start "ab\uD800", "cd\uDC00\\U00010000", "[ab\uD800cd]", // JB#2906: isolated lead in middle "abcd\uD800", "ef\uDC00\\U00010000", "[ab\uDC00cd]", // JB#2906: isolated trail in middle "abcd\uDC00", "ef\uD800\\U00010000", "[:^lccc=0:]", // Lead canonical class "\u0300\u0301", "abcd\u00c0\u00c5", "[:^tccc=0:]", 
// Trail canonical class "\u0300\u0301\u00c0\u00c5", "abcd", "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class "\u0300\u0301\u00c0\u00c5", "abcd", "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now) "", "abcd\u0300\u0301\u00c0\u00c5", "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not "\u0F73\u0F75\u0F81", "abcd\u0300\u0301\u00c0\u00c5", "[:Assigned:]", "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD", "\\u0888\\uFDD3\\uFFFE\\U00050005", }; for (int i=0; i indexOf() => " + set.indexOf(c)); } } int c = set.charAt(set.size()); if (c != -1) { errln("FAIL: charAt() = " + Utility.escape(String.valueOf(c))); } int j = set.indexOf('q'); if (j != -1) { errln("FAIL: indexOf('q') = " + j); } } public void TestContainsString() { UnicodeSet x = new UnicodeSet("[a{bc}]"); if (x.contains("abc")) errln("FAIL"); } public void TestExhaustive() { // exhaustive tests. Simulate UnicodeSets with integers. // That gives us very solid tests (except for large memory tests). char limit = (char)128; for (char i = 0; i < limit; ++i) { logln("Testing " + i + ", " + bitsToSet(i)); _testComplement(i); // AS LONG AS WE ARE HERE, check roundtrip checkRoundTrip(bitsToSet(i)); for (char j = 0; j < limit; ++j) { _testAdd(i,j); _testXor(i,j); _testRetain(i,j); _testRemove(i,j); } } } /** * Make sure each script name and abbreviated name can be used * to construct a UnicodeSet. */ public void TestScriptNames() { for (int i=0; i " + set.toPattern(false)); } catch (IllegalArgumentException e) { if (pat.length() == 0) { errln("FAIL (in UScript): No name for script " + i); } else { errln("FAIL: Couldn't create " + pat); } } } } } /** * Test closure API. 
*/ public void TestCloseOver() { String CASE = String.valueOf(UnicodeSet.CASE); String[] DATA = { // selector, input, output CASE, "[aq\u00DF{Bc}{bC}{Fi}]", "[aAqQ\u00DF\u1E9E\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1 CASE, "[\u01F1]", // 'DZ' "[\u01F1\u01F2\u01F3]", CASE, "[\u1FB4]", "[\u1FB4{\u03AC\u03B9}]", CASE, "[{F\uFB01}]", "[\uFB03{ffi}]", CASE, "[a-z]","[A-Za-z\u017F\u212A]", CASE, "[abc]","[A-Ca-c]", CASE, "[ABC]","[A-Ca-c]", }; UnicodeSet s = new UnicodeSet(); UnicodeSet t = new UnicodeSet(); for (int i=0; i " + exp); } else { errln("FAIL: " + pat + ".closeOver(" + selector + ") => " + s.toPattern(true) + ", expected " + exp); } } // Test the pattern API s.applyPattern("[abc]", UnicodeSet.CASE); expectContainment(s, "abcABC", "defDEF"); s = new UnicodeSet("[^abc]", UnicodeSet.CASE); expectContainment(s, "defDEF", "abcABC"); } public void TestEscapePattern() { // The following pattern must contain at least one range "c-d" // for which isRuleWhiteSpace(c) or isRuleWhiteSpace(d) is true. String pattern = "[\\uFEFF \\u200E-\\u20FF \\uFFF9-\\uFFFC \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]"; String exp = "[\\u200E-\\u20FF\\uFEFF\\uFFF9-\\uFFFC\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]"; // We test this with two passes; in the second pass we // pre-unescape the pattern. Since U+200E is rule whitespace, // this fails -- which is what we expect. 
for (int pass=1; pass<=2; ++pass) { String pat = pattern; if (pass==2) { pat = Utility.unescape(pat); } // Pattern is only good for pass 1 boolean isPatternValid = (pass==1); UnicodeSet set = null; try { set = new UnicodeSet(pat); } catch (IllegalArgumentException e) { set = null; } if ((set != null) != isPatternValid){ errln("FAIL: applyPattern(" + Utility.escape(pat) + ") => " + set); continue; } if (set == null) { continue; } if (set.contains((char)0x0644)){ errln("FAIL: " + Utility.escape(pat) + " contains(U+0664)"); } String newpat = set.toPattern(true); if (newpat.equals(exp)) { logln(Utility.escape(pat) + " => " + newpat); } else { errln("FAIL: " + Utility.escape(pat) + " => " + newpat); } for (int i=0; i iterator() { ArrayList oldList = new ArrayList(); for (UnicodeSetIterator it = new UnicodeSetIterator(set1); it.next();) { oldList.add(it.getString()); } ArrayList list1 = new ArrayList(); for (String s : set1) { list1.add(s); } assertEquals("iteration test", oldList, list1); //addAllTo(Iterable, U) list1.clear(); set1.addAllTo(list1); assertEquals("iteration test", oldList, list1); list1 = set1.addAllTo(new ArrayList()); assertEquals("addAllTo", oldList, list1); ArrayList list2 = set2.addAllTo(new ArrayList()); ArrayList list3 = set3.addAllTo(new ArrayList()); // put them into different order, to check that order doesn't matter TreeSet sorted1 = set1.addAllTo(new TreeSet()); TreeSet sorted2 = set2.addAllTo(new TreeSet()); TreeSet sorted3 = set3.addAllTo(new TreeSet()); //containsAll(Collection collection) assertTrue("containsAll", set1.containsAll(list1)); assertTrue("containsAll", set1.containsAll(sorted1)); assertTrue("containsAll", set1.containsAll(list2)); assertTrue("containsAll", set1.containsAll(sorted2)); assertFalse("containsAll", set1.containsAll(list3)); assertFalse("containsAll", set1.containsAll(sorted3)); assertFalse("containsAll", set2.containsAll(list3)); assertFalse("containsAll", set2.containsAll(sorted3)); //containsSome(Collection) 
assertTrue("containsSome", set1.containsSome(list1)); assertTrue("containsSome", set1.containsSome(sorted1)); assertTrue("containsSome", set1.containsSome(list2)); assertTrue("containsSome", set1.containsSome(sorted2)); assertTrue("containsSome", set1.containsSome(list3)); assertTrue("containsSome", set1.containsSome(sorted3)); assertFalse("containsSome", set2.containsSome(list3)); assertFalse("containsSome", set2.containsSome(sorted3)); //containsNone(Collection) assertFalse("containsNone", set1.containsNone(list1)); assertFalse("containsNone", set1.containsNone(sorted1)); assertFalse("containsNone", set1.containsNone(list2)); assertFalse("containsNone", set1.containsNone(sorted2)); assertFalse("containsNone", set1.containsNone(list3)); assertFalse("containsNone", set1.containsNone(sorted3)); assertTrue("containsNone", set2.containsNone(list3)); assertTrue("containsNone", set2.containsNone(sorted3)); //addAll(String...) UnicodeSet other3 = new UnicodeSet().addAll("d", "m", "n", "dh"); assertEquals("addAll", set3, other3); //removeAll(Collection) UnicodeSet mod1 = new UnicodeSet(set1).removeAll(set2); UnicodeSet mod2 = new UnicodeSet(set1).removeAll(list2); assertEquals("remove all", mod1, mod2); //retainAll(Collection) mod1 = new UnicodeSet(set1).retainAll(set2); mod2 = new UnicodeSet(set1).retainAll(set2.addAllTo(new LinkedHashSet())); assertEquals("remove all", mod1, mod2); } public void TestComparison() { UnicodeSet set1 = new UnicodeSet("[a-b d-g {ch} {zh}]").freeze(); UnicodeSet set2 = new UnicodeSet("[c-e {ch}]").freeze(); UnicodeSet set3 = new UnicodeSet("[d m-n z {dh}]").freeze(); //compareTo(UnicodeSet) // do indirectly, by sorting List unsorted = Arrays.asList(set3, set2, set1); List goalShortest = Arrays.asList(set2, set3, set1); List goalLongest = Arrays.asList(set1, set3, set2); List goalLex = Arrays.asList(set1, set2, set3); List sorted = new ArrayList(new TreeSet(unsorted)); assertNotEquals("compareTo-shorter-first", unsorted, sorted); 
assertEquals("compareTo-shorter-first", goalShortest, sorted); TreeSet sorted1 = new TreeSet(new Comparator(){ public int compare(UnicodeSet o1, UnicodeSet o2) { // TODO Auto-generated method stub return o1.compareTo(o2, ComparisonStyle.LONGER_FIRST); }}); sorted1.addAll(unsorted); sorted = new ArrayList(sorted1); assertNotEquals("compareTo-longer-first", unsorted, sorted); assertEquals("compareTo-longer-first", goalLongest, sorted); sorted1 = new TreeSet(new Comparator(){ public int compare(UnicodeSet o1, UnicodeSet o2) { // TODO Auto-generated method stub return o1.compareTo(o2, ComparisonStyle.LEXICOGRAPHIC); }}); sorted1.addAll(unsorted); sorted = new ArrayList(sorted1); assertNotEquals("compareTo-lex", unsorted, sorted); assertEquals("compareTo-lex", goalLex, sorted); //compare(String, int) // make a list of interesting combinations List sources = Arrays.asList("\u0000", "a", "b", "\uD7FF", "\uD800", "\uDBFF", "\uDC00", "\uDFFF", "\uE000", "\uFFFD", "\uFFFF"); TreeSet target = new TreeSet(); for (String s : sources) { target.add(s); for (String t : sources) { target.add(s + t); for (String u : sources) { target.add(s + t + u); } } } // now compare all the combinations. If any of them is a code point, use it. 
for (String last : target) { for (String curr : target) { int lastCount = Character.codePointCount(last, 0, last.length()); int currCount = Character.codePointCount(curr, 0, curr.length()); int comparison; if (lastCount == 1) { comparison = UnicodeSet.compare(last.codePointAt(0), curr); } else if (currCount == 1) { comparison = UnicodeSet.compare(last, curr.codePointAt(0)); } else { continue; } if (comparison != last.compareTo(curr)) { // repeat for debugging if (lastCount == 1) { comparison = UnicodeSet.compare(last.codePointAt(0), curr); } else if (currCount == 1) { comparison = UnicodeSet.compare(last, curr.codePointAt(0)); } errln("Failure in comparing " + last + " & " + curr); } } } //compare(Iterable, Iterable) int max = 10; List test1 = new ArrayList(max); List test2 = new ArrayList(max); for (int i = 0; i <= max; ++i) { test1.add("a" + i); test2.add("a" + (max - i)); // add in reverse order } assertNotEquals("compare iterable test", test1, test2); TreeSet sortedTest1 = new TreeSet(test1); TreeSet sortedTest2 = new TreeSet(test2); assertEquals("compare iterable test", sortedTest1, sortedTest2); } public void TestRangeConstructor() { UnicodeSet w = new UnicodeSet().addAll(3,5); UnicodeSet s = new UnicodeSet(3,5); assertEquals("new constructor", w, s); w = new UnicodeSet().addAll(3,5).addAll(7,7); UnicodeSet t = new UnicodeSet(3,5, 7,7); assertEquals("new constructor", w, t); // check to make sure right exceptions are thrown Class expected = IllegalArgumentException.class; Class actual; try { actual = null; @SuppressWarnings("unused") UnicodeSet u = new UnicodeSet(5); } catch (IllegalArgumentException e) { actual = e.getClass(); } assertEquals("exception if odd", expected, actual); try { actual = null; @SuppressWarnings("unused") UnicodeSet u = new UnicodeSet(3, 2, 7, 9); } catch (IllegalArgumentException e) { actual = e.getClass(); } assertEquals("exception for start/end problem", expected, actual); try { actual = null; @SuppressWarnings("unused") UnicodeSet 
u = new UnicodeSet(3, 5, 6, 9); } catch (IllegalArgumentException e) { actual = e.getClass(); } assertEquals("exception for end/start problem", expected, actual); CheckRangeSpeed(10000, new UnicodeSet("[:whitespace:]")); CheckRangeSpeed(1000, new UnicodeSet("[:letter:]")); } /** * @param iterations * @param testSet */ private void CheckRangeSpeed(int iterations, UnicodeSet testSet) { testSet.complement().complement(); String testPattern = testSet.toString(); // fill a set of pairs from the pattern int[] pairs = new int[testSet.getRangeCount()*2]; int j = 0; for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.nextRange();) { pairs[j++] = it.codepoint; pairs[j++] = it.codepointEnd; } UnicodeSet fromRange = new UnicodeSet(testSet); assertEquals("from range vs pattern", testSet, fromRange); double start = System.currentTimeMillis(); for (int i = 0; i < iterations; ++i) { fromRange = new UnicodeSet(testSet); } double middle = System.currentTimeMillis(); for (int i = 0; i < iterations; ++i) { new UnicodeSet(testPattern); } double end = System.currentTimeMillis(); double rangeConstructorTime = (middle - start)/iterations; double patternConstructorTime = (end - middle)/iterations; String message = "Range constructor:\t" + rangeConstructorTime + ";\tPattern constructor:\t" + patternConstructorTime + "\t\t" + percent.format(rangeConstructorTime/patternConstructorTime-1); if (rangeConstructorTime < 2*patternConstructorTime) { logln(message); } else { errln(message); } } NumberFormat percent = NumberFormat.getPercentInstance(); { percent.setMaximumFractionDigits(2); } // **************************************** // UTILITIES // **************************************** public void checkModification(UnicodeSet original, boolean isFrozen) { main: for (int i = 0; ;++i) { UnicodeSet test = (UnicodeSet) (isFrozen ? 
original.clone() : original.cloneAsThawed()); boolean gotException = true; boolean checkEquals = true; try { switch(i) { case 0: test.add(0); break; case 1: test.add(0,1); break; case 2: test.add("a"); break; case 3: List a = new ArrayList(); a.add("a"); test.addAll(a); break; case 4: test.addAll("ab"); break; case 5: test.addAll(new UnicodeSet("[ab]")); break; case 6: test.applyIntPropertyValue(0,0); break; case 7: test.applyPattern("[ab]"); break; case 8: test.applyPattern("[ab]", true); break; case 9: test.applyPattern("[ab]", 0); break; case 10: test.applyPropertyAlias("hex","true"); break; case 11: test.applyPropertyAlias("hex", "true", null); break; case 12: test.closeOver(UnicodeSet.CASE); break; case 13: test.compact(); checkEquals = false; break; case 14: test.complement(0); break; case 15: test.complement(0,0); break; case 16: test.complement("ab"); break; case 17: test.complementAll("ab"); break; case 18: test.complementAll(new UnicodeSet("[ab]")); break; case 19: test.remove(' '); break; case 20: test.remove(' ','a'); break; case 21: test.remove(" "); break; case 22: test.removeAll(" a"); break; case 23: test.removeAll(new UnicodeSet("[\\ a]")); break; case 24: test.retain(' '); break; case 25: test.retain(' ','a'); break; case 26: test.retain(" "); break; case 27: test.retainAll(" a"); break; case 28: test.retainAll(new UnicodeSet("[\\ a]")); break; case 29: test.set(0,1); break; case 30: test.set(new UnicodeSet("[ab]")); break; default: continue main; // so we don't keep having to change the endpoint, and gaps are not skipped. 
case 35: return; } gotException = false; } catch (UnsupportedOperationException e) { // do nothing } if (isFrozen && !gotException) errln(i + ") attempt to modify frozen object didn't result in an exception"); if (!isFrozen && gotException) errln(i + ") attempt to modify thawed object did result in an exception"); if (checkEquals) { if (test.equals(original)) { if (!isFrozen) errln(i + ") attempt to modify thawed object didn't change the object"); } else { // unequal if (isFrozen) errln(i + ") attempt to modify frozen object changed the object"); } } } } // Following cod block is commented out to eliminate PrettyPrinter depenencies // String[] prettyData = { // "[\\uD7DE-\\uD90C \\uDCB5-\\uDD9F]", // special case // "[:any:]", // "[:whitespace:]", // "[:linebreak=AL:]", // }; // // public void TestPrettyPrinting() { // try{ // PrettyPrinter pp = new PrettyPrinter(); // // int i = 0; // for (; i < prettyData.length; ++i) { // UnicodeSet test = new UnicodeSet(prettyData[i]); // checkPrettySet(pp, i, test); // } // Random random = new Random(0); // UnicodeSet test = new UnicodeSet(); // // // To keep runtimes under control, make the number of random test cases // // to try depends on the test framework exhaustive setting. // // params.inclusions = 5: default exhaustive value // // params.inclusions = 10: max exhaustive value. 
// int iterations = 50;
// if (params.inclusion > 5) {
// iterations = (params.inclusion-5) * 200;
// }
// for (; i < iterations; ++i) {
// double start = random.nextGaussian() * 0x10000;
// if (start < 0) start = - start;
// if (start > 0x10FFFF) {
// start = 0x10FFFF;
// }
// double end = random.nextGaussian() * 0x100;
// if (end < 0) end = -end;
// end = start + end;
// if (end > 0x10FFFF) {
// end = 0x10FFFF;
// }
// test.complement((int)start, (int)end);
// checkPrettySet(pp, i, test);
// }
// }catch(RuntimeException ex){
// warnln("Could not load Collator");
// }
// }
//
// private void checkPrettySet(PrettyPrinter pp, int i, UnicodeSet test) {
// String pretty = pp.toPattern(test);
// UnicodeSet retry = new UnicodeSet(pretty);
// if (!test.equals(retry)) {
// errln(i + ". Failed test: " + test + " != " + pretty);
// } else {
// logln(i + ". Worked for " + truncate(test.toString()) + " => " + truncate(pretty));
// }
// }
//
// private String truncate(String string) {
// if (string.length() <= 100) return string;
// return string.substring(0,97) + "...";
// }

    /**
     * Simple {@link SymbolTable} for the tests: keeps variable bodies in a map
     * keyed by variable name and logs every add, lookup and parse through
     * {@code logln}.
     */
    public class TokenSymbolTable implements SymbolTable {
        /** Variable name (no leading '$') mapped to its {@code char[]} body. */
        HashMap contents = new HashMap();

        /**
         * (Non-SymbolTable API) Add the given variable and value to
         * the table. Variable should NOT contain leading '$'.
         *
         * @param var   variable name, without the leading '$'
         * @param value replacement text; stored as a fresh {@code char[]} copy
         */
        public void add(String var, String value) {
            add(var, value.toCharArray());
        }

        /**
         * (Non-SymbolTable API) Add the given variable and value to
         * the table. Variable should NOT contain leading '$'.
         *
         * @param var  variable name, without the leading '$'
         * @param body replacement characters; the array itself is stored, not a copy
         */
        public void add(String var, char[] body) {
            logln("TokenSymbolTable: add \"" + var + "\" => \"" +
                  new String(body) + "\"");
            contents.put(var, body);
        }

        /**
         * Returns the body stored for {@code s}, or {@code null} when no such
         * variable was added.
         *
         * @see com.ibm.icu.text.SymbolTable#lookup(java.lang.String)
         */
        public char[] lookup(String s) {
            char[] body = (char[]) contents.get(s);
            // Null-check before logging: new String((char[]) null) would throw
            // NullPointerException for an unknown name instead of returning null.
            logln("TokenSymbolTable: lookup \"" + s + "\" => " +
                  (body == null ? "null" : "\"" + new String(body) + "\""));
            return body;
        }

        /**
         * Always returns {@code null}; this table defines no matchers.
         *
         * @see com.ibm.icu.text.SymbolTable#lookupMatcher(int)
         */
        public UnicodeMatcher lookupMatcher(int ch) {
            return null;
        }

        /**
         * Scans {@code text} from {@code pos.getIndex()} up to {@code limit},
         * consuming code points for as long as they are Unicode identifier
         * parts, then moves {@code pos} past the consumed run.
         *
         * @return the consumed substring (empty when nothing was consumed)
         * @see com.ibm.icu.text.SymbolTable#parseReference(java.lang.String, java.text.ParsePosition, int)
         */
        public String parseReference(String text, ParsePosition pos, int limit) {
            int cp;
            int start = pos.getIndex();
            int i;
            // Step by whole code points so supplementary characters are not split.
            for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
                cp = UTF16.charAt(text, i);
                if (!UCharacter.isUnicodeIdentifierPart(cp)) {
                    break;
                }
            }
            String reference = text.substring(start, i);
            logln("TokenSymbolTable: parse \"" + text + "\" from " +
                  start + " to " + i +
                  " => \"" + reference + "\"");
            pos.setIndex(i);
            return reference;
        }
    }

    public void TestSurrogate() { String DATA[] = { // These should all behave identically "[abc\\uD800\\uDC00]", "[abc\uD800\uDC00]", "[abc\\U00010000]", }; for (int i=0; i= 0 but is " + n + " for " + Utility.escape(set.toString())); return; } int last = 0; for (int i=0; i end) { errln("FAIL result of " + msg + ": range " + (i+1) + " start > end: " + start + ", " + end + " for " + Utility.escape(set.toString())); } if (i > 0 && start <= last) { errln("FAIL result of " + msg + ": range " + (i+1) + " overlaps previous range: " + start + ", " + end + " for " + Utility.escape(set.toString())); } last = end; } } /** * Convert a bitmask to a UnicodeSet. 
*/ UnicodeSet bitsToSet(int a) { UnicodeSet result = new UnicodeSet(); for (int i = 0; i < 32; ++i) { if ((a & (1< 0xFFFF) { end = 0xFFFF; i = set.getRangeCount(); // Should be unnecessary } pairs.append((char)start).append((char)end); } return pairs.toString(); } /** * Test function. Make sure that the sets have the right relation */ void expectRelation(Object relationObj, Object set1Obj, Object set2Obj, String message) { int relation = ((Integer) relationObj).intValue(); UnicodeSet set1 = (UnicodeSet) set1Obj; UnicodeSet set2 = (UnicodeSet) set2Obj; // by-the-by, check the iterator checkRoundTrip(set1); checkRoundTrip(set2); boolean contains = set1.containsAll(set2); boolean isContained = set2.containsAll(set1); boolean disjoint = set1.containsNone(set2); boolean equals = set1.equals(set2); UnicodeSet intersection = new UnicodeSet(set1).retainAll(set2); UnicodeSet minus12 = new UnicodeSet(set1).removeAll(set2); UnicodeSet minus21 = new UnicodeSet(set2).removeAll(set1); // test basic properties if (contains != (intersection.size() == set2.size())) { errln("FAIL contains1" + set1.toPattern(true) + ", " + set2.toPattern(true)); } if (contains != (intersection.equals(set2))) { errln("FAIL contains2" + set1.toPattern(true) + ", " + set2.toPattern(true)); } if (isContained != (intersection.size() == set1.size())) { errln("FAIL isContained1" + set1.toPattern(true) + ", " + set2.toPattern(true)); } if (isContained != (intersection.equals(set1))) { errln("FAIL isContained2" + set1.toPattern(true) + ", " + set2.toPattern(true)); } if ((contains && isContained) != equals) { errln("FAIL equals" + set1.toPattern(true) + ", " + set2.toPattern(true)); } if (disjoint != (intersection.size() == 0)) { errln("FAIL disjoint" + set1.toPattern(true) + ", " + set2.toPattern(true)); } // Now see if the expected relation is true int status = (minus12.size() != 0 ? 4 : 0) | (intersection.size() != 0 ? 2 : 0) | (minus21.size() != 0 ? 
1 : 0); if (status != relation) { errln("FAIL relation incorrect" + message + "; desired = " + RELATION_NAME[relation] + "; found = " + RELATION_NAME[status] + "; set1 = " + set1.toPattern(true) + "; set2 = " + set2.toPattern(true) ); } } /** * Basic consistency check for a few items. * That the iterator works, and that we can create a pattern and * get the same thing back */ void checkRoundTrip(UnicodeSet s) { String pat = s.toPattern(false); UnicodeSet t = copyWithIterator(s, false); checkEqual(s, t, "iterator roundtrip"); t = copyWithIterator(s, true); // try range checkEqual(s, t, "iterator roundtrip"); t = new UnicodeSet(pat); checkEqual(s, t, "toPattern(false)"); pat = s.toPattern(true); t = new UnicodeSet(pat); checkEqual(s, t, "toPattern(true)"); } UnicodeSet copyWithIterator(UnicodeSet s, boolean withRange) { UnicodeSet t = new UnicodeSet(); UnicodeSetIterator it = new UnicodeSetIterator(s); if (withRange) { while (it.nextRange()) { if (it.codepoint == UnicodeSetIterator.IS_STRING) { t.add(it.string); } else { t.add(it.codepoint, it.codepointEnd); } } } else { while (it.next()) { if (it.codepoint == UnicodeSetIterator.IS_STRING) { t.add(it.string); } else { t.add(it.codepoint); } } } return t; } boolean checkEqual(UnicodeSet s, UnicodeSet t, String message) { if (!s.equals(t)) { errln("FAIL " + message + "; source = " + s.toPattern(true) + "; result = " + t.toPattern(true) ); return false; } return true; } void expectEqual(String name, String pat1, String pat2) { UnicodeSet set1, set2; try { set1 = new UnicodeSet(pat1); set2 = new UnicodeSet(pat2); } catch (IllegalArgumentException e) { errln("FAIL: Couldn't create UnicodeSet from pattern for \"" + name + "\": " + e.getMessage()); return; } if(!set1.equals(set2)) { errln("FAIL: Sets built from patterns differ for \"" + name + "\""); } } /** * Expect the given set to contain the characters in charsIn and * to not contain those in charsOut. 
*/ void expectContainment(String pat, String charsIn, String charsOut) { UnicodeSet set; try { set = new UnicodeSet(pat); } catch (IllegalArgumentException e) { errln("FAIL: Couldn't create UnicodeSet from pattern \"" + pat + "\": " + e.getMessage()); return; } expectContainment(set, charsIn, charsOut); } /** * Expect the given set to contain the characters in charsIn and * to not contain those in charsOut. */ void expectContainment(UnicodeSet set, String charsIn, String charsOut) { StringBuffer bad = new StringBuffer(); if (charsIn != null) { charsIn = Utility.unescape(charsIn); for (int i=0; i 0) { errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + ", expected containment of " + charsIn)); } else { logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); } } if (charsOut != null) { charsOut = Utility.unescape(charsOut); bad.setLength(0); for (int i=0; i 0) { errln(Utility.escape("FAIL: set " + set + " contains " + bad + ", expected non-containment of " + charsOut)); } else { logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); } } } void expectPattern(UnicodeSet set, String pattern, String expectedPairs) { set.applyPattern(pattern); if (!getPairs(set).equals(expectedPairs)) { errln("FAIL: applyPattern(\"" + pattern + "\") => pairs \"" + Utility.escape(getPairs(set)) + "\", expected \"" + Utility.escape(expectedPairs) + "\""); } else { logln("Ok: applyPattern(\"" + pattern + "\") => pairs \"" + Utility.escape(getPairs(set)) + "\""); } } void expectToPattern(UnicodeSet set, String expPat, String[] expStrings) { String pat = set.toPattern(true); if (pat.equals(expPat)) { logln("Ok: toPattern() => \"" + pat + "\""); } else { errln("FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\""); return; } if (expStrings == null) { return; } boolean in = true; for (int i=0; i source) */ public void TestAddCollection() { UnicodeSet us = new UnicodeSet(); Collection s = null; try { us.add(s); 
errln("UnicodeSet.add(Collection) was suppose to return an exception for a null parameter."); } catch (Exception e) { } } }