/* ******************************************************************************* * Copyright (C) 2009-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.lang; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.icu.text.UnicodeSet.SpanCondition; /** * @test * @summary General test of UnicodeSet string span. */ public class UnicodeSetStringSpanTest extends TestFmwk { public static void main(String[] args) throws Exception { new UnicodeSetStringSpanTest().run(args); } // Simple test first, easier to debug. public void TestSimpleStringSpan() { String pattern = "[a{ab}{bc}]"; String string = "abc"; UnicodeSet set = new UnicodeSet(pattern); set.complement(); int pos = set.spanBack(string, 3, SpanCondition.SIMPLE); if (pos != 1) { errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)", set.toString(), string, pos)); } pos = set.span(string, SpanCondition.SIMPLE); if (pos != 3) { errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)", set.toString(), string, pos)); } pos = set.span(string, 1, SpanCondition.SIMPLE); if (pos != 3) { errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)", set.toString(), string, pos)); } } // test our slow implementation public void TestSimpleStringSpanSlow() { String pattern = "[a{ab}{bc}]"; String string = "abc"; UnicodeSet uset = new UnicodeSet(pattern); uset.complement(); UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE); if (length != 1) { errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)", set.toString(), string, length)); } length = containsSpanUTF16(set, string, SpanCondition.SIMPLE); if (length != 3) { errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)", set.toString(), string, length)); } length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE); if (length != 2) { errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)", set.toString(), string, length)); } } // Test select patterns and strings, and test SIMPLE. public void TestSimpleStringSpanAndFreeze() { String pattern = "[x{xy}{xya}{axy}{ax}]"; final String string = "xx" + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx" + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx" + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa"; UnicodeSet set = new UnicodeSet(pattern); if (set.containsAll(string)) { errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE"); } // Remove trailing "aaaa". String string16 = string.substring(0, string.length() - 4); if (!set.containsAll(string16)) { errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE"); } String s16 = "byayaxya"; if ( set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4 || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4 || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4 || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5 || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4 || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) { errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value"); } pattern = "[a{ab}{abc}{cd}]"; set.applyPattern(pattern); s16 = "acdabcdabccd"; if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) { errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value"); } set.freeze(); if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) { errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value"); } pattern = "[d{cd}{bcd}{ab}]"; set = (UnicodeSet)set.cloneAsThawed(); set.applyPattern(pattern).freeze(); s16 = "abbcdabcdabd"; if ( set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0 || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6 || set.spanBack(s16, 5, SpanCondition.SIMPLE) != 0) { errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value"); } } // more complex test. -------------------------------------------------------- // Make the strings in a UnicodeSet easily accessible. static class UnicodeSetWithStrings { private UnicodeSet set; private String strings[]; private int stringsLength; private boolean hasSurrogates; public UnicodeSetWithStrings(final UnicodeSet normalSet) { set = normalSet; stringsLength = 0; hasSurrogates = false; strings = new String[20]; int size = set.size(); if (size > 0 && set.charAt(size - 1) < 0) { // If a set's last element is not a code point, then it must contain strings. // Iterate over the set, skip all code point ranges, and cache the strings. UnicodeSetIterator iter = new UnicodeSetIterator(set); while (iter.nextRange() && stringsLength < strings.length) { if (iter.codepoint == UnicodeSetIterator.IS_STRING) { // Store the pointer to the set's string element // which we happen to know is a stable pointer. strings[stringsLength] = iter.getString(); ++stringsLength; } } } } public final UnicodeSet getSet() { return set; } public boolean hasStrings() { return (stringsLength > 0); } public boolean hasStringsWithSurrogates() { return hasSurrogates; } } static class UnicodeSetWithStringsIterator { private UnicodeSetWithStrings fSet; private int nextStringIndex; public UnicodeSetWithStringsIterator(final UnicodeSetWithStrings set) { fSet = set; nextStringIndex = 0; } public void reset() { nextStringIndex = 0; } public final String nextString() { if (nextStringIndex < fSet.stringsLength) { return fSet.strings[nextStringIndex++]; } else { return null; } } } // Compare 16-bit Unicode strings (which may be malformed UTF-16) // at code point boundaries. // That is, each edge of a match must not be in the middle of a surrogate pair. static boolean matches16CPB(final String s, int start, int limit, final String t) { limit -= start; int length = t.length(); return t.equals(s.substring(start, start + length)) && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) && UTF16.isTrailSurrogate(s.charAt(start))) && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) && UTF16.isTrailSurrogate(s.charAt(start + length))); } // Implement span() with contains() for comparison. static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s, SpanCondition spanCondition) { final UnicodeSet realSet = set.getSet(); int length = s.length(); if (!set.hasStrings()) { boolean spanContained = false; if (spanCondition != SpanCondition.NOT_CONTAINED) { spanContained = true; // Pin to 0/1 values. } int c; int start = 0, prev; while ((prev = start) < length) { c = s.codePointAt(start); start = s.offsetByCodePoints(start, 1); if (realSet.contains(c) != spanContained) { break; } } return prev; } else if (spanCondition == SpanCondition.NOT_CONTAINED) { UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int start, next; for (start = next = 0; start < length;) { c = s.codePointAt(next); next = s.offsetByCodePoints(next, 1); if (realSet.contains(c)) { break; } String str; iter.reset(); while ((str = iter.nextString()) != null) { if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { // spanNeedsStrings=true; return start; } } start = next; } return start; } else /* CONTAINED or SIMPLE */{ UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int start, next, maxSpanLimit = 0; for (start = next = 0; start < length;) { c = s.codePointAt(next); next = s.offsetByCodePoints(next, 1); if (!realSet.contains(c)) { next = start; // Do not span this single, not-contained code point. } String str; iter.reset(); while ((str = iter.nextString()) != null) { if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) { // spanNeedsStrings=true; int matchLimit = start + str.length(); if (matchLimit == length) { return length; } if (spanCondition == SpanCondition.CONTAINED) { // Iterate for the shortest match at each position. // Recurse for each but the shortest match. if (next == start) { next = matchLimit; // First match from start. } else { if (matchLimit < next) { // Remember shortest match from start for iteration. int temp = next; next = matchLimit; matchLimit = temp; } // Recurse for non-shortest match from start. int spanLength = containsSpanUTF16(set, s.substring(matchLimit), SpanCondition.CONTAINED); if ((matchLimit + spanLength) > maxSpanLimit) { maxSpanLimit = matchLimit + spanLength; if (maxSpanLimit == length) { return length; } } } } else /* spanCondition==SIMPLE */{ if (matchLimit > next) { // Remember longest match from start. next = matchLimit; } } } } if (next == start) { break; // No match from start. } start = next; } if (start > maxSpanLimit) { return start; } else { return maxSpanLimit; } } } static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length, SpanCondition spanCondition) { if (length == 0) { return 0; } final UnicodeSet realSet = set.getSet(); if (!set.hasStrings()) { boolean spanContained = false; if (spanCondition != SpanCondition.NOT_CONTAINED) { spanContained = true; // Pin to 0/1 values. } int c; int prev = length; do { c = s.codePointBefore(prev); if (realSet.contains(c) != spanContained) { break; } prev = s.offsetByCodePoints(prev, -1); } while (prev > 0); return prev; } else if (spanCondition == SpanCondition.NOT_CONTAINED) { UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int prev = length, length0 = length; do { c = s.codePointBefore(prev); if (realSet.contains(c)) { break; } String str; iter.reset(); while ((str = iter.nextString()) != null) { if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { // spanNeedsStrings=true; return prev; } } prev = s.offsetByCodePoints(prev, -1); } while (prev > 0); return prev; } else /* SpanCondition.CONTAINED or SIMPLE */{ UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set); int c; int prev = length, minSpanStart = length, length0 = length; do { c = s.codePointBefore(length); length = s.offsetByCodePoints(length, -1); if (!realSet.contains(c)) { length = prev; // Do not span this single, not-contained code point. } String str; iter.reset(); while ((str = iter.nextString()) != null) { if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) { // spanNeedsStrings=true; int matchStart = prev - str.length(); if (matchStart == 0) { return 0; } if (spanCondition == SpanCondition.CONTAINED) { // Iterate for the shortest match at each position. // Recurse for each but the shortest match. if (length == prev) { length = matchStart; // First match from prev. } else { if (matchStart > length) { // Remember shortest match from prev for iteration. int temp = length; length = matchStart; matchStart = temp; } // Recurse for non-shortest match from prev. int spanStart = containsSpanBackUTF16(set, s, matchStart, SpanCondition.CONTAINED); if (spanStart < minSpanStart) { minSpanStart = spanStart; if (minSpanStart == 0) { return 0; } } } } else /* spanCondition==SIMPLE */{ if (matchStart < length) { // Remember longest match from prev. length = matchStart; } } } } if (length == prev) { break; // No match from prev. } } while ((prev = length) > 0); if (prev < minSpanStart) { return prev; } else { return minSpanStart; } } } // spans to be performed and compared static final int SPAN_UTF16 = 1; static final int SPAN_UTF8 = 2; static final int SPAN_UTFS = 3; static final int SPAN_SET = 4; static final int SPAN_COMPLEMENT = 8; static final int SPAN_POLARITY = 0xc; static final int SPAN_FWD = 0x10; static final int SPAN_BACK = 0x20; static final int SPAN_DIRS = 0x30; static final int SPAN_CONTAINED = 0x100; static final int SPAN_SIMPLE = 0x200; static final int SPAN_CONDITION = 0x300; static final int SPAN_ALL = 0x33f; static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) { return spanCondition == SpanCondition.NOT_CONTAINED ? contained : SpanCondition.NOT_CONTAINED; } /* * Count spans on a string with the method according to type and set the span limits. The set may be the complement * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack() * according to the expected number of spans. Sets typeName to an empty string if there is no such type. Returns -1 * if the span option is filtered out. */ static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s, int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity, int expectCount) { final UnicodeSet realSet = set.getSet(); int start, count, i; SpanCondition spanCondition, firstSpanCondition, contained; boolean isForward; int length = s.length(); if (type < 0 || 7 < type) { typeName[0] = null; return 0; } final String typeNames16[] = { "contains", "contains(LM)", "span", "span(LM)", "containsBack", "containsBack(LM)", "spanBack", "spanBack(LM)" }; typeName[0] = typeNames16[type]; // filter span options if (type <= 3) { // span forward if ((whichSpans & SPAN_FWD) == 0) { return -1; } isForward = true; } else { // span backward if ((whichSpans & SPAN_BACK) == 0) { return -1; } isForward = false; } if ((type & 1) == 0) { // use SpanCondition.CONTAINED if ((whichSpans & SPAN_CONTAINED) == 0) { return -1; } contained = SpanCondition.CONTAINED; } else { // use SIMPLE if ((whichSpans & SPAN_SIMPLE) == 0) { return -1; } contained = SpanCondition.SIMPLE; } // Default first span condition for going forward with an uncomplemented set. spanCondition = SpanCondition.NOT_CONTAINED; if (isComplement) { spanCondition = invertSpanCondition(spanCondition, contained); } // First span condition for span(), used to terminate the spanBack() iteration. firstSpanCondition = spanCondition; // spanBack(): Its initial span condition is span()'s last span condition, // which is the opposite of span()'s first span condition // if we expect an even number of spans. // (The loop inverts spanCondition (expectCount-1) times // before the expectCount'th span() call.) // If we do not compare forward and backward directions, then we do not have an // expectCount and just start with firstSpanCondition. if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) { spanCondition = invertSpanCondition(spanCondition, contained); } count = 0; switch (type) { case 0: case 1: start = 0; for (;;) { start += containsSpanUTF16(set, s.substring(start), spanCondition); if (count < limitsCapacity) { limits[count] = start; } ++count; if (start >= length) { break; } spanCondition = invertSpanCondition(spanCondition, contained); } break; case 2: case 3: start = 0; for (;;) { start = realSet.span(s, start, spanCondition); if (count < limitsCapacity) { limits[count] = start; } ++count; if (start >= length) { break; } spanCondition = invertSpanCondition(spanCondition, contained); } break; case 4: case 5: for (;;) { ++count; if (count <= limitsCapacity) { limits[limitsCapacity - count] = length; } length = containsSpanBackUTF16(set, s, length, spanCondition); if (length == 0 && spanCondition == firstSpanCondition) { break; } spanCondition = invertSpanCondition(spanCondition, contained); } if (count < limitsCapacity) { for (i = count; i-- > 0;) { limits[i] = limits[limitsCapacity - count + i]; } } break; case 6: case 7: for (;;) { ++count; if (count <= limitsCapacity) { limits[limitsCapacity - count] = length >= 0 ? length : s.length(); } length = realSet.spanBack(s, length, spanCondition); if (length == 0 && spanCondition == firstSpanCondition) { break; } spanCondition = invertSpanCondition(spanCondition, contained); } if (count < limitsCapacity) { for (i = count; i-- > 0;) { limits[i] = limits[limitsCapacity - count + i]; } } break; default: typeName = null; return -1; } return count; } // sets to be tested; odd index=isComplement static final int SLOW = 0; static final int SLOW_NOT = 1; static final int FAST = 2; static final int FAST_NOT = 3; static final int SET_COUNT = 4; static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" }; /* * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly). * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition). * * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an * input expectCount<0). */ void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, int expectLimits[], int expectCount, // TODO final String testName, int index) { int[] limits = new int[500]; int limitsCount; int i, j; String[] typeName = new String[1]; int type; for (i = 0; i < SET_COUNT; ++i) { if ((i & 1) == 0) { // Even-numbered sets are original, uncomplemented sets. if ((whichSpans & SPAN_SET) == 0) { continue; } } else { // Odd-numbered sets are complemented. if ((whichSpans & SPAN_COMPLEMENT) == 0) { continue; } } for (type = 0;; ++type) { limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits, limits.length, expectCount); if (typeName[0] == null) { break; // All types tried. } if (limitsCount < 0) { continue; // Span option filtered out. } if (expectCount < 0) { expectCount = limitsCount; if (limitsCount > limits.length) { errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans", testName, index, setNames[i], typeName[0], limitsCount, limits.length)); return; } for (j = limitsCount; j-- > 0;) { expectLimits[j] = limits[j]; } } else if (limitsCount != expectCount) { errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i], typeName[0], limitsCount, expectCount)); } else { for (j = 0; j < limitsCount; ++j) { if (limits[j] != expectLimits[j]) { errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName, index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j])); break; } } } } } // Compare span() with containsAll()/containsNone(), // but only if we have expectLimits[] from the uncomplemented set. if ((whichSpans & SPAN_SET) != 0) { final String s16 = s; String string; int prev = 0, limit, len; for (i = 0; i < expectCount; ++i) { limit = expectLimits[i]; len = limit - prev; if (len > 0) { string = s16.substring(prev, prev + len); // read-only alias if (0 != (i & 1)) { if (!sets[SLOW].getSet().containsAll(string)) { errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()", testName, index, setNames[SLOW], prev, limit)); return; } if (!sets[FAST].getSet().containsAll(string)) { errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()", testName, index, setNames[FAST], prev, limit)); return; } } else { if (!sets[SLOW].getSet().containsNone(string)) { errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()", testName, index, setNames[SLOW], prev, limit)); return; } if (!sets[FAST].getSet().containsNone(string)) { errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()", testName, index, setNames[FAST], prev, limit)); return; } } } prev = limit; } } } // Specifically test either UTF-16 or UTF-8. void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans, final String testName, int index) { int[] expectLimits = new int[500]; int expectCount = -1; verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index); } // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text, // unless either UTF is turned off in whichSpans. // Testing UTF-16 and UTF-8 together requires that surrogate code points // have the same contains(c) value as U+FFFD. void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans, final String testName, int index) { int[] expectLimits = new int[500]; int expectCount; expectCount = -1; // Get expectLimits[] from verifySpan(). if ((whichSpans & SPAN_UTF16) != 0) { verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index); } } static int nextCodePoint(int c) { // Skip some large and boring ranges. switch (c) { case 0x3441: return 0x4d7f; case 0x5100: return 0x9f00; case 0xb040: return 0xd780; case 0xe041: return 0xf8fe; case 0x10100: return 0x20000; case 0x20041: return 0xe0000; case 0xe0101: return 0x10fffd; default: return c + 1; } } // Verify that all implementations represent the same set. void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) { StringBuffer s = new StringBuffer(); int localWhichSpans; int c, first; for (first = c = 0;; c = nextCodePoint(c)) { if (c > 0x10ffff || s.length() > 1024) { localWhichSpans = whichSpans; verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first); if (c > 0x10ffff) { break; } s.delete(0, s.length()); first = c; } UTF16.append(s, c); } } // Test with a particular, interesting string. // Specify length and try NUL-termination. static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space 0x3b1, 0x3b2, 0x3b3, // Greek 0xd900, // lead surrogate 0x3000, 0x30ab, 0x30ad, // wide space, Katakana 0xdc05, // trail surrogate 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul 0xd900, 0xdc05, // unassigned supplementary 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary 0xd7a4, 0xdc05, 0xd900, 0x2028 // unassigned, surrogates in wrong order, LS }; static String interestingString = new String(interestingStringChars); static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]"; public void TestInterestingStringSpan() { UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1)); SpanCondition spanCondition = SpanCondition.NOT_CONTAINED; int expect = 2; int start = 14; int c = 0xd840; boolean contains = uset.contains(c); if (false != contains) { errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)", c)); } UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition); if (expect != len) { errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)", interestingString, start, len, expect)); } len = uset.span(interestingString, start, spanCondition) - start; if (expect != len) { errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)", interestingString, start, len, expect)); } } void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) { if ((whichSpans & SPAN_UTF16) == 0) { return; } verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1); } // Take a set of span options and multiply them so that // each portion only has one of the options a, b and c. // If b==0, then the set of options is just modified with mask and a. // If b!=0 and c==0, then the set of options is just modified with mask, a and b. static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) { int s; int i; for (i = 0; i < whichSpansCount; ++i) { s = whichSpans[i] & mask; whichSpans[i] = s | a; if (b != 0) { whichSpans[whichSpansCount + i] = s | b; if (c != 0) { whichSpans[2 * whichSpansCount + i] = s | c; } } } return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount; } // They are not representable in UTF-8, and a leading trail surrogate // and a trailing lead surrogate must not match in the middle of a proper surrogate pair. // U+20001 == \\uD840\\uDC01 // U+20400 == \\uD841\\uDC00 static final String patternWithUnpairedSurrogate = "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]"; static final String stringWithUnpairedSurrogate = "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"; static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; static final String longPattern = "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]"; public void TestStringWithUnpairedSurrogateSpan() { String string = Utility.unescape(stringWithUnpairedSurrogate); UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate)); SpanCondition spanCondition = SpanCondition.NOT_CONTAINED; int start = 17; int expect = 5; UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset); int len = containsSpanUTF16(set, string.substring(start), spanCondition); if (expect != len) { errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)", string, start, len, expect)); } len = uset.span(string, start, spanCondition) - start; if (expect != len) { errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)", string, start, len, expect)); } } public void TestSpan() { // "[...]" is a UnicodeSet pattern. // "*" performs tests on all Unicode code points and on a selection of // malformed UTF-8/16 strings. // "-options" limits the scope of testing for the current set. // By default, the test verifies that equivalent boundaries are found // for UTF-16 and UTF-8, going forward and backward, // alternating NOT_CONTAINED with // either CONTAINED or SIMPLE. // Single-character options: // 8 -- UTF-16 and UTF-8 boundaries may differ. // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates), // or the set contains strings with unpaired surrogates // which do not translate to valid UTF-8. // c -- set.span() and set.complement().span() boundaries may differ. // Cause: Set strings are not complemented. // b -- span() and spanBack() boundaries may differ. // Cause: Strings in the set overlap, and spanBack(CONTAINED) // and spanBack(SIMPLE) are defined to // match with non-overlapping substrings. // For example, with a set containing "ab" and "ba", // span() of "aba" yields boundaries { 0, 2, 3 } // because the initial "ab" matches from 0 to 2, // while spanBack() yields boundaries { 0, 1, 3 } // because the final "ba" matches from 1 to 3. // l -- CONTAINED and SIMPLE boundaries may differ. // Cause: Strings in the set overlap, and a longer match may // require a sequence including non-longest substrings. // For example, with a set containing "ab", "abc" and "cd", // span(contained) of "abcd" spans the entire string // but span(longest match) only spans the first 3 characters. // Each "-options" first resets all options and then applies the specified options. // A "-" without options resets the options. // The options are also reset for each new set. // Other strings will be spanned. final String testdata[] = { "[:ID_Continue:]", "*", "[:White_Space:]", "*", "[]", "*", "[\\u0000-\\U0010FFFF]", "*", "[\\u0000\\u0080\\u0800\\U00010000]", "*", "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]", "*", unicodeSet1, "-c", "*", "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]", "-c", "*", // Overlapping strings cause overlapping attempts to match. "[x{xy}{xya}{axy}{ax}]", "-cl", // More repetitions of "xya" would take too long with the recursive // reference implementation. // containsAll()=false // test_string 0x14 "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here. "xx" + // set.complement().span(contained) will stop between the two 'x'es. "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here. "aaa", // containsAll()=true // test_string 0x15 "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy", "-bc", // test_string 0x17 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 } "-c", "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 } "byayax", // span() -> { 4, 6 } complement.span() -> { 6 } "-", "byaya", // span() -> { 5 } "byay", // span() -> { 4 } "bya", // span() -> { 3 } // span(longest match) will not span the whole string. "[a{ab}{bc}]", "-cl", // test_string 0x21 "abc", "[a{ab}{abc}{cd}]", "-cl", "acdabcdabccd", // spanBack(longest match) will not span the whole string. "[c{ab}{bc}]", "-cl", "abc", "[d{cd}{bcd}{ab}]", "-cl", "abbcdabcdabd", // Test with non-ASCII set strings - test proper handling of surrogate pairs // and UTF-8 trail bytes. // Copies of above test sets and strings, but transliterated to have // different code points with similar trail units. // Previous: a b c d // Unicode: 042B 30AB 200AB 204AB // UTF-16: 042B 30AB D840 DCAB D841 DCAB // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]", "-cl", "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB", "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]", "-cl", "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB", // Stress bookkeeping and recursion. // The following strings are barely doable with the recursive // reference implementation. // The not-contained character at the end prevents an early exit from the span(). "[b{bb}]", "-c", // test_string 0x33 "bbbbbbbbbbbbbbbbbbbbbbbb-", // On complement sets, span() and spanBack() get different results // because b is not in the complement set and there is an odd number of b's // in the test string. "-bc", "bbbbbbbbbbbbbbbbbbbbbbbbb-", // Test with set strings with an initial or final code point span // longer than 254. longPattern, "-c", _64_a + _64_a + _64_a + _63_a + "b", _64_a + _64_a + _64_a + _64_a + "b", _64_a + _64_a + _64_a + _64_a + "aaaabbbb", "a" + _64_b + _64_b + _64_b + _63_b, "a" + _64_b + _64_b + _64_b + _64_b, "aaaabbbb" + _64_b + _64_b + _64_b + _64_b, // Test with strings containing unpaired surrogates. patternWithUnpairedSurrogate, "-8cl", stringWithUnpairedSurrogate }; int i, j; int whichSpansCount = 1; int[] whichSpans = new int[96]; for (i = whichSpans.length; i-- > 0;) { whichSpans[i] = SPAN_ALL; } UnicodeSet[] sets = new UnicodeSet[SET_COUNT]; UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT]; String testName = null; String testNameLimit; for (i = 0; i < testdata.length; ++i) { final String s = testdata[i]; if (s.charAt(0) == '[') { // Create new test sets from this pattern. for (j = 0; j < SET_COUNT; ++j) { sets_with_str[j] = null; sets[j] = null; } sets[SLOW] = new UnicodeSet(Utility.unescape(s)); sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]); sets[SLOW_NOT].complement(); // Intermediate set: Test cloning of a frozen set. UnicodeSet fast = new UnicodeSet(sets[SLOW]); fast.freeze(); sets[FAST] = (UnicodeSet) fast.clone(); fast = null; UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]); fastNot.freeze(); sets[FAST_NOT] = (UnicodeSet) fastNot.clone(); fastNot = null; for (j = 0; j < SET_COUNT; ++j) { sets_with_str[j] = new UnicodeSetWithStrings(sets[j]); } testName = s + ':'; whichSpans[0] = SPAN_ALL; whichSpansCount = 1; } else if (s.charAt(0) == '-') { whichSpans[0] = SPAN_ALL; whichSpansCount = 1; for (j = 1; j < s.length(); j++) { switch (s.charAt(j)) { case 'c': whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET, SPAN_COMPLEMENT, 0); break; case 'b': whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK, 0); break; case 'l': // test CONTAINED FWD & BACK, and separately // SIMPLE only FWD, and separately // SIMPLE only BACK whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION), SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE); break; case '8': whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16, SPAN_UTF8, 0); break; default: errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i])); break; } } } else if (s.equals("*")) { testNameLimit = "bad_string"; for (j = 0; j < whichSpansCount; ++j) { if (whichSpansCount > 1) { testNameLimit += String.format("%%0x%3x", whichSpans[j]); } verifySpanUTF16String(sets_with_str, whichSpans[j], testName); } testNameLimit = "contents"; for (j = 0; j < whichSpansCount; ++j) { if (whichSpansCount > 1) { testNameLimit += String.format("%%0x%3x", whichSpans[j]); } verifySpanContents(sets_with_str, whichSpans[j], testName); } } else { String string = Utility.unescape(s); testNameLimit = "test_string"; for (j = 0; j < whichSpansCount; ++j) { if (whichSpansCount > 1) { testNameLimit += String.format("%%0x%3x", whichSpans[j]); } verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i); } } } } }