2 *******************************************************************************
\r
3 * Copyright (C) 2009-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.lang;
\r
9 import com.ibm.icu.dev.test.TestFmwk;
\r
10 import com.ibm.icu.impl.Utility;
\r
11 import com.ibm.icu.text.UTF16;
\r
12 import com.ibm.icu.text.UnicodeSet;
\r
13 import com.ibm.icu.text.UnicodeSetIterator;
\r
14 import com.ibm.icu.text.UnicodeSet.SpanCondition;
\r
18 * @summary General test of UnicodeSet string span.
\r
20 public class UnicodeSetStringSpanTest extends TestFmwk {
\r
22 public static void main(String[] args) throws Exception {
\r
23 new UnicodeSetStringSpanTest().run(args);
\r
26 // Simple test first, easier to debug.
\r
27 public void TestSimpleStringSpan() {
\r
28 String pattern = "[a{ab}{bc}]";
\r
29 String string = "abc";
\r
30 UnicodeSet set = new UnicodeSet(pattern);
\r
32 int pos = set.spanBack(string, 3, SpanCondition.SIMPLE);
\r
34 errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)",
\r
35 set.toString(), string, pos));
\r
37 pos = set.span(string, SpanCondition.SIMPLE);
\r
39 errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
\r
40 set.toString(), string, pos));
\r
42 pos = set.span(string, 1, SpanCondition.SIMPLE);
\r
44 errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
\r
45 set.toString(), string, pos));
\r
49 // test our slow implementation
\r
50 public void TestSimpleStringSpanSlow() {
\r
51 String pattern = "[a{ab}{bc}]";
\r
52 String string = "abc";
\r
53 UnicodeSet uset = new UnicodeSet(pattern);
\r
55 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
\r
57 int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE);
\r
59 errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)",
\r
60 set.toString(), string, length));
\r
62 length = containsSpanUTF16(set, string, SpanCondition.SIMPLE);
\r
64 errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)",
\r
65 set.toString(), string, length));
\r
67 length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE);
\r
69 errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)",
\r
70 set.toString(), string, length));
\r
74 // Test select patterns and strings, and test SIMPLE.
\r
75 public void TestSimpleStringSpanAndFreeze() {
\r
76 String pattern = "[x{xy}{xya}{axy}{ax}]";
\r
77 final String string = "xx"
\r
78 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
\r
79 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
\r
80 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa";
\r
82 UnicodeSet set = new UnicodeSet(pattern);
\r
83 if (set.containsAll(string)) {
\r
84 errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE");
\r
87 // Remove trailing "aaaa".
\r
88 String string16 = string.substring(0, string.length() - 4);
\r
89 if (!set.containsAll(string16)) {
\r
90 errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE");
\r
93 String s16 = "byayaxya";
\r
94 if ( set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4
\r
95 || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4
\r
96 || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4
\r
97 || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5
\r
98 || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4
\r
99 || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) {
\r
100 errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value");
\r
103 pattern = "[a{ab}{abc}{cd}]";
\r
104 set.applyPattern(pattern);
\r
105 s16 = "acdabcdabccd";
\r
106 if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
\r
107 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
\r
108 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) {
\r
109 errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
\r
112 if ( set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
\r
113 || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
\r
114 || set.span(s16.substring(7), SpanCondition.SIMPLE) != 5) {
\r
115 errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
\r
118 pattern = "[d{cd}{bcd}{ab}]";
\r
119 set = (UnicodeSet)set.cloneAsThawed();
\r
120 set.applyPattern(pattern).freeze();
\r
121 s16 = "abbcdabcdabd";
\r
122 if ( set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0
\r
123 || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6
\r
124 || set.spanBack(s16, 5, SpanCondition.SIMPLE) != 0) {
\r
125 errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value");
\r
129 // more complex test. --------------------------------------------------------
\r
131 // Make the strings in a UnicodeSet easily accessible.
\r
132 static class UnicodeSetWithStrings {
\r
134 private UnicodeSet set;
\r
136 private String strings[];
\r
137 private int stringsLength;
\r
138 private boolean hasSurrogates;
\r
140 public UnicodeSetWithStrings(final UnicodeSet normalSet) {
\r
143 hasSurrogates = false;
\r
144 strings = new String[20];
\r
145 int size = set.size();
\r
146 if (size > 0 && set.charAt(size - 1) < 0) {
\r
147 // If a set's last element is not a code point, then it must contain strings.
\r
148 // Iterate over the set, skip all code point ranges, and cache the strings.
\r
149 UnicodeSetIterator iter = new UnicodeSetIterator(set);
\r
150 while (iter.nextRange() && stringsLength < strings.length) {
\r
151 if (iter.codepoint == UnicodeSetIterator.IS_STRING) {
\r
152 // Store the pointer to the set's string element
\r
153 // which we happen to know is a stable pointer.
\r
154 strings[stringsLength] = iter.getString();
\r
161 public final UnicodeSet getSet() {
\r
165 public boolean hasStrings() {
\r
166 return (stringsLength > 0);
\r
169 public boolean hasStringsWithSurrogates() {
\r
170 return hasSurrogates;
\r
175 static class UnicodeSetWithStringsIterator {
\r
177 private UnicodeSetWithStrings fSet;
\r
178 private int nextStringIndex;
\r
180 public UnicodeSetWithStringsIterator(final UnicodeSetWithStrings set) {
\r
182 nextStringIndex = 0;
\r
185 public void reset() {
\r
186 nextStringIndex = 0;
\r
189 public final String nextString() {
\r
190 if (nextStringIndex < fSet.stringsLength) {
\r
191 return fSet.strings[nextStringIndex++];
\r
199 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
\r
200 // at code point boundaries.
\r
201 // That is, each edge of a match must not be in the middle of a surrogate pair.
\r
202 static boolean matches16CPB(final String s, int start, int limit, final String t) {
\r
204 int length = t.length();
\r
205 return t.equals(s.substring(start, start + length))
\r
206 && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
\r
207 UTF16.isTrailSurrogate(s.charAt(start)))
\r
208 && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) &&
\r
209 UTF16.isTrailSurrogate(s.charAt(start + length)));
\r
212 // Implement span() with contains() for comparison.
\r
213 static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s,
\r
214 SpanCondition spanCondition) {
\r
215 final UnicodeSet realSet = set.getSet();
\r
216 int length = s.length();
\r
217 if (!set.hasStrings()) {
\r
218 boolean spanContained = false;
\r
219 if (spanCondition != SpanCondition.NOT_CONTAINED) {
\r
220 spanContained = true; // Pin to 0/1 values.
\r
224 int start = 0, prev;
\r
225 while ((prev = start) < length) {
\r
226 c = s.codePointAt(start);
\r
227 start = s.offsetByCodePoints(start, 1);
\r
228 if (realSet.contains(c) != spanContained) {
\r
233 } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
\r
234 UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
\r
237 for (start = next = 0; start < length;) {
\r
238 c = s.codePointAt(next);
\r
239 next = s.offsetByCodePoints(next, 1);
\r
240 if (realSet.contains(c)) {
\r
245 while ((str = iter.nextString()) != null) {
\r
246 if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
\r
247 // spanNeedsStrings=true;
\r
254 } else /* CONTAINED or SIMPLE */{
\r
255 UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
\r
257 int start, next, maxSpanLimit = 0;
\r
258 for (start = next = 0; start < length;) {
\r
259 c = s.codePointAt(next);
\r
260 next = s.offsetByCodePoints(next, 1);
\r
261 if (!realSet.contains(c)) {
\r
262 next = start; // Do not span this single, not-contained code point.
\r
266 while ((str = iter.nextString()) != null) {
\r
267 if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
\r
268 // spanNeedsStrings=true;
\r
269 int matchLimit = start + str.length();
\r
270 if (matchLimit == length) {
\r
273 if (spanCondition == SpanCondition.CONTAINED) {
\r
274 // Iterate for the shortest match at each position.
\r
275 // Recurse for each but the shortest match.
\r
276 if (next == start) {
\r
277 next = matchLimit; // First match from start.
\r
279 if (matchLimit < next) {
\r
280 // Remember shortest match from start for iteration.
\r
285 // Recurse for non-shortest match from start.
\r
286 int spanLength = containsSpanUTF16(set, s.substring(matchLimit),
\r
287 SpanCondition.CONTAINED);
\r
288 if ((matchLimit + spanLength) > maxSpanLimit) {
\r
289 maxSpanLimit = matchLimit + spanLength;
\r
290 if (maxSpanLimit == length) {
\r
295 } else /* spanCondition==SIMPLE */{
\r
296 if (matchLimit > next) {
\r
297 // Remember longest match from start.
\r
303 if (next == start) {
\r
304 break; // No match from start.
\r
308 if (start > maxSpanLimit) {
\r
311 return maxSpanLimit;
\r
316 static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length,
\r
317 SpanCondition spanCondition) {
\r
321 final UnicodeSet realSet = set.getSet();
\r
322 if (!set.hasStrings()) {
\r
323 boolean spanContained = false;
\r
324 if (spanCondition != SpanCondition.NOT_CONTAINED) {
\r
325 spanContained = true; // Pin to 0/1 values.
\r
331 c = s.codePointBefore(prev);
\r
332 if (realSet.contains(c) != spanContained) {
\r
335 prev = s.offsetByCodePoints(prev, -1);
\r
336 } while (prev > 0);
\r
338 } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
\r
339 UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
\r
341 int prev = length, length0 = length;
\r
343 c = s.codePointBefore(prev);
\r
344 if (realSet.contains(c)) {
\r
349 while ((str = iter.nextString()) != null) {
\r
350 if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
\r
351 // spanNeedsStrings=true;
\r
355 prev = s.offsetByCodePoints(prev, -1);
\r
356 } while (prev > 0);
\r
358 } else /* SpanCondition.CONTAINED or SIMPLE */{
\r
359 UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
\r
361 int prev = length, minSpanStart = length, length0 = length;
\r
363 c = s.codePointBefore(length);
\r
364 length = s.offsetByCodePoints(length, -1);
\r
365 if (!realSet.contains(c)) {
\r
366 length = prev; // Do not span this single, not-contained code point.
\r
370 while ((str = iter.nextString()) != null) {
\r
371 if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
\r
372 // spanNeedsStrings=true;
\r
373 int matchStart = prev - str.length();
\r
374 if (matchStart == 0) {
\r
377 if (spanCondition == SpanCondition.CONTAINED) {
\r
378 // Iterate for the shortest match at each position.
\r
379 // Recurse for each but the shortest match.
\r
380 if (length == prev) {
\r
381 length = matchStart; // First match from prev.
\r
383 if (matchStart > length) {
\r
384 // Remember shortest match from prev for iteration.
\r
386 length = matchStart;
\r
389 // Recurse for non-shortest match from prev.
\r
390 int spanStart = containsSpanBackUTF16(set, s, matchStart,
\r
391 SpanCondition.CONTAINED);
\r
392 if (spanStart < minSpanStart) {
\r
393 minSpanStart = spanStart;
\r
394 if (minSpanStart == 0) {
\r
399 } else /* spanCondition==SIMPLE */{
\r
400 if (matchStart < length) {
\r
401 // Remember longest match from prev.
\r
402 length = matchStart;
\r
407 if (length == prev) {
\r
408 break; // No match from prev.
\r
410 } while ((prev = length) > 0);
\r
411 if (prev < minSpanStart) {
\r
414 return minSpanStart;
\r
419 // spans to be performed and compared
\r
420 static final int SPAN_UTF16 = 1;
\r
421 static final int SPAN_UTF8 = 2;
\r
422 static final int SPAN_UTFS = 3;
\r
424 static final int SPAN_SET = 4;
\r
425 static final int SPAN_COMPLEMENT = 8;
\r
426 static final int SPAN_POLARITY = 0xc;
\r
428 static final int SPAN_FWD = 0x10;
\r
429 static final int SPAN_BACK = 0x20;
\r
430 static final int SPAN_DIRS = 0x30;
\r
432 static final int SPAN_CONTAINED = 0x100;
\r
433 static final int SPAN_SIMPLE = 0x200;
\r
434 static final int SPAN_CONDITION = 0x300;
\r
436 static final int SPAN_ALL = 0x33f;
\r
438 static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) {
\r
439 return spanCondition == SpanCondition.NOT_CONTAINED ? contained
\r
440 : SpanCondition.NOT_CONTAINED;
\r
444 * Count spans on a string with the method according to type and set the span limits. The set may be the complement
\r
445 * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack()
\r
446 * according to the expected number of spans. Sets typeName[0] to null if there is no such type. Returns -1
\r
447 * if the span option is filtered out.
\r
449 static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s,
\r
450 int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity,
\r
452 final UnicodeSet realSet = set.getSet();
\r
453 int start, count, i;
\r
454 SpanCondition spanCondition, firstSpanCondition, contained;
\r
457 int length = s.length();
\r
458 if (type < 0 || 7 < type) {
\r
459 typeName[0] = null;
\r
463 final String typeNames16[] = {
\r
469 "containsBack(LM)",
\r
473 typeName[0] = typeNames16[type];
\r
475 // filter span options
\r
478 if ((whichSpans & SPAN_FWD) == 0) {
\r
484 if ((whichSpans & SPAN_BACK) == 0) {
\r
489 if ((type & 1) == 0) {
\r
490 // use SpanCondition.CONTAINED
\r
491 if ((whichSpans & SPAN_CONTAINED) == 0) {
\r
494 contained = SpanCondition.CONTAINED;
\r
497 if ((whichSpans & SPAN_SIMPLE) == 0) {
\r
500 contained = SpanCondition.SIMPLE;
\r
503 // Default first span condition for going forward with an uncomplemented set.
\r
504 spanCondition = SpanCondition.NOT_CONTAINED;
\r
505 if (isComplement) {
\r
506 spanCondition = invertSpanCondition(spanCondition, contained);
\r
509 // First span condition for span(), used to terminate the spanBack() iteration.
\r
510 firstSpanCondition = spanCondition;
\r
512 // spanBack(): Its initial span condition is span()'s last span condition,
\r
513 // which is the opposite of span()'s first span condition
\r
514 // if we expect an even number of spans.
\r
515 // (The loop inverts spanCondition (expectCount-1) times
\r
516 // before the expectCount'th span() call.)
\r
517 // If we do not compare forward and backward directions, then we do not have an
\r
518 // expectCount and just start with firstSpanCondition.
\r
519 if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) {
\r
520 spanCondition = invertSpanCondition(spanCondition, contained);
\r
529 start += containsSpanUTF16(set, s.substring(start), spanCondition);
\r
530 if (count < limitsCapacity) {
\r
531 limits[count] = start;
\r
534 if (start >= length) {
\r
537 spanCondition = invertSpanCondition(spanCondition, contained);
\r
544 start = realSet.span(s, start, spanCondition);
\r
545 if (count < limitsCapacity) {
\r
546 limits[count] = start;
\r
549 if (start >= length) {
\r
552 spanCondition = invertSpanCondition(spanCondition, contained);
\r
559 if (count <= limitsCapacity) {
\r
560 limits[limitsCapacity - count] = length;
\r
562 length = containsSpanBackUTF16(set, s, length, spanCondition);
\r
563 if (length == 0 && spanCondition == firstSpanCondition) {
\r
566 spanCondition = invertSpanCondition(spanCondition, contained);
\r
568 if (count < limitsCapacity) {
\r
569 for (i = count; i-- > 0;) {
\r
570 limits[i] = limits[limitsCapacity - count + i];
\r
578 if (count <= limitsCapacity) {
\r
579 limits[limitsCapacity - count] = length >= 0 ? length : s.length();
\r
581 length = realSet.spanBack(s, length, spanCondition);
\r
582 if (length == 0 && spanCondition == firstSpanCondition) {
\r
585 spanCondition = invertSpanCondition(spanCondition, contained);
\r
587 if (count < limitsCapacity) {
\r
588 for (i = count; i-- > 0;) {
\r
589 limits[i] = limits[limitsCapacity - count + i];
\r
601 // sets to be tested; odd index=isComplement
\r
602 static final int SLOW = 0;
\r
603 static final int SLOW_NOT = 1;
\r
604 static final int FAST = 2;
\r
605 static final int FAST_NOT = 3;
\r
606 static final int SET_COUNT = 4;
\r
608 static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" };
\r
611 * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen
\r
612 * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly).
\r
613 * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition).
\r
615 * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an
\r
616 * input expectCount<0).
\r
618 void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
\r
619 int expectLimits[], int expectCount, // TODO
\r
620 final String testName, int index) {
\r
621 int[] limits = new int[500];
\r
624 String[] typeName = new String[1];
\r
627 for (i = 0; i < SET_COUNT; ++i) {
\r
628 if ((i & 1) == 0) {
\r
629 // Even-numbered sets are original, uncomplemented sets.
\r
630 if ((whichSpans & SPAN_SET) == 0) {
\r
634 // Odd-numbered sets are complemented.
\r
635 if ((whichSpans & SPAN_COMPLEMENT) == 0) {
\r
639 for (type = 0;; ++type) {
\r
640 limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits,
\r
641 limits.length, expectCount);
\r
642 if (typeName[0] == null) {
\r
643 break; // All types tried.
\r
645 if (limitsCount < 0) {
\r
646 continue; // Span option filtered out.
\r
648 if (expectCount < 0) {
\r
649 expectCount = limitsCount;
\r
650 if (limitsCount > limits.length) {
\r
651 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans",
\r
652 testName, index, setNames[i], typeName[0], limitsCount, limits.length));
\r
655 for (j = limitsCount; j-- > 0;) {
\r
656 expectLimits[j] = limits[j];
\r
658 } else if (limitsCount != expectCount) {
\r
659 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i],
\r
660 typeName[0], limitsCount, expectCount));
\r
662 for (j = 0; j < limitsCount; ++j) {
\r
663 if (limits[j] != expectLimits[j]) {
\r
664 errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName,
\r
665 index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j]));
\r
673 // Compare span() with containsAll()/containsNone(),
\r
674 // but only if we have expectLimits[] from the uncomplemented set.
\r
675 if ((whichSpans & SPAN_SET) != 0) {
\r
676 final String s16 = s;
\r
678 int prev = 0, limit, len;
\r
679 for (i = 0; i < expectCount; ++i) {
\r
680 limit = expectLimits[i];
\r
681 len = limit - prev;
\r
683 string = s16.substring(prev, prev + len); // read-only alias
\r
684 if (0 != (i & 1)) {
\r
685 if (!sets[SLOW].getSet().containsAll(string)) {
\r
686 errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
\r
687 testName, index, setNames[SLOW], prev, limit));
\r
690 if (!sets[FAST].getSet().containsAll(string)) {
\r
691 errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
\r
692 testName, index, setNames[FAST], prev, limit));
\r
696 if (!sets[SLOW].getSet().containsNone(string)) {
\r
697 errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
\r
698 testName, index, setNames[SLOW], prev, limit));
\r
701 if (!sets[FAST].getSet().containsNone(string)) {
\r
702 errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
\r
703 testName, index, setNames[FAST], prev, limit));
\r
713 // Specifically test either UTF-16 or UTF-8.
\r
714 void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
\r
715 final String testName, int index) {
\r
716 int[] expectLimits = new int[500];
\r
717 int expectCount = -1;
\r
718 verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index);
\r
721 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
\r
722 // unless either UTF is turned off in whichSpans.
\r
723 // Testing UTF-16 and UTF-8 together requires that surrogate code points
\r
724 // have the same contains(c) value as U+FFFD.
\r
725 void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans,
\r
726 final String testName, int index) {
\r
727 int[] expectLimits = new int[500];
\r
730 expectCount = -1; // Get expectLimits[] from verifySpan().
\r
732 if ((whichSpans & SPAN_UTF16) != 0) {
\r
733 verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index);
\r
737 static int nextCodePoint(int c) {
\r
738 // Skip some large and boring ranges.
\r
759 // Verify that all implementations represent the same set.
\r
760 void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
\r
761 StringBuffer s = new StringBuffer();
\r
762 int localWhichSpans;
\r
764 for (first = c = 0;; c = nextCodePoint(c)) {
\r
765 if (c > 0x10ffff || s.length() > 1024) {
\r
766 localWhichSpans = whichSpans;
\r
767 verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first);
\r
768 if (c > 0x10ffff) {
\r
771 s.delete(0, s.length());
\r
774 UTF16.append(s, c);
\r
778 // Test with a particular, interesting string.
\r
779 // Specify length and try NUL-termination.
\r
780 static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space
\r
781 0x3b1, 0x3b2, 0x3b3, // Greek
\r
782 0xd900, // lead surrogate
\r
783 0x3000, 0x30ab, 0x30ad, // wide space, Katakana
\r
784 0xdc05, // trail surrogate
\r
785 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
\r
786 0xd900, 0xdc05, // unassigned supplementary
\r
787 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
\r
788 0xd7a4, 0xdc05, 0xd900, 0x2028 // unassigned, surrogates in wrong order, LS
\r
790 static String interestingString = new String(interestingStringChars);
\r
791 static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]";
\r
793 public void TestInterestingStringSpan() {
\r
794 UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1));
\r
795 SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
\r
800 boolean contains = uset.contains(c);
\r
801 if (false != contains) {
\r
802 errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)",
\r
806 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
\r
807 int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition);
\r
808 if (expect != len) {
\r
809 errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)",
\r
810 interestingString, start, len, expect));
\r
813 len = uset.span(interestingString, start, spanCondition) - start;
\r
814 if (expect != len) {
\r
815 errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)",
\r
816 interestingString, start, len, expect));
\r
820 void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
\r
821 if ((whichSpans & SPAN_UTF16) == 0) {
\r
824 verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1);
\r
827 // Take a set of span options and multiply them so that
\r
828 // each portion only has one of the options a, b and c.
\r
829 // If b==0, then the set of options is just modified with mask and a.
\r
830 // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
\r
831 static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) {
\r
835 for (i = 0; i < whichSpansCount; ++i) {
\r
836 s = whichSpans[i] & mask;
\r
837 whichSpans[i] = s | a;
\r
839 whichSpans[whichSpansCount + i] = s | b;
\r
841 whichSpans[2 * whichSpansCount + i] = s | c;
\r
845 return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount;
\r
848 // Unpaired surrogates are not representable in UTF-8, and a leading trail surrogate
\r
849 // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
\r
850 // U+20001 == \\uD840\\uDC01
\r
851 // U+20400 == \\uD841\\uDC00
\r
852 static final String patternWithUnpairedSurrogate =
\r
853 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]";
\r
854 static final String stringWithUnpairedSurrogate =
\r
855 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb";
\r
857 static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
\r
858 static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
\r
859 static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
\r
860 static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
\r
861 static final String longPattern =
\r
862 "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]";
\r
864 public void TestStringWithUnpairedSurrogateSpan() {
\r
865 String string = Utility.unescape(stringWithUnpairedSurrogate);
\r
866 UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate));
\r
867 SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
\r
871 UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
\r
872 int len = containsSpanUTF16(set, string.substring(start), spanCondition);
\r
873 if (expect != len) {
\r
874 errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)",
\r
875 string, start, len, expect));
\r
878 len = uset.span(string, start, spanCondition) - start;
\r
879 if (expect != len) {
\r
880 errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)",
\r
881 string, start, len, expect));
\r
885 public void TestSpan() {
\r
886 // "[...]" is a UnicodeSet pattern.
\r
887 // "*" performs tests on all Unicode code points and on a selection of
\r
888 // malformed UTF-8/16 strings.
\r
889 // "-options" limits the scope of testing for the current set.
\r
890 // By default, the test verifies that equivalent boundaries are found
\r
891 // for UTF-16 and UTF-8, going forward and backward,
\r
892 // alternating NOT_CONTAINED with
\r
893 // either CONTAINED or SIMPLE.
\r
894 // Single-character options:
\r
895 // 8 -- UTF-16 and UTF-8 boundaries may differ.
\r
896 // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
\r
897 // or the set contains strings with unpaired surrogates
\r
898 // which do not translate to valid UTF-8.
\r
899 // c -- set.span() and set.complement().span() boundaries may differ.
\r
900 // Cause: Set strings are not complemented.
\r
901 // b -- span() and spanBack() boundaries may differ.
\r
902 // Cause: Strings in the set overlap, and spanBack(CONTAINED)
\r
903 // and spanBack(SIMPLE) are defined to
\r
904 // match with non-overlapping substrings.
\r
905 // For example, with a set containing "ab" and "ba",
\r
906 // span() of "aba" yields boundaries { 0, 2, 3 }
\r
907 // because the initial "ab" matches from 0 to 2,
\r
908 // while spanBack() yields boundaries { 0, 1, 3 }
\r
909 // because the final "ba" matches from 1 to 3.
\r
910 // l -- CONTAINED and SIMPLE boundaries may differ.
\r
911 // Cause: Strings in the set overlap, and a longer match may
\r
912 // require a sequence including non-longest substrings.
\r
913 // For example, with a set containing "ab", "abc" and "cd",
\r
914 // span(contained) of "abcd" spans the entire string
\r
915 // but span(longest match) only spans the first 3 characters.
\r
916 // Each "-options" first resets all options and then applies the specified options.
\r
917 // A "-" without options resets the options.
\r
918 // The options are also reset for each new set.
\r
919 // Other strings will be spanned.
\r
920 final String testdata[] = {
\r
927 "[\\u0000-\\U0010FFFF]",
\r
929 "[\\u0000\\u0080\\u0800\\U00010000]",
\r
931 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
\r
936 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
\r
940 // Overlapping strings cause overlapping attempts to match.
\r
941 "[x{xy}{xya}{axy}{ax}]",
\r
944 // More repetitions of "xya" would take too long with the recursive
\r
945 // reference implementation.
\r
946 // containsAll()=false
\r
947 // test_string 0x14
\r
948 "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here.
\r
949 "xx" + // set.complement().span(contained) will stop between the two 'x'es.
\r
950 "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here.
\r
953 // containsAll()=true
\r
954 // test_string 0x15
\r
955 "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy",
\r
958 // test_string 0x17
\r
959 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
\r
961 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
\r
962 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
\r
964 "byaya", // span() -> { 5 }
\r
965 "byay", // span() -> { 4 }
\r
966 "bya", // span() -> { 3 }
\r
968 // span(longest match) will not span the whole string.
\r
971 // test_string 0x21
\r
974 "[a{ab}{abc}{cd}]",
\r
978 // spanBack(longest match) will not span the whole string.
\r
983 "[d{cd}{bcd}{ab}]",
\r
987 // Test with non-ASCII set strings - test proper handling of surrogate pairs
\r
988 // and UTF-8 trail bytes.
\r
989 // Copies of above test sets and strings, but transliterated to have
\r
990 // different code points with similar trail units.
\r
991 // Previous: a b c d
\r
992 // Unicode: 042B 30AB 200AB 204AB
\r
993 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
\r
994 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
\r
995 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
\r
997 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
\r
999 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
\r
1001 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
\r
1003 // Stress bookkeeping and recursion.
\r
1004 // The following strings are barely doable with the recursive
\r
1005 // reference implementation.
\r
1006 // The not-contained character at the end prevents an early exit from the span().
\r
1009 // test_string 0x33
\r
1010 "bbbbbbbbbbbbbbbbbbbbbbbb-",
\r
1011 // On complement sets, span() and spanBack() get different results
\r
1012 // because b is not in the complement set and there is an odd number of b's
\r
1013 // in the test string.
\r
1015 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
\r
1017 // Test with set strings with an initial or final code point span
\r
1018 // longer than 254.
\r
1021 _64_a + _64_a + _64_a + _63_a + "b",
\r
1022 _64_a + _64_a + _64_a + _64_a + "b",
\r
1023 _64_a + _64_a + _64_a + _64_a + "aaaabbbb",
\r
1024 "a" + _64_b + _64_b + _64_b + _63_b,
\r
1025 "a" + _64_b + _64_b + _64_b + _64_b,
\r
1026 "aaaabbbb" + _64_b + _64_b + _64_b + _64_b,
\r
1028 // Test with strings containing unpaired surrogates.
\r
1029 patternWithUnpairedSurrogate, "-8cl",
\r
1030 stringWithUnpairedSurrogate };
\r
1032 int whichSpansCount = 1;
\r
1033 int[] whichSpans = new int[96];
\r
1034 for (i = whichSpans.length; i-- > 0;) {
\r
1035 whichSpans[i] = SPAN_ALL;
\r
1038 UnicodeSet[] sets = new UnicodeSet[SET_COUNT];
\r
1039 UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT];
\r
1041 String testName = null;
\r
1042 String testNameLimit;
\r
1044 for (i = 0; i < testdata.length; ++i) {
\r
1045 final String s = testdata[i];
\r
1046 if (s.charAt(0) == '[') {
\r
1047 // Create new test sets from this pattern.
\r
1048 for (j = 0; j < SET_COUNT; ++j) {
\r
1049 sets_with_str[j] = null;
\r
1052 sets[SLOW] = new UnicodeSet(Utility.unescape(s));
\r
1053 sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]);
\r
1054 sets[SLOW_NOT].complement();
\r
1055 // Intermediate set: Test cloning of a frozen set.
\r
1056 UnicodeSet fast = new UnicodeSet(sets[SLOW]);
\r
1058 sets[FAST] = (UnicodeSet) fast.clone();
\r
1060 UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]);
\r
1062 sets[FAST_NOT] = (UnicodeSet) fastNot.clone();
\r
1065 for (j = 0; j < SET_COUNT; ++j) {
\r
1066 sets_with_str[j] = new UnicodeSetWithStrings(sets[j]);
\r
1069 testName = s + ':';
\r
1070 whichSpans[0] = SPAN_ALL;
\r
1071 whichSpansCount = 1;
\r
1072 } else if (s.charAt(0) == '-') {
\r
1073 whichSpans[0] = SPAN_ALL;
\r
1074 whichSpansCount = 1;
\r
1076 for (j = 1; j < s.length(); j++) {
\r
1077 switch (s.charAt(j)) {
\r
1079 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET,
\r
1080 SPAN_COMPLEMENT, 0);
\r
1083 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK,
\r
1087 // test CONTAINED FWD & BACK, and separately
\r
1088 // SIMPLE only FWD, and separately
\r
1089 // SIMPLE only BACK
\r
1090 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION),
\r
1091 SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE);
\r
1094 whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16,
\r
1098 errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i]));
\r
1102 } else if (s.equals("*")) {
\r
1103 testNameLimit = "bad_string";
\r
1104 for (j = 0; j < whichSpansCount; ++j) {
\r
1105 if (whichSpansCount > 1) {
\r
1106 testNameLimit += String.format("%%0x%3x", whichSpans[j]);
\r
1108 verifySpanUTF16String(sets_with_str, whichSpans[j], testName);
\r
1111 testNameLimit = "contents";
\r
1112 for (j = 0; j < whichSpansCount; ++j) {
\r
1113 if (whichSpansCount > 1) {
\r
1114 testNameLimit += String.format("%%0x%3x", whichSpans[j]);
\r
1116 verifySpanContents(sets_with_str, whichSpans[j], testName);
\r
1119 String string = Utility.unescape(s);
\r
1120 testNameLimit = "test_string";
\r
1121 for (j = 0; j < whichSpansCount; ++j) {
\r
1122 if (whichSpansCount > 1) {
\r
1123 testNameLimit += String.format("%%0x%3x", whichSpans[j]);
\r
1125 verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i);
\r