]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/lang/UnicodeSetStringSpanTest.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / main / tests / core / src / com / ibm / icu / dev / test / lang / UnicodeSetStringSpanTest.java
1 /*
2  *******************************************************************************
3  * Copyright (C) 2009-2011, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.dev.test.lang;
8
9 import com.ibm.icu.dev.test.TestFmwk;
10 import com.ibm.icu.impl.Utility;
11 import com.ibm.icu.text.UTF16;
12 import com.ibm.icu.text.UnicodeSet;
13 import com.ibm.icu.text.UnicodeSet.SpanCondition;
14 import com.ibm.icu.text.UnicodeSetIterator;
15
16 /**
17  * @test
18  * @summary General test of UnicodeSet string span.
19  */
20 public class UnicodeSetStringSpanTest extends TestFmwk {
21
22     public static void main(String[] args) throws Exception {
23         new UnicodeSetStringSpanTest().run(args);
24     }
25
26     // Simple test first, easier to debug.
27     public void TestSimpleStringSpan() {
28         String pattern = "[a{ab}{bc}]";
29         String string = "abc";
30         UnicodeSet set = new UnicodeSet(pattern);
31         set.complement();
32         int pos = set.spanBack(string, 3, SpanCondition.SIMPLE);
33         if (pos != 1) {
34             errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)",
35                     set.toString(), string, pos));
36         }
37         pos = set.span(string, SpanCondition.SIMPLE);
38         if (pos != 3) {
39             errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
40                     set.toString(), string, pos));
41         }
42         pos = set.span(string, 1, SpanCondition.SIMPLE);
43         if (pos != 3) {
44             errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
45                     set.toString(), string, pos));
46         }
47     }
48
49     // test our slow implementation
50     public void TestSimpleStringSpanSlow() {
51         String pattern = "[a{ab}{bc}]";
52         String string = "abc";
53         UnicodeSet uset = new UnicodeSet(pattern);
54         uset.complement();
55         UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
56
57         int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE);
58         if (length != 1) {
59             errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)",
60                     set.toString(), string, length));
61         }
62         length = containsSpanUTF16(set, string, SpanCondition.SIMPLE);
63         if (length != 3) {
64             errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)",
65                     set.toString(), string, length));
66         }
67         length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE);
68         if (length != 2) {
69             errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)",
70                     set.toString(), string, length));
71         }
72     }
73
74     // Test select patterns and strings, and test SIMPLE.
75     public void TestSimpleStringSpanAndFreeze() {
76         String pattern = "[x{xy}{xya}{axy}{ax}]";
77         final String string = "xx"
78                 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
79                 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
80                 + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa";
81
82         UnicodeSet set = new UnicodeSet(pattern);
83         if (set.containsAll(string)) {
84             errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE");
85         }
86
87         // Remove trailing "aaaa".
88         String string16 = string.substring(0, string.length() - 4);
89         if (!set.containsAll(string16)) {
90             errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE");
91         }
92
93         String s16 = "byayaxya";
94         if (   set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4
95             || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4
96             || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4
97             || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5
98             || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4
99             || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) {
100             errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value");
101         }
102
103         pattern = "[a{ab}{abc}{cd}]";
104         set.applyPattern(pattern);
105         s16 = "acdabcdabccd";
106         if (   set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
107             || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
108             || set.span(s16.substring(7),     SpanCondition.SIMPLE) != 5) {
109             errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
110         }
111         set.freeze();
112         if (   set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
113             || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
114             || set.span(s16.substring(7),     SpanCondition.SIMPLE) != 5) {
115             errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
116         }
117
118         pattern = "[d{cd}{bcd}{ab}]";
119         set = (UnicodeSet)set.cloneAsThawed();
120         set.applyPattern(pattern).freeze();
121         s16 = "abbcdabcdabd";
122         if (   set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0
123             || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6
124             || set.spanBack(s16,  5, SpanCondition.SIMPLE) != 0) {
125             errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value");
126         }
127     }
128
129     // more complex test. --------------------------------------------------------
130
131     // Make the strings in a UnicodeSet easily accessible.
132     static class UnicodeSetWithStrings {
133
134         private UnicodeSet set;
135
136         private String strings[];
137         private int stringsLength;
138         private boolean hasSurrogates;
139
140         public UnicodeSetWithStrings(final UnicodeSet normalSet) {
141             set = normalSet;
142             stringsLength = 0;
143             hasSurrogates = false;
144             strings = new String[20];
145             int size = set.size();
146             if (size > 0 && set.charAt(size - 1) < 0) {
147                 // If a set's last element is not a code point, then it must contain strings.
148                 // Iterate over the set, skip all code point ranges, and cache the strings.
149                 UnicodeSetIterator iter = new UnicodeSetIterator(set);
150                 while (iter.nextRange() && stringsLength < strings.length) {
151                     if (iter.codepoint == UnicodeSetIterator.IS_STRING) {
152                         // Store the pointer to the set's string element
153                         // which we happen to know is a stable pointer.
154                         strings[stringsLength] = iter.getString();
155                         ++stringsLength;
156                     }
157                 }
158             }
159         }
160
161         public final UnicodeSet getSet() {
162             return set;
163         }
164
165         public boolean hasStrings() {
166             return (stringsLength > 0);
167         }
168
169         public boolean hasStringsWithSurrogates() {
170             return hasSurrogates;
171         }
172
173     }
174
175     static class UnicodeSetWithStringsIterator {
176
177         private UnicodeSetWithStrings fSet;
178         private int nextStringIndex;
179
180         public UnicodeSetWithStringsIterator(final UnicodeSetWithStrings set) {
181             fSet = set;
182             nextStringIndex = 0;
183         }
184
185         public void reset() {
186             nextStringIndex = 0;
187         }
188
189         public final String nextString() {
190             if (nextStringIndex < fSet.stringsLength) {
191                 return fSet.strings[nextStringIndex++];
192             } else {
193                 return null;
194             }
195         }
196
197     }
198
199     // Compare 16-bit Unicode strings (which may be malformed UTF-16)
200     // at code point boundaries.
201     // That is, each edge of a match must not be in the middle of a surrogate pair.
202     static boolean matches16CPB(final String s, int start, int limit, final String t) {
203         limit -= start;
204         int length = t.length();
205         return t.equals(s.substring(start, start + length))
206                 && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
207                                   UTF16.isTrailSurrogate(s.charAt(start)))
208                 && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) &&
209                                        UTF16.isTrailSurrogate(s.charAt(start + length)));
210     }
211
212     // Implement span() with contains() for comparison.
213     static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s,
214             SpanCondition spanCondition) {
215         final UnicodeSet realSet = set.getSet();
216         int length = s.length();
217         if (!set.hasStrings()) {
218             boolean spanContained = false;
219             if (spanCondition != SpanCondition.NOT_CONTAINED) {
220                 spanContained = true; // Pin to 0/1 values.
221             }
222
223             int c;
224             int start = 0, prev;
225             while ((prev = start) < length) {
226                 c = s.codePointAt(start);
227                 start = s.offsetByCodePoints(start, 1);
228                 if (realSet.contains(c) != spanContained) {
229                     break;
230                 }
231             }
232             return prev;
233         } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
234             UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
235             int c;
236             int start, next;
237             for (start = next = 0; start < length;) {
238                 c = s.codePointAt(next);
239                 next = s.offsetByCodePoints(next, 1);
240                 if (realSet.contains(c)) {
241                     break;
242                 }
243                 String str;
244                 iter.reset();
245                 while ((str = iter.nextString()) != null) {
246                     if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
247                         // spanNeedsStrings=true;
248                         return start;
249                     }
250                 }
251                 start = next;
252             }
253             return start;
254         } else /* CONTAINED or SIMPLE */{
255             UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
256             int c;
257             int start, next, maxSpanLimit = 0;
258             for (start = next = 0; start < length;) {
259                 c = s.codePointAt(next);
260                 next = s.offsetByCodePoints(next, 1);
261                 if (!realSet.contains(c)) {
262                     next = start; // Do not span this single, not-contained code point.
263                 }
264                 String str;
265                 iter.reset();
266                 while ((str = iter.nextString()) != null) {
267                     if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
268                         // spanNeedsStrings=true;
269                         int matchLimit = start + str.length();
270                         if (matchLimit == length) {
271                             return length;
272                         }
273                         if (spanCondition == SpanCondition.CONTAINED) {
274                             // Iterate for the shortest match at each position.
275                             // Recurse for each but the shortest match.
276                             if (next == start) {
277                                 next = matchLimit; // First match from start.
278                             } else {
279                                 if (matchLimit < next) {
280                                     // Remember shortest match from start for iteration.
281                                     int temp = next;
282                                     next = matchLimit;
283                                     matchLimit = temp;
284                                 }
285                                 // Recurse for non-shortest match from start.
286                                 int spanLength = containsSpanUTF16(set, s.substring(matchLimit),
287                                         SpanCondition.CONTAINED);
288                                 if ((matchLimit + spanLength) > maxSpanLimit) {
289                                     maxSpanLimit = matchLimit + spanLength;
290                                     if (maxSpanLimit == length) {
291                                         return length;
292                                     }
293                                 }
294                             }
295                         } else /* spanCondition==SIMPLE */{
296                             if (matchLimit > next) {
297                                 // Remember longest match from start.
298                                 next = matchLimit;
299                             }
300                         }
301                     }
302                 }
303                 if (next == start) {
304                     break; // No match from start.
305                 }
306                 start = next;
307             }
308             if (start > maxSpanLimit) {
309                 return start;
310             } else {
311                 return maxSpanLimit;
312             }
313         }
314     }
315
316     static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length,
317             SpanCondition spanCondition) {
318         if (length == 0) {
319             return 0;
320         }
321         final UnicodeSet realSet = set.getSet();
322         if (!set.hasStrings()) {
323             boolean spanContained = false;
324             if (spanCondition != SpanCondition.NOT_CONTAINED) {
325                 spanContained = true; // Pin to 0/1 values.
326             }
327
328             int c;
329             int prev = length;
330             do {
331                 c = s.codePointBefore(prev);
332                 if (realSet.contains(c) != spanContained) {
333                     break;
334                 }
335                 prev = s.offsetByCodePoints(prev, -1);
336             } while (prev > 0);
337             return prev;
338         } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
339             UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
340             int c;
341             int prev = length, length0 = length;
342             do {
343                 c = s.codePointBefore(prev);
344                 if (realSet.contains(c)) {
345                     break;
346                 }
347                 String str;
348                 iter.reset();
349                 while ((str = iter.nextString()) != null) {
350                     if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
351                         // spanNeedsStrings=true;
352                         return prev;
353                     }
354                 }
355                 prev = s.offsetByCodePoints(prev, -1);
356             } while (prev > 0);
357             return prev;
358         } else /* SpanCondition.CONTAINED or SIMPLE */{
359             UnicodeSetWithStringsIterator iter = new UnicodeSetWithStringsIterator(set);
360             int c;
361             int prev = length, minSpanStart = length, length0 = length;
362             do {
363                 c = s.codePointBefore(length);
364                 length = s.offsetByCodePoints(length, -1);
365                 if (!realSet.contains(c)) {
366                     length = prev; // Do not span this single, not-contained code point.
367                 }
368                 String str;
369                 iter.reset();
370                 while ((str = iter.nextString()) != null) {
371                     if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
372                         // spanNeedsStrings=true;
373                         int matchStart = prev - str.length();
374                         if (matchStart == 0) {
375                             return 0;
376                         }
377                         if (spanCondition == SpanCondition.CONTAINED) {
378                             // Iterate for the shortest match at each position.
379                             // Recurse for each but the shortest match.
380                             if (length == prev) {
381                                 length = matchStart; // First match from prev.
382                             } else {
383                                 if (matchStart > length) {
384                                     // Remember shortest match from prev for iteration.
385                                     int temp = length;
386                                     length = matchStart;
387                                     matchStart = temp;
388                                 }
389                                 // Recurse for non-shortest match from prev.
390                                 int spanStart = containsSpanBackUTF16(set, s, matchStart,
391                                         SpanCondition.CONTAINED);
392                                 if (spanStart < minSpanStart) {
393                                     minSpanStart = spanStart;
394                                     if (minSpanStart == 0) {
395                                         return 0;
396                                     }
397                                 }
398                             }
399                         } else /* spanCondition==SIMPLE */{
400                             if (matchStart < length) {
401                                 // Remember longest match from prev.
402                                 length = matchStart;
403                             }
404                         }
405                     }
406                 }
407                 if (length == prev) {
408                     break; // No match from prev.
409                 }
410             } while ((prev = length) > 0);
411             if (prev < minSpanStart) {
412                 return prev;
413             } else {
414                 return minSpanStart;
415             }
416         }
417     }
418
419     // spans to be performed and compared
420     static final int SPAN_UTF16 = 1;
421     static final int SPAN_UTF8 = 2;
422     static final int SPAN_UTFS = 3;
423
424     static final int SPAN_SET = 4;
425     static final int SPAN_COMPLEMENT = 8;
426     static final int SPAN_POLARITY = 0xc;
427
428     static final int SPAN_FWD = 0x10;
429     static final int SPAN_BACK = 0x20;
430     static final int SPAN_DIRS = 0x30;
431
432     static final int SPAN_CONTAINED = 0x100;
433     static final int SPAN_SIMPLE = 0x200;
434     static final int SPAN_CONDITION = 0x300;
435
436     static final int SPAN_ALL = 0x33f;
437
438     static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) {
439         return spanCondition == SpanCondition.NOT_CONTAINED ? contained
440                 : SpanCondition.NOT_CONTAINED;
441     }
442
443     /*
444      * Count spans on a string with the method according to type and set the span limits. The set may be the complement
445      * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack()
446      * according to the expected number of spans. Sets typeName to an empty string if there is no such type. Returns -1
447      * if the span option is filtered out.
448      */
449     static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s,
450             int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity,
451             int expectCount) {
452         final UnicodeSet realSet = set.getSet();
453         int start, count, i;
454         SpanCondition spanCondition, firstSpanCondition, contained;
455         boolean isForward;
456
457         int length = s.length();
458         if (type < 0 || 7 < type) {
459             typeName[0] = null;
460             return 0;
461         }
462
463         final String typeNames16[] = {
464                 "contains",
465                 "contains(LM)",
466                 "span",
467                 "span(LM)",
468                 "containsBack",
469                 "containsBack(LM)",
470                 "spanBack",
471                 "spanBack(LM)" };
472
473         typeName[0] = typeNames16[type];
474
475         // filter span options
476         if (type <= 3) {
477             // span forward
478             if ((whichSpans & SPAN_FWD) == 0) {
479                 return -1;
480             }
481             isForward = true;
482         } else {
483             // span backward
484             if ((whichSpans & SPAN_BACK) == 0) {
485                 return -1;
486             }
487             isForward = false;
488         }
489         if ((type & 1) == 0) {
490             // use SpanCondition.CONTAINED
491             if ((whichSpans & SPAN_CONTAINED) == 0) {
492                 return -1;
493             }
494             contained = SpanCondition.CONTAINED;
495         } else {
496             // use SIMPLE
497             if ((whichSpans & SPAN_SIMPLE) == 0) {
498                 return -1;
499             }
500             contained = SpanCondition.SIMPLE;
501         }
502
503         // Default first span condition for going forward with an uncomplemented set.
504         spanCondition = SpanCondition.NOT_CONTAINED;
505         if (isComplement) {
506             spanCondition = invertSpanCondition(spanCondition, contained);
507         }
508
509         // First span condition for span(), used to terminate the spanBack() iteration.
510         firstSpanCondition = spanCondition;
511
512         // spanBack(): Its initial span condition is span()'s last span condition,
513         // which is the opposite of span()'s first span condition
514         // if we expect an even number of spans.
515         // (The loop inverts spanCondition (expectCount-1) times
516         // before the expectCount'th span() call.)
517         // If we do not compare forward and backward directions, then we do not have an
518         // expectCount and just start with firstSpanCondition.
519         if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) {
520             spanCondition = invertSpanCondition(spanCondition, contained);
521         }
522
523         count = 0;
524         switch (type) {
525         case 0:
526         case 1:
527             start = 0;
528             for (;;) {
529                 start += containsSpanUTF16(set, s.substring(start), spanCondition);
530                 if (count < limitsCapacity) {
531                     limits[count] = start;
532                 }
533                 ++count;
534                 if (start >= length) {
535                     break;
536                 }
537                 spanCondition = invertSpanCondition(spanCondition, contained);
538             }
539             break;
540         case 2:
541         case 3:
542             start = 0;
543             for (;;) {
544                 start = realSet.span(s, start, spanCondition);
545                 if (count < limitsCapacity) {
546                     limits[count] = start;
547                 }
548                 ++count;
549                 if (start >= length) {
550                     break;
551                 }
552                 spanCondition = invertSpanCondition(spanCondition, contained);
553             }
554             break;
555         case 4:
556         case 5:
557             for (;;) {
558                 ++count;
559                 if (count <= limitsCapacity) {
560                     limits[limitsCapacity - count] = length;
561                 }
562                 length = containsSpanBackUTF16(set, s, length, spanCondition);
563                 if (length == 0 && spanCondition == firstSpanCondition) {
564                     break;
565                 }
566                 spanCondition = invertSpanCondition(spanCondition, contained);
567             }
568             if (count < limitsCapacity) {
569                 for (i = count; i-- > 0;) {
570                     limits[i] = limits[limitsCapacity - count + i];
571                 }
572             }
573             break;
574         case 6:
575         case 7:
576             for (;;) {
577                 ++count;
578                 if (count <= limitsCapacity) {
579                     limits[limitsCapacity - count] = length >= 0 ? length : s.length();
580                 }
581                 length = realSet.spanBack(s, length, spanCondition);
582                 if (length == 0 && spanCondition == firstSpanCondition) {
583                     break;
584                 }
585                 spanCondition = invertSpanCondition(spanCondition, contained);
586             }
587             if (count < limitsCapacity) {
588                 for (i = count; i-- > 0;) {
589                     limits[i] = limits[limitsCapacity - count + i];
590                 }
591             }
592             break;
593         default:
594             typeName = null;
595             return -1;
596         }
597
598         return count;
599     }
600
601     // sets to be tested; odd index=isComplement
602     static final int SLOW = 0;
603     static final int SLOW_NOT = 1;
604     static final int FAST = 2;
605     static final int FAST_NOT = 3;
606     static final int SET_COUNT = 4;
607
608     static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" };
609
610     /*
611      * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen
612      * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly).
613      * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition).
614      * 
615      * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an
616      * input expectCount<0).
617      */
618     void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
619             int expectLimits[], int expectCount, // TODO
620             final String testName, int index) {
621         int[] limits = new int[500];
622         int limitsCount;
623         int i, j;
624         String[] typeName = new String[1];
625         int type;
626
627         for (i = 0; i < SET_COUNT; ++i) {
628             if ((i & 1) == 0) {
629                 // Even-numbered sets are original, uncomplemented sets.
630                 if ((whichSpans & SPAN_SET) == 0) {
631                     continue;
632                 }
633             } else {
634                 // Odd-numbered sets are complemented.
635                 if ((whichSpans & SPAN_COMPLEMENT) == 0) {
636                     continue;
637                 }
638             }
639             for (type = 0;; ++type) {
640                 limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits,
641                         limits.length, expectCount);
642                 if (typeName[0] == null) {
643                     break; // All types tried.
644                 }
645                 if (limitsCount < 0) {
646                     continue; // Span option filtered out.
647                 }
648                 if (expectCount < 0) {
649                     expectCount = limitsCount;
650                     if (limitsCount > limits.length) {
651                         errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans",
652                                 testName, index, setNames[i], typeName[0], limitsCount, limits.length));
653                         return;
654                     }
655                     for (j = limitsCount; j-- > 0;) {
656                         expectLimits[j] = limits[j];
657                     }
658                 } else if (limitsCount != expectCount) {
659                     errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i],
660                             typeName[0], limitsCount, expectCount));
661                 } else {
662                     for (j = 0; j < limitsCount; ++j) {
663                         if (limits[j] != expectLimits[j]) {
664                             errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName,
665                                     index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j]));
666                             break;
667                         }
668                     }
669                 }
670             }
671         }
672
673         // Compare span() with containsAll()/containsNone(),
674         // but only if we have expectLimits[] from the uncomplemented set.
675         if ((whichSpans & SPAN_SET) != 0) {
676             final String s16 = s;
677             String string;
678             int prev = 0, limit, len;
679             for (i = 0; i < expectCount; ++i) {
680                 limit = expectLimits[i];
681                 len = limit - prev;
682                 if (len > 0) {
683                     string = s16.substring(prev, prev + len); // read-only alias
684                     if (0 != (i & 1)) {
685                         if (!sets[SLOW].getSet().containsAll(string)) {
686                             errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
687                                     testName, index, setNames[SLOW], prev, limit));
688                             return;
689                         }
690                         if (!sets[FAST].getSet().containsAll(string)) {
691                             errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
692                                     testName, index, setNames[FAST], prev, limit));
693                             return;
694                         }
695                     } else {
696                         if (!sets[SLOW].getSet().containsNone(string)) {
697                             errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
698                                     testName, index, setNames[SLOW], prev, limit));
699                             return;
700                         }
701                         if (!sets[FAST].getSet().containsNone(string)) {
702                             errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
703                                     testName, index, setNames[FAST], prev, limit));
704                             return;
705                         }
706                     }
707                 }
708                 prev = limit;
709             }
710         }
711     }
712
713     // Specifically test either UTF-16 or UTF-8.
714     void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
715             final String testName, int index) {
716         int[] expectLimits = new int[500];
717         int expectCount = -1;
718         verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index);
719     }
720
721     // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
722     // unless either UTF is turned off in whichSpans.
723     // Testing UTF-16 and UTF-8 together requires that surrogate code points
724     // have the same contains(c) value as U+FFFD.
725     void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans,
726             final String testName, int index) {
727         int[] expectLimits = new int[500];
728         int expectCount;
729
730         expectCount = -1; // Get expectLimits[] from verifySpan().
731
732         if ((whichSpans & SPAN_UTF16) != 0) {
733             verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index);
734         }
735     }
736
737     static int nextCodePoint(int c) {
738         // Skip some large and boring ranges.
739         switch (c) {
740         case 0x3441:
741             return 0x4d7f;
742         case 0x5100:
743             return 0x9f00;
744         case 0xb040:
745             return 0xd780;
746         case 0xe041:
747             return 0xf8fe;
748         case 0x10100:
749             return 0x20000;
750         case 0x20041:
751             return 0xe0000;
752         case 0xe0101:
753             return 0x10fffd;
754         default:
755             return c + 1;
756         }
757     }
758
759     // Verify that all implementations represent the same set.
760     void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
761         StringBuffer s = new StringBuffer();
762         int localWhichSpans;
763         int c, first;
764         for (first = c = 0;; c = nextCodePoint(c)) {
765             if (c > 0x10ffff || s.length() > 1024) {
766                 localWhichSpans = whichSpans;
767                 verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first);
768                 if (c > 0x10ffff) {
769                     break;
770                 }
771                 s.delete(0, s.length());
772                 first = c;
773             }
774             UTF16.append(s, c);
775         }
776     }
777
778     // Test with a particular, interesting string.
779     // Specify length and try NUL-termination.
780     static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space
781             0x3b1, 0x3b2, 0x3b3, // Greek
782             0xd900, // lead surrogate
783             0x3000, 0x30ab, 0x30ad, // wide space, Katakana
784             0xdc05, // trail surrogate
785             0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
786             0xd900, 0xdc05, // unassigned supplementary
787             0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
788             0xd7a4, 0xdc05, 0xd900, 0x2028  // unassigned, surrogates in wrong order, LS
789     };
790     static String interestingString = new String(interestingStringChars);
791     static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]";
792
793     public void TestInterestingStringSpan() {
794         UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1));
795         SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
796         int expect = 2;
797         int start = 14;
798
799         int c = 0xd840;
800         boolean contains = uset.contains(c);
801         if (false != contains) {
802             errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)",
803                   c));
804         }
805
806         UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
807         int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition);
808         if (expect != len) {
809             errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)",
810                   interestingString, start, len, expect));
811         }
812
813         len = uset.span(interestingString, start, spanCondition) - start;
814         if (expect != len) {
815             errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)",
816                   interestingString, start, len, expect));
817         }
818     }
819
820     void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
821         if ((whichSpans & SPAN_UTF16) == 0) {
822             return;
823         }
824         verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1);
825     }
826
827     // Take a set of span options and multiply them so that
828     // each portion only has one of the options a, b and c.
829     // If b==0, then the set of options is just modified with mask and a.
830     // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
831     static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) {
832         int s;
833         int i;
834
835         for (i = 0; i < whichSpansCount; ++i) {
836             s = whichSpans[i] & mask;
837             whichSpans[i] = s | a;
838             if (b != 0) {
839                 whichSpans[whichSpansCount + i] = s | b;
840                 if (c != 0) {
841                     whichSpans[2 * whichSpansCount + i] = s | c;
842                 }
843             }
844         }
845         return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount;
846     }
847
848     // They are not representable in UTF-8, and a leading trail surrogate
849     // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
850     // U+20001 == \\uD840\\uDC01
851     // U+20400 == \\uD841\\uDC00
852     static final String patternWithUnpairedSurrogate =
853         "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]";
854     static final String stringWithUnpairedSurrogate =
855         "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb";
856
857     static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
858     static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
859     static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
860     static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
861     static final String longPattern =
862         "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]";
863
864     public void TestStringWithUnpairedSurrogateSpan() {
865         String string = Utility.unescape(stringWithUnpairedSurrogate);
866         UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate));
867         SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
868         int start = 17;
869         int expect = 5;
870
871         UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
872         int len = containsSpanUTF16(set, string.substring(start), spanCondition);
873         if (expect != len) {
874             errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)",
875                   string, start, len, expect));
876         }
877
878         len = uset.span(string, start, spanCondition) - start;
879         if (expect != len) {
880             errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)",
881                   string, start, len, expect));
882         }
883     }
884
885     public void TestSpan() {
886         // "[...]" is a UnicodeSet pattern.
887         // "*" performs tests on all Unicode code points and on a selection of
888         // malformed UTF-8/16 strings.
889         // "-options" limits the scope of testing for the current set.
890         // By default, the test verifies that equivalent boundaries are found
891         // for UTF-16 and UTF-8, going forward and backward,
892         // alternating NOT_CONTAINED with
893         // either CONTAINED or SIMPLE.
894         // Single-character options:
895         // 8 -- UTF-16 and UTF-8 boundaries may differ.
896         // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
897         // or the set contains strings with unpaired surrogates
898         // which do not translate to valid UTF-8.
899         // c -- set.span() and set.complement().span() boundaries may differ.
900         // Cause: Set strings are not complemented.
901         // b -- span() and spanBack() boundaries may differ.
902         // Cause: Strings in the set overlap, and spanBack(CONTAINED)
903         // and spanBack(SIMPLE) are defined to
904         // match with non-overlapping substrings.
905         // For example, with a set containing "ab" and "ba",
906         // span() of "aba" yields boundaries { 0, 2, 3 }
907         // because the initial "ab" matches from 0 to 2,
908         // while spanBack() yields boundaries { 0, 1, 3 }
909         // because the final "ba" matches from 1 to 3.
910         // l -- CONTAINED and SIMPLE boundaries may differ.
911         // Cause: Strings in the set overlap, and a longer match may
912         // require a sequence including non-longest substrings.
913         // For example, with a set containing "ab", "abc" and "cd",
914         // span(contained) of "abcd" spans the entire string
915         // but span(longest match) only spans the first 3 characters.
916         // Each "-options" first resets all options and then applies the specified options.
917         // A "-" without options resets the options.
918         // The options are also reset for each new set.
919         // Other strings will be spanned.
920         final String testdata[] = {
921                 "[:ID_Continue:]",
922                 "*",
923                 "[:White_Space:]",
924                 "*",
925                 "[]",
926                 "*",
927                 "[\\u0000-\\U0010FFFF]",
928                 "*",
929                 "[\\u0000\\u0080\\u0800\\U00010000]",
930                 "*",
931                 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
932                 "*",
933                 unicodeSet1,
934                 "-c",
935                 "*",
936                 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
937                 "-c",
938                 "*",
939
940                 // Overlapping strings cause overlapping attempts to match.
941                 "[x{xy}{xya}{axy}{ax}]",
942                 "-cl",
943
944                 // More repetitions of "xya" would take too long with the recursive
945                 // reference implementation.
946                 // containsAll()=false
947                 // test_string 0x14
948                 "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here.
949                         "xx" + // set.complement().span(contained) will stop between the two 'x'es.
950                         "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here.
951                         "aaa",
952
953                 // containsAll()=true
954                 // test_string 0x15
955                 "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy",
956
957                 "-bc",
958                 // test_string 0x17
959                 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
960                 "-c",
961                 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
962                 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
963                 "-",
964                 "byaya", // span() -> { 5 }
965                 "byay", // span() -> { 4 }
966                 "bya", // span() -> { 3 }
967
968                 // span(longest match) will not span the whole string.
969                 "[a{ab}{bc}]",
970                 "-cl",
971                 // test_string 0x21
972                 "abc",
973
974                 "[a{ab}{abc}{cd}]",
975                 "-cl",
976                 "acdabcdabccd",
977
978                 // spanBack(longest match) will not span the whole string.
979                 "[c{ab}{bc}]",
980                 "-cl",
981                 "abc",
982
983                 "[d{cd}{bcd}{ab}]",
984                 "-cl",
985                 "abbcdabcdabd",
986
987                 // Test with non-ASCII set strings - test proper handling of surrogate pairs
988                 // and UTF-8 trail bytes.
989                 // Copies of above test sets and strings, but transliterated to have
990                 // different code points with similar trail units.
991                 // Previous: a b c d
992                 // Unicode: 042B 30AB 200AB 204AB
993                 // UTF-16: 042B 30AB D840 DCAB D841 DCAB
994                 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
995                 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
996                 "-cl",
997                 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
998
999                 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
1000                 "-cl",
1001                 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
1002
1003                 // Stress bookkeeping and recursion.
1004                 // The following strings are barely doable with the recursive
1005                 // reference implementation.
1006                 // The not-contained character at the end prevents an early exit from the span().
1007                 "[b{bb}]",
1008                 "-c",
1009                 // test_string 0x33
1010                 "bbbbbbbbbbbbbbbbbbbbbbbb-",
1011                 // On complement sets, span() and spanBack() get different results
1012                 // because b is not in the complement set and there is an odd number of b's
1013                 // in the test string.
1014                 "-bc",
1015                 "bbbbbbbbbbbbbbbbbbbbbbbbb-",
1016
1017                 // Test with set strings with an initial or final code point span
1018                 // longer than 254.
1019                 longPattern,
1020                 "-c",
1021                 _64_a + _64_a + _64_a + _63_a + "b",
1022                 _64_a + _64_a + _64_a + _64_a + "b",
1023                 _64_a + _64_a + _64_a + _64_a + "aaaabbbb",
1024                 "a" + _64_b + _64_b + _64_b + _63_b,
1025                 "a" + _64_b + _64_b + _64_b + _64_b,
1026                 "aaaabbbb" + _64_b + _64_b + _64_b + _64_b,
1027
1028                 // Test with strings containing unpaired surrogates.
1029                 patternWithUnpairedSurrogate, "-8cl",
1030                 stringWithUnpairedSurrogate };
1031         int i, j;
1032         int whichSpansCount = 1;
1033         int[] whichSpans = new int[96];
1034         for (i = whichSpans.length; i-- > 0;) {
1035             whichSpans[i] = SPAN_ALL;
1036         }
1037
1038         UnicodeSet[] sets = new UnicodeSet[SET_COUNT];
1039         UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT];
1040
1041         String testName = null;
1042         @SuppressWarnings("unused")
1043         String testNameLimit;
1044
1045         for (i = 0; i < testdata.length; ++i) {
1046             final String s = testdata[i];
1047             if (s.charAt(0) == '[') {
1048                 // Create new test sets from this pattern.
1049                 for (j = 0; j < SET_COUNT; ++j) {
1050                     sets_with_str[j] = null;
1051                     sets[j] = null;
1052                 }
1053                 sets[SLOW] = new UnicodeSet(Utility.unescape(s));
1054                 sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]);
1055                 sets[SLOW_NOT].complement();
1056                 // Intermediate set: Test cloning of a frozen set.
1057                 UnicodeSet fast = new UnicodeSet(sets[SLOW]);
1058                 fast.freeze();
1059                 sets[FAST] = (UnicodeSet) fast.clone();
1060                 fast = null;
1061                 UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]);
1062                 fastNot.freeze();
1063                 sets[FAST_NOT] = (UnicodeSet) fastNot.clone();
1064                 fastNot = null;
1065
1066                 for (j = 0; j < SET_COUNT; ++j) {
1067                     sets_with_str[j] = new UnicodeSetWithStrings(sets[j]);
1068                 }
1069
1070                 testName = s + ':';
1071                 whichSpans[0] = SPAN_ALL;
1072                 whichSpansCount = 1;
1073             } else if (s.charAt(0) == '-') {
1074                 whichSpans[0] = SPAN_ALL;
1075                 whichSpansCount = 1;
1076
1077                 for (j = 1; j < s.length(); j++) {
1078                     switch (s.charAt(j)) {
1079                     case 'c':
1080                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET,
1081                                 SPAN_COMPLEMENT, 0);
1082                         break;
1083                     case 'b':
1084                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK,
1085                                 0);
1086                         break;
1087                     case 'l':
1088                         // test CONTAINED FWD & BACK, and separately
1089                         // SIMPLE only FWD, and separately
1090                         // SIMPLE only BACK
1091                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION),
1092                                 SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE);
1093                         break;
1094                     case '8':
1095                         whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16,
1096                                 SPAN_UTF8, 0);
1097                         break;
1098                     default:
1099                         errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i]));
1100                         break;
1101                     }
1102                 }
1103             } else if (s.equals("*")) {
1104                 testNameLimit = "bad_string";
1105                 for (j = 0; j < whichSpansCount; ++j) {
1106                     if (whichSpansCount > 1) {
1107                         testNameLimit += String.format("%%0x%3x", whichSpans[j]);
1108                     }
1109                     verifySpanUTF16String(sets_with_str, whichSpans[j], testName);
1110                 }
1111
1112                 testNameLimit = "contents";
1113                 for (j = 0; j < whichSpansCount; ++j) {
1114                     if (whichSpansCount > 1) {
1115                         testNameLimit += String.format("%%0x%3x", whichSpans[j]);
1116                     }
1117                     verifySpanContents(sets_with_str, whichSpans[j], testName);
1118                 }
1119             } else {
1120                 String string = Utility.unescape(s);
1121                 testNameLimit = "test_string";
1122                 for (j = 0; j < whichSpansCount; ++j) {
1123                     if (whichSpansCount > 1) {
1124                         testNameLimit += String.format("%%0x%3x", whichSpans[j]);
1125                     }
1126                     verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i);
1127                 }
1128             }
1129         }
1130     }
1131
1132 }