2 ******************************************************************************
\r
4 * Copyright (C) 2009-2010, International Business Machines
\r
5 * Corporation and others. All Rights Reserved.
\r
7 ******************************************************************************
\r
10 package com.ibm.icu.impl;
\r
12 import java.util.ArrayList;
\r
14 import com.ibm.icu.text.UnicodeSet;
\r
15 import com.ibm.icu.text.UnicodeSet.SpanCondition;
\r
18 * Implement span() etc. for a set with strings.
\r
19 * Avoid recursion because of its exponential complexity.
\r
20 * Instead, try multiple paths at once and track them with an IndexList.
\r
22 public class UnicodeSetStringSpan {
\r
25 * Which span() variant will be used? The object is either built for one variant and used once, or built for all and
\r
26 * may be used many times.
\r
28 public static final int FWD = 0x20;
\r
29 public static final int BACK = 0x10;
\r
30 public static final int UTF16 = 8;
\r
31 public static final int CONTAINED = 2;
\r
32 public static final int NOT_CONTAINED = 1;
\r
34 public static final int ALL = 0x3f;
\r
36 public static final int FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED;
\r
37 public static final int FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED;
\r
38 public static final int BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED;
\r
39 public static final int BACK_UTF16_NOT_CONTAINED = BACK | UTF16 | NOT_CONTAINED;
\r
41 // Special spanLength short values. (since Java has not unsigned byte type)
\r
42 // All code points in the string are contained in the parent set.
\r
43 static final short ALL_CP_CONTAINED = 0xff;
\r
44 // The spanLength is >=0xfe.
\r
45 static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
\r
47 // Set for span(). Same as parent but without strings.
\r
48 private UnicodeSet spanSet;
\r
50 // Set for span(not contained).
\r
51 // Same as spanSet, plus characters that start or end strings.
\r
52 private UnicodeSet spanNotSet;
\r
54 // The strings of the parent set.
\r
55 private ArrayList<String> strings;
\r
57 // the lengths of span(), spanBack() etc. for each string.
\r
58 private short[] spanLengths;
\r
60 // Maximum lengths of relevant strings.
\r
61 private int maxLength16;
\r
63 // Set up for all variants of span()?
\r
64 private boolean all;
\r
67 private OffsetList offsets;
\r
69 // Construct for all variants of span(), or only for any one variant.
\r
70 // Initialize as little as possible, for single use.
\r
71 public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
\r
72 spanSet = new UnicodeSet(0, 0x10ffff);
\r
73 strings = setStrings;
\r
74 all = (which == ALL);
\r
75 spanSet.retainAll(set);
\r
76 if (0 != (which & NOT_CONTAINED)) {
\r
77 // Default to the same sets.
\r
78 // addToSpanNotSet() will create a separate set if necessary.
\r
79 spanNotSet = spanSet;
\r
81 offsets = new OffsetList();
\r
83 // Determine if the strings even need to be taken into account at all for span() etc.
\r
84 // If any string is relevant, then all strings need to be used for
\r
85 // span(longest match) but only the relevant ones for span(while contained).
\r
86 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
\r
87 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
\r
88 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
\r
89 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
\r
90 int stringsLength = strings.size();
\r
93 boolean someRelevant = false;
\r
94 for (i = 0; i < stringsLength; ++i) {
\r
95 String string = strings.get(i);
\r
96 int length16 = string.length();
\r
97 spanLength = spanSet.span(string, SpanCondition.CONTAINED);
\r
98 if (spanLength < length16) { // Relevant string.
\r
99 someRelevant = true;
\r
101 if ((0 != (which & UTF16)) && length16 > maxLength16) {
\r
102 maxLength16 = length16;
\r
105 if (!someRelevant) {
\r
110 // Freeze after checking for the need to use strings at all because freezing
\r
111 // a set takes some time and memory which are wasted if there are no relevant strings.
\r
116 int spanBackLengthsOffset;
\r
118 // Allocate a block of meta data.
\r
121 // 2 sets of span lengths
\r
122 allocSize = stringsLength * (2);
\r
124 allocSize = stringsLength; // One set of span lengths.
\r
126 spanLengths = new short[allocSize];
\r
129 // Store span lengths for all span() variants.
\r
130 spanBackLengthsOffset = stringsLength;
\r
132 // Store span lengths for only one span() variant.
\r
133 spanBackLengthsOffset = 0;
\r
136 // Set the meta data and spanNotSet and write the UTF-8 strings.
\r
138 for (i = 0; i < stringsLength; ++i) {
\r
139 String string = strings.get(i);
\r
140 int length16 = string.length();
\r
141 spanLength = spanSet.span(string, SpanCondition.CONTAINED);
\r
142 if (spanLength < length16) { // Relevant string.
\r
143 if (0 != (which & UTF16)) {
\r
144 if (0 != (which & CONTAINED)) {
\r
145 if (0 != (which & FWD)) {
\r
146 spanLengths[i] = makeSpanLengthByte(spanLength);
\r
148 if (0 != (which & BACK)) {
\r
149 spanLength = length16
\r
150 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
\r
151 spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
\r
153 } else /* not CONTAINED, not all, but NOT_CONTAINED */{
\r
154 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
\r
158 if (0 != (which & NOT_CONTAINED)) {
\r
159 // Add string start and end code points to the spanNotSet so that
\r
160 // a span(while not contained) stops before any string.
\r
162 if (0 != (which & FWD)) {
\r
163 c = string.codePointAt(0);
\r
164 addToSpanNotSet(c);
\r
166 if (0 != (which & BACK)) {
\r
167 c = string.codePointBefore(length16);
\r
168 addToSpanNotSet(c);
\r
171 } else { // Irrelevant string.
\r
173 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
\r
175 // All spanXYZLengths pointers contain the same address.
\r
176 spanLengths[i] = ALL_CP_CONTAINED;
\r
183 spanNotSet.freeze();
\r
188 * Constructs a copy of an existing UnicodeSetStringSpan.
\r
189 * Assumes which==ALL for a frozen set.
\r
191 public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings) {
\r
192 spanSet = otherStringSpan.spanSet;
\r
193 strings = newParentSetStrings;
\r
194 maxLength16 = otherStringSpan.maxLength16;
\r
196 if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) {
\r
197 spanNotSet = spanSet;
\r
199 spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
\r
201 offsets = new OffsetList();
\r
203 spanLengths = otherStringSpan.spanLengths.clone();
\r
207 * Do the strings need to be checked in span() etc.?
\r
209 * @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance).
\r
211 public boolean needsStringSpanUTF16() {
\r
212 return (maxLength16 != 0);
\r
215 // For fast UnicodeSet::contains(c).
\r
216 public boolean contains(int c) {
\r
217 return spanSet.contains(c);
\r
220 // Add a starting or ending string character to the spanNotSet
\r
221 // so that a character span ends before any string.
\r
222 private void addToSpanNotSet(int c) {
\r
223 if (spanNotSet == null || spanNotSet == spanSet) {
\r
224 if (spanSet.contains(c)) {
\r
225 return; // Nothing to do.
\r
227 spanNotSet = spanSet.cloneAsThawed();
\r
233 * Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and
\r
234 * in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done
\r
235 * from the same start index.
\r
237 * For UTF-8, this would require a comparison function that returns UTF-16 order.
\r
239 * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
\r
240 * with strings have very few very short strings. For cases with many strings, it might be better to use a different
\r
241 * API and implementation with a DFA (state machine).
\r
245 * Algorithm for span(SpanCondition.CONTAINED)
\r
247 * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
\r
248 * is in the set, then remember to continue after it. + If a set string matches at the current position, then
\r
249 * remember to continue after it. + Either recursively span for each code point or string match, or recursively span
\r
250 * for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the
\r
251 * longest recursive span (the farthest end point). + If there is no match at the current position, neither for the
\r
252 * code point there nor for any set string, then stop and return the longest recursive span length.
\r
254 * Optimized implementation:
\r
256 * (We assume that most sets will have very few very short strings. A span using a string-less set is extremely
\r
259 * Create and cache a spanSet which contains all of the single code points of the original set but none of its
\r
262 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
\r
263 * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
\r
264 * a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings
\r
265 * that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED)
\r
266 * because the recursive algorithm would continue after them anyway and find the longest recursive match from their
\r
267 * end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after
\r
268 * spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after
\r
269 * spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set
\r
270 * strings from there. + If at least one more set string matched after a previous string match, then test if the
\r
271 * code point after the previous string match is also contained in the set. Continue the loop with the shortest end
\r
272 * point of either this code point or a matching set string. + If no more set string matched after a previous string
\r
273 * match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0,
\r
274 * otherwise continue the loop.
\r
276 * By noting each end point of a set string match, the function visits each string position at most once and
\r
277 * finishes in linear time.
\r
279 * The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes
\r
280 * in exponential time.
\r
284 * Algorithm for span(SIMPLE)
\r
286 * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
\r
287 * is in the set, then remember to continue after it. + If a set string matches at the current position, then
\r
288 * remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is
\r
289 * no match at the current position, then stop and return the current position.
\r
291 * Optimized implementation:
\r
293 * (Same assumption and spanSet as above.)
\r
295 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
\r
296 * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
\r
297 * a partial overlap because the standard algorithm would have tried to match them earlier. ~ Set strings that
\r
298 * entirely consist of set-contained code points must be matched with a full overlap because the longest-match
\r
299 * algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the
\r
300 * code point span because the standard algorithm would then have continued after the set string match anyway. ~
\r
301 * Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string
\r
302 * matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string
\r
303 * matched, then continue the loop after the longest match from the earliest position. + If no more set string
\r
304 * matched after a previous string match, then try another
\r
305 * spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the
\r
311 * @param s The string to be spanned
\r
312 * @param start The start index that the span begins
\r
313 * @param spanCondition The span condition
\r
314 * @return the length of the span
\r
317 public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) {
\r
318 if (spanCondition == SpanCondition.NOT_CONTAINED) {
\r
319 return spanNot(s, start, length);
\r
321 int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED);
\r
322 if (spanLength == length) {
\r
326 // Consider strings; they may overlap with the span.
\r
328 if (spanCondition == SpanCondition.CONTAINED) {
\r
329 // Use offset list to try all possibilities.
\r
330 initSize = maxLength16;
\r
332 offsets.setMaxLength(initSize);
\r
333 int pos = start + spanLength, rest = length - spanLength;
\r
334 int i, stringsLength = strings.size();
\r
336 if (spanCondition == SpanCondition.CONTAINED) {
\r
337 for (i = 0; i < stringsLength; ++i) {
\r
338 int overlap = spanLengths[i];
\r
339 if (overlap == ALL_CP_CONTAINED) {
\r
340 continue; // Irrelevant string.
\r
342 String string = strings.get(i);
\r
344 int length16 = string.length();
\r
346 // Try to match this string at pos-overlap..pos.
\r
347 if (overlap >= LONG_SPAN) {
\r
348 overlap = length16;
\r
349 // While contained: No point matching fully inside the code point span.
\r
350 overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
\r
353 if (overlap > spanLength) {
\r
354 overlap = spanLength;
\r
356 int inc = length16 - overlap; // Keep overlap+inc==length16.
\r
361 // Try to match if the increment is not listed already.
\r
362 if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
\r
364 return length; // Reached the end of the string.
\r
366 offsets.addOffset(inc);
\r
368 if (overlap == 0) {
\r
375 } else /* SIMPLE */{
\r
376 int maxInc = 0, maxOverlap = 0;
\r
377 for (i = 0; i < stringsLength; ++i) {
\r
378 int overlap = spanLengths[i];
\r
379 // For longest match, we do need to try to match even an all-contained string
\r
380 // to find the match from the earliest start.
\r
382 String string = strings.get(i);
\r
384 int length16 = string.length();
\r
386 // Try to match this string at pos-overlap..pos.
\r
387 if (overlap >= LONG_SPAN) {
\r
388 overlap = length16;
\r
389 // Longest match: Need to match fully inside the code point span
\r
390 // to find the match from the earliest start.
\r
392 if (overlap > spanLength) {
\r
393 overlap = spanLength;
\r
395 int inc = length16 - overlap; // Keep overlap+inc==length16.
\r
397 if (inc > rest || overlap < maxOverlap) {
\r
400 // Try to match if the string is longer or starts earlier.
\r
401 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
\r
402 && matches16CPB(s, pos - overlap, length, string, length16)) {
\r
403 maxInc = inc; // Longest match from earliest start.
\r
404 maxOverlap = overlap;
\r
412 if (maxInc != 0 || maxOverlap != 0) {
\r
413 // Longest-match algorithm, and there was a string match.
\r
414 // Simply continue after it.
\r
418 return length; // Reached the end of the string.
\r
420 spanLength = 0; // Match strings from after a string match.
\r
424 // Finished trying to match all strings at pos.
\r
426 if (spanLength != 0 || pos == 0) {
\r
427 // The position is after an unlimited code point span (spanLength!=0),
\r
428 // not after a string match.
\r
429 // The only position where spanLength==0 after a span is pos==0.
\r
430 // Otherwise, an unlimited code point span is only tried again when no
\r
431 // strings match, and if such a non-initial span fails we stop.
\r
432 if (offsets.isEmpty()) {
\r
433 return pos - start; // No strings matched after a span.
\r
435 // Match strings from after the next string match.
\r
437 // The position is after a string match (or a single code point).
\r
438 if (offsets.isEmpty()) {
\r
439 // No more strings matched after a previous string match.
\r
440 // Try another code point span from after the last string match.
\r
441 spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED);
\r
442 if (spanLength == rest || // Reached the end of the string, or
\r
443 spanLength == 0 // neither strings nor span progressed.
\r
445 return pos + spanLength - start;
\r
448 rest -= spanLength;
\r
449 continue; // spanLength>0: Match strings from after a span.
\r
451 // Try to match only one code point from after a string match if some
\r
452 // string matched beyond it, so that we try all possible positions
\r
453 // and don't overshoot.
\r
454 spanLength = spanOne(spanSet, s, pos, rest);
\r
455 if (spanLength > 0) {
\r
456 if (spanLength == rest) {
\r
457 return length; // Reached the end of the string.
\r
459 // Match strings after this code point.
\r
460 // There cannot be any increments below it because UnicodeSet strings
\r
461 // contain multiple code points.
\r
463 rest -= spanLength;
\r
464 offsets.shift(spanLength);
\r
466 continue; // Match strings from after a single code point.
\r
468 // Match strings from after the next string match.
\r
471 int minOffset = offsets.popMinimum();
\r
474 spanLength = 0; // Match strings from after a string match.
\r
479 * Span a string backwards.
\r
481 * @param s The string to be spanned
\r
482 * @param spanCondition The span condition
\r
483 * @return The string index which starts the span (i.e. inclusive).
\r
486 public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
\r
487 if (spanCondition == SpanCondition.NOT_CONTAINED) {
\r
488 return spanNotBack(s, length);
\r
490 int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
\r
494 int spanLength = length - pos;
\r
496 // Consider strings; they may overlap with the span.
\r
498 if (spanCondition == SpanCondition.CONTAINED) {
\r
499 // Use offset list to try all possibilities.
\r
500 initSize = maxLength16;
\r
502 offsets.setMaxLength(initSize);
\r
503 int i, stringsLength = strings.size();
\r
504 int spanBackLengthsOffset = 0;
\r
506 spanBackLengthsOffset = stringsLength;
\r
509 if (spanCondition == SpanCondition.CONTAINED) {
\r
510 for (i = 0; i < stringsLength; ++i) {
\r
511 int overlap = spanLengths[spanBackLengthsOffset + i];
\r
512 if (overlap == ALL_CP_CONTAINED) {
\r
513 continue; // Irrelevant string.
\r
515 String string = strings.get(i);
\r
517 int length16 = string.length();
\r
519 // Try to match this string at pos-(length16-overlap)..pos-length16.
\r
520 if (overlap >= LONG_SPAN) {
\r
521 overlap = length16;
\r
522 // While contained: No point matching fully inside the code point span.
\r
524 len1 = string.offsetByCodePoints(0, 1);
\r
525 overlap -= len1; // Length of the string minus the first code point.
\r
527 if (overlap > spanLength) {
\r
528 overlap = spanLength;
\r
530 int dec = length16 - overlap; // Keep dec+overlap==length16.
\r
535 // Try to match if the decrement is not listed already.
\r
536 if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
\r
538 return 0; // Reached the start of the string.
\r
540 offsets.addOffset(dec);
\r
542 if (overlap == 0) {
\r
549 } else /* SIMPLE */{
\r
550 int maxDec = 0, maxOverlap = 0;
\r
551 for (i = 0; i < stringsLength; ++i) {
\r
552 int overlap = spanLengths[spanBackLengthsOffset + i];
\r
553 // For longest match, we do need to try to match even an all-contained string
\r
554 // to find the match from the latest end.
\r
556 String string = strings.get(i);
\r
558 int length16 = string.length();
\r
560 // Try to match this string at pos-(length16-overlap)..pos-length16.
\r
561 if (overlap >= LONG_SPAN) {
\r
562 overlap = length16;
\r
563 // Longest match: Need to match fully inside the code point span
\r
564 // to find the match from the latest end.
\r
566 if (overlap > spanLength) {
\r
567 overlap = spanLength;
\r
569 int dec = length16 - overlap; // Keep dec+overlap==length16.
\r
571 if (dec > pos || overlap < maxOverlap) {
\r
574 // Try to match if the string is longer or ends later.
\r
575 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
\r
576 && matches16CPB(s, pos - dec, length, string, length16)) {
\r
577 maxDec = dec; // Longest match from latest end.
\r
578 maxOverlap = overlap;
\r
586 if (maxDec != 0 || maxOverlap != 0) {
\r
587 // Longest-match algorithm, and there was a string match.
\r
588 // Simply continue before it.
\r
591 return 0; // Reached the start of the string.
\r
593 spanLength = 0; // Match strings from before a string match.
\r
597 // Finished trying to match all strings at pos.
\r
599 if (spanLength != 0 || pos == length) {
\r
600 // The position is before an unlimited code point span (spanLength!=0),
\r
601 // not before a string match.
\r
602 // The only position where spanLength==0 before a span is pos==length.
\r
603 // Otherwise, an unlimited code point span is only tried again when no
\r
604 // strings match, and if such a non-initial span fails we stop.
\r
605 if (offsets.isEmpty()) {
\r
606 return pos; // No strings matched before a span.
\r
608 // Match strings from before the next string match.
\r
610 // The position is before a string match (or a single code point).
\r
611 if (offsets.isEmpty()) {
\r
612 // No more strings matched before a previous string match.
\r
613 // Try another code point span from before the last string match.
\r
615 pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
\r
616 spanLength = oldPos - pos;
\r
617 if (pos == 0 || // Reached the start of the string, or
\r
618 spanLength == 0 // neither strings nor span progressed.
\r
622 continue; // spanLength>0: Match strings from before a span.
\r
624 // Try to match only one code point from before a string match if some
\r
625 // string matched beyond it, so that we try all possible positions
\r
626 // and don't overshoot.
\r
627 spanLength = spanOneBack(spanSet, s, pos);
\r
628 if (spanLength > 0) {
\r
629 if (spanLength == pos) {
\r
630 return 0; // Reached the start of the string.
\r
632 // Match strings before this code point.
\r
633 // There cannot be any decrements below it because UnicodeSet strings
\r
634 // contain multiple code points.
\r
636 offsets.shift(spanLength);
\r
638 continue; // Match strings from before a single code point.
\r
640 // Match strings from before the next string match.
\r
643 pos -= offsets.popMinimum();
\r
644 spanLength = 0; // Match strings from before a string match.
\r
649 * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
\r
651 * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
\r
652 * is in the set, then return with the current position. + If a set string matches at the current position, then
\r
653 * return with the current position.
\r
655 * Optimized implementation:
\r
657 * (Same assumption as for span() above.)
\r
659 * Create and cache a spanNotSet which contains all of the single code points of the original set but none of its
\r
660 * strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for
\r
664 * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
\r
665 * + If the current code point is in the original set, then return the current position.
\r
666 * + If any set string matches at the current position, then return the current position.
\r
667 * + If there is no match at the current position, neither for the code point
\r
668 * there nor for any set string, then skip this code point and continue the loop. This happens for
\r
669 * set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set
\r
672 * @return the length of the span
\r
674 private int spanNot(CharSequence s, int start, int length) {
\r
675 int pos = start, rest = length;
\r
676 int i, stringsLength = strings.size();
\r
678 // Span until we find a code point from the set,
\r
679 // or a code point that starts or ends some string.
\r
680 i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED);
\r
682 return length; // Reached the end of the string.
\r
687 // Check whether the current code point is in the original set,
\r
688 // without the string starts and ends.
\r
689 int cpLength = spanOne(spanSet, s, pos, rest);
\r
690 if (cpLength > 0) {
\r
691 return pos - start; // There is a set element at pos.
\r
694 // Try to match the strings at pos.
\r
695 for (i = 0; i < stringsLength; ++i) {
\r
696 if (spanLengths[i] == ALL_CP_CONTAINED) {
\r
697 continue; // Irrelevant string.
\r
699 String string = strings.get(i);
\r
701 int length16 = string.length();
\r
702 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
\r
703 return pos - start; // There is a set element at pos.
\r
707 // The span(while not contained) ended on a string start/end which is
\r
708 // not in the original set. Skip this code point and continue.
\r
712 } while (rest != 0);
\r
713 return length; // Reached the end of the string.
\r
716 private int spanNotBack(CharSequence s, int length) {
\r
718 int i, stringsLength = strings.size();
\r
720 // Span until we find a code point from the set,
\r
721 // or a code point that starts or ends some string.
\r
722 pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
\r
724 return 0; // Reached the start of the string.
\r
727 // Check whether the current code point is in the original set,
\r
728 // without the string starts and ends.
\r
729 int cpLength = spanOneBack(spanSet, s, pos);
\r
730 if (cpLength > 0) {
\r
731 return pos; // There is a set element at pos.
\r
734 // Try to match the strings at pos.
\r
735 for (i = 0; i < stringsLength; ++i) {
\r
736 // Use spanLengths rather than a spanLengths pointer because
\r
737 // it is easier and we only need to know whether the string is irrelevant
\r
738 // which is the same in either array.
\r
739 if (spanLengths[i] == ALL_CP_CONTAINED) {
\r
740 continue; // Irrelevant string.
\r
742 String string = strings.get(i);
\r
744 int length16 = string.length();
\r
745 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
\r
746 return pos; // There is a set element at pos.
\r
750 // The span(while not contained) ended on a string start/end which is
\r
751 // not in the original set. Skip this code point and continue.
\r
754 } while (pos != 0);
\r
755 return 0; // Reached the start of the string.
\r
758 static short makeSpanLengthByte(int spanLength) {
\r
759 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
\r
760 return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
\r
763 // Compare strings without any argument checks. Requires length>0.
\r
764 private static boolean matches16(CharSequence s, int start, final String t, int length) {
\r
765 int end = start + length;
\r
766 while (length-- > 0) {
\r
767 if (s.charAt(--end) != t.charAt(length)) {
\r
775 * Compare 16-bit Unicode strings (which may be malformed UTF-16)
\r
776 * at code point boundaries.
\r
777 * That is, each edge of a match must not be in the middle of a surrogate pair.
\r
778 * @param start The start index of s.
\r
779 * @param slength The length of s from start.
\r
780 * @param tlength The length of t.
\r
782 static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) {
\r
783 return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
\r
784 com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0)))
\r
785 && !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) &&
\r
786 com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength)))
\r
787 && matches16(s, start, t, tlength);
\r
790 // Does the set contain the next code point?
\r
791 // If so, return its length; otherwise return its negative length.
\r
792 static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
\r
793 char c = s.charAt(start);
\r
794 if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
\r
795 char c2 = s.charAt(start + 1);
\r
796 if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
\r
797 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
\r
798 return set.contains(supplementary) ? 2 : -2;
\r
801 return set.contains(c) ? 1 : -1;
\r
804 static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
\r
805 char c = s.charAt(length - 1);
\r
806 if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
\r
807 char c2 = s.charAt(length - 2);
\r
808 if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
\r
809 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
\r
810 return set.contains(supplementary) ? 2 : -2;
\r
813 return set.contains(c) ? 1 : -1;
\r
818 * Helper class for UnicodeSetStringSpan.
\r
820 * List of offsets from the current position from where to try matching a code point or a string. Store offsets rather
\r
821 * than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in
\r
824 * Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that
\r
825 * is, there are normally no gaps of hundreds or thousands of offset values.
\r
827 * The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the
\r
828 * list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the
\r
831 * Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH
\r
832 * to account for "long" single code points.
\r
834 * Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single
\r
835 * integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when
\r
836 * lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings.
\r
838 static class OffsetList {
\r
839 private boolean[] list;
\r
840 private int length;
\r
843 public OffsetList() {
\r
844 list = new boolean[16]; // default size
\r
847 public void setMaxLength(int maxLength) {
\r
848 if (maxLength > list.length) {
\r
849 list = new boolean[maxLength];
\r
854 public void clear() {
\r
855 for (int i = list.length; i-- > 0;) {
\r
858 start = length = 0;
\r
861 public boolean isEmpty() {
\r
862 return (length == 0);
\r
865 // Reduce all stored offsets by delta, used when the current position
\r
867 // There must not be any offsets lower than delta.
\r
868 // If there is an offset equal to delta, it is removed.
\r
869 // delta=[1..maxLength]
\r
870 public void shift(int delta) {
\r
871 int i = start + delta;
\r
872 if (i >= list.length) {
\r
882 // Add an offset. The list must not contain it yet.
\r
883 // offset=[1..maxLength]
\r
884 public void addOffset(int offset) {
\r
885 int i = start + offset;
\r
886 if (i >= list.length) {
\r
893 // offset=[1..maxLength]
\r
894 public boolean containsOffset(int offset) {
\r
895 int i = start + offset;
\r
896 if (i >= list.length) {
\r
902 // Find the lowest stored offset from a non-empty list, remove it,
\r
903 // and reduce all other offsets by this minimum.
\r
904 // Returns [1..maxLength].
\r
905 public int popMinimum() {
\r
906 // Look for the next offset in list[start+1..list.length-1].
\r
907 int i = start, result;
\r
908 while (++i < list.length) {
\r
912 result = i - start;
\r
919 // Wrap around and look for the next offset in list[0..start].
\r
920 // Since the list is not empty, there will be one.
\r
921 result = list.length - start;
\r
929 return result += i;
\r