jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/impl/UnicodeSetStringSpan.java

   1 /*\r
   2  ******************************************************************************\r
   3  *\r
   4  *   Copyright (C) 2009-2010, International Business Machines\r
   5  *   Corporation and others.  All Rights Reserved.\r
   6  *\r
   7  ******************************************************************************\r
   8  */\r
   9 \r
  10 package com.ibm.icu.impl;\r
  11 \r
  12 import java.util.ArrayList;\r
  13 \r
  14 import com.ibm.icu.text.UnicodeSet;\r
  15 import com.ibm.icu.text.UnicodeSet.SpanCondition;\r
  16 \r
  17 /*\r
  18  * Implement span() etc. for a set with strings.\r
  19  * Avoid recursion because of its exponential complexity.\r
  20  * Instead, try multiple paths at once and track them with an IndexList.\r
  21  */\r
  22 public class UnicodeSetStringSpan {\r
  23 \r
  24     /*\r
  25      * Which span() variant will be used? The object is either built for one variant and used once, or built for all and\r
  26      * may be used many times.\r
  27      */\r
  28     public static final int FWD           = 0x20;\r
  29     public static final int BACK          = 0x10;\r
  30     public static final int UTF16         = 8;\r
  31     public static final int CONTAINED     = 2;\r
  32     public static final int NOT_CONTAINED = 1;\r
  33 \r
  34     public static final int ALL = 0x3f;\r
  35 \r
  36     public static final int FWD_UTF16_CONTAINED      = FWD  | UTF16 |     CONTAINED;\r
  37     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | UTF16 | NOT_CONTAINED;\r
  38     public static final int BACK_UTF16_CONTAINED     = BACK | UTF16 |     CONTAINED;\r
  39     public static final int BACK_UTF16_NOT_CONTAINED = BACK | UTF16 | NOT_CONTAINED;\r
  40 \r
  41     // Special spanLength short values. (since Java has not unsigned byte type)\r
  42     // All code points in the string are contained in the parent set.\r
  43     static final short ALL_CP_CONTAINED = 0xff;\r
  44     // The spanLength is >=0xfe.\r
  45     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;\r
  46 \r
  47     // Set for span(). Same as parent but without strings.\r
  48     private UnicodeSet spanSet;\r
  49 \r
  50     // Set for span(not contained).\r
  51     // Same as spanSet, plus characters that start or end strings.\r
  52     private UnicodeSet spanNotSet;\r
  53 \r
  54     // The strings of the parent set.\r
  55     private ArrayList<String> strings;\r
  56 \r
  57     // the lengths of span(), spanBack() etc. for each string.\r
  58     private short[] spanLengths;\r
  59 \r
  60     // Maximum lengths of relevant strings.\r
  61     private int maxLength16;\r
  62 \r
  63     // Set up for all variants of span()?\r
  64     private boolean all;\r
  65 \r
  66     // Span helper\r
  67     private OffsetList offsets;\r
  68 \r
  69     // Construct for all variants of span(), or only for any one variant.\r
  70     // Initialize as little as possible, for single use.\r
  71     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {\r
  72         spanSet = new UnicodeSet(0, 0x10ffff);\r
  73         strings = setStrings;\r
  74         all = (which == ALL);\r
  75         spanSet.retainAll(set);\r
  76         if (0 != (which & NOT_CONTAINED)) {\r
  77             // Default to the same sets.\r
  78             // addToSpanNotSet() will create a separate set if necessary.\r
  79             spanNotSet = spanSet;\r
  80         }\r
  81         offsets = new OffsetList();\r
  82 \r
  83         // Determine if the strings even need to be taken into account at all for span() etc.\r
  84         // If any string is relevant, then all strings need to be used for\r
  85         // span(longest match) but only the relevant ones for span(while contained).\r
  86         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH\r
  87         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.\r
  88         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)\r
  89         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.\r
  90         int stringsLength = strings.size();\r
  91 \r
  92         int i, spanLength;\r
  93         boolean someRelevant = false;\r
  94         for (i = 0; i < stringsLength; ++i) {\r
  95             String string = strings.get(i);\r
  96             int length16 = string.length();\r
  97             spanLength = spanSet.span(string, SpanCondition.CONTAINED);\r
  98             if (spanLength < length16) { // Relevant string.\r
  99                 someRelevant = true;\r
 100             }\r
 101             if ((0 != (which & UTF16)) && length16 > maxLength16) {\r
 102                 maxLength16 = length16;\r
 103             }\r
 104         }\r
 105         if (!someRelevant) {\r
 106             maxLength16 = 0;\r
 107             return;\r
 108         }\r
 109 \r
 110         // Freeze after checking for the need to use strings at all because freezing\r
 111         // a set takes some time and memory which are wasted if there are no relevant strings.\r
 112         if (all) {\r
 113             spanSet.freeze();\r
 114         }\r
 115 \r
 116         int spanBackLengthsOffset;\r
 117 \r
 118         // Allocate a block of meta data.\r
 119         int allocSize;\r
 120         if (all) {\r
 121             // 2 sets of span lengths\r
 122             allocSize = stringsLength * (2);\r
 123         } else {\r
 124             allocSize = stringsLength; // One set of span lengths.\r
 125         }\r
 126         spanLengths = new short[allocSize];\r
 127 \r
 128         if (all) {\r
 129             // Store span lengths for all span() variants.\r
 130             spanBackLengthsOffset = stringsLength;\r
 131         } else {\r
 132             // Store span lengths for only one span() variant.\r
 133             spanBackLengthsOffset = 0;\r
 134         }\r
 135 \r
 136         // Set the meta data and spanNotSet and write the UTF-8 strings.\r
 137 \r
 138         for (i = 0; i < stringsLength; ++i) {\r
 139             String string = strings.get(i);\r
 140             int length16 = string.length();\r
 141             spanLength = spanSet.span(string, SpanCondition.CONTAINED);\r
 142             if (spanLength < length16) { // Relevant string.\r
 143                 if (0 != (which & UTF16)) {\r
 144                     if (0 != (which & CONTAINED)) {\r
 145                         if (0 != (which & FWD)) {\r
 146                             spanLengths[i] = makeSpanLengthByte(spanLength);\r
 147                         }\r
 148                         if (0 != (which & BACK)) {\r
 149                             spanLength = length16\r
 150                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);\r
 151                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);\r
 152                         }\r
 153                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{\r
 154                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant\r
 155                                                                                      // flag.\r
 156                     }\r
 157                 }\r
 158                 if (0 != (which & NOT_CONTAINED)) {\r
 159                     // Add string start and end code points to the spanNotSet so that\r
 160                     // a span(while not contained) stops before any string.\r
 161                     int c;\r
 162                     if (0 != (which & FWD)) {\r
 163                         c = string.codePointAt(0);\r
 164                         addToSpanNotSet(c);\r
 165                     }\r
 166                     if (0 != (which & BACK)) {\r
 167                         c = string.codePointBefore(length16);\r
 168                         addToSpanNotSet(c);\r
 169                     }\r
 170                 }\r
 171             } else { // Irrelevant string.\r
 172                 if (all) {\r
 173                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;\r
 174                 } else {\r
 175                     // All spanXYZLengths pointers contain the same address.\r
 176                     spanLengths[i] = ALL_CP_CONTAINED;\r
 177                 }\r
 178             }\r
 179         }\r
 180 \r
 181         // Finish.\r
 182         if (all) {\r
 183             spanNotSet.freeze();\r
 184         }\r
 185     }\r
 186 \r
 187     /**\r
 188      * Constructs a copy of an existing UnicodeSetStringSpan.\r
 189      * Assumes which==ALL for a frozen set.\r
 190      */\r
 191     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings) {\r
 192         spanSet = otherStringSpan.spanSet;\r
 193         strings = newParentSetStrings;\r
 194         maxLength16 = otherStringSpan.maxLength16;\r
 195         all = true;\r
 196         if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) {\r
 197             spanNotSet = spanSet;\r
 198         } else {\r
 199             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();\r
 200         }\r
 201         offsets = new OffsetList();\r
 202 \r
 203         spanLengths = otherStringSpan.spanLengths.clone();\r
 204     }\r
 205 \r
 206     /*\r
 207      * Do the strings need to be checked in span() etc.?\r
 208      * \r
 209      * @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance).\r
 210      */\r
 211     public boolean needsStringSpanUTF16() {\r
 212         return (maxLength16 != 0);\r
 213     }\r
 214 \r
 215     // For fast UnicodeSet::contains(c).\r
 216     public boolean contains(int c) {\r
 217         return spanSet.contains(c);\r
 218     }\r
 219 \r
 220     // Add a starting or ending string character to the spanNotSet\r
 221     // so that a character span ends before any string.\r
 222     private void addToSpanNotSet(int c) {\r
 223         if (spanNotSet == null || spanNotSet == spanSet) {\r
 224             if (spanSet.contains(c)) {\r
 225                 return; // Nothing to do.\r
 226             }\r
 227             spanNotSet = spanSet.cloneAsThawed();\r
 228         }\r
 229         spanNotSet.add(c);\r
 230     }\r
 231 \r
 232     /*\r
 233      * Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and\r
 234      * in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done\r
 235      * from the same start index.\r
 236      * \r
 237      * For UTF-8, this would require a comparison function that returns UTF-16 order.\r
 238      * \r
 239      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets\r
 240      * with strings have very few very short strings. For cases with many strings, it might be better to use a different\r
 241      * API and implementation with a DFA (state machine).\r
 242      */\r
 243 \r
 244     /*\r
 245      * Algorithm for span(SpanCondition.CONTAINED)\r
 246      * \r
 247      * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there\r
 248      * is in the set, then remember to continue after it. + If a set string matches at the current position, then\r
 249      * remember to continue after it. + Either recursively span for each code point or string match, or recursively span\r
 250      * for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the\r
 251      * longest recursive span (the farthest end point). + If there is no match at the current position, neither for the\r
 252      * code point there nor for any set string, then stop and return the longest recursive span length.\r
 253      * \r
 254      * Optimized implementation:\r
 255      * \r
 256      * (We assume that most sets will have very few very short strings. A span using a string-less set is extremely\r
 257      * fast.)\r
 258      * \r
 259      * Create and cache a spanSet which contains all of the single code points of the original set but none of its\r
 260      * strings.\r
 261      * \r
 262      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set\r
 263      * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with\r
 264      * a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings\r
 265      * that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED)\r
 266      * because the recursive algorithm would continue after them anyway and find the longest recursive match from their\r
 267      * end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after\r
 268      * spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after\r
 269      * spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set\r
 270      * strings from there. + If at least one more set string matched after a previous string match, then test if the\r
 271      * code point after the previous string match is also contained in the set. Continue the loop with the shortest end\r
 272      * point of either this code point or a matching set string. + If no more set string matched after a previous string\r
 273      * match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0,\r
 274      * otherwise continue the loop.\r
 275      * \r
 276      * By noting each end point of a set string match, the function visits each string position at most once and\r
 277      * finishes in linear time.\r
 278      * \r
 279      * The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes\r
 280      * in exponential time.\r
 281      */\r
 282 \r
 283     /*\r
 284      * Algorithm for span(SIMPLE)\r
 285      * \r
 286      * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there\r
 287      * is in the set, then remember to continue after it. + If a set string matches at the current position, then\r
 288      * remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is\r
 289      * no match at the current position, then stop and return the current position.\r
 290      * \r
 291      * Optimized implementation:\r
 292      * \r
 293      * (Same assumption and spanSet as above.)\r
 294      * \r
 295      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set\r
 296      * string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with\r
 297      * a partial overlap because the standard algorithm would have tried to match them earlier. ~ Set strings that\r
 298      * entirely consist of set-contained code points must be matched with a full overlap because the longest-match\r
 299      * algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the\r
 300      * code point span because the standard algorithm would then have continued after the set string match anyway. ~\r
 301      * Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string\r
 302      * matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string\r
 303      * matched, then continue the loop after the longest match from the earliest position. + If no more set string\r
 304      * matched after a previous string match, then try another\r
 305      * spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the\r
 306      * loop.\r
 307      */\r
 308     /**\r
 309      * Span a string.\r
 310      * \r
 311      * @param s The string to be spanned\r
 312      * @param start The start index that the span begins\r
 313      * @param spanCondition The span condition\r
 314      * @return the length of the span\r
 315      * @draft ICU 4.4\r
 316      */\r
 317     public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) {\r
 318         if (spanCondition == SpanCondition.NOT_CONTAINED) {\r
 319             return spanNot(s, start, length);\r
 320         }\r
 321         int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED);\r
 322         if (spanLength == length) {\r
 323             return length;\r
 324         }\r
 325 \r
 326         // Consider strings; they may overlap with the span.\r
 327         int initSize = 0;\r
 328         if (spanCondition == SpanCondition.CONTAINED) {\r
 329             // Use offset list to try all possibilities.\r
 330             initSize = maxLength16;\r
 331         }\r
 332         offsets.setMaxLength(initSize);\r
 333         int pos = start + spanLength, rest = length - spanLength;\r
 334         int i, stringsLength = strings.size();\r
 335         for (;;) {\r
 336             if (spanCondition == SpanCondition.CONTAINED) {\r
 337                 for (i = 0; i < stringsLength; ++i) {\r
 338                     int overlap = spanLengths[i];\r
 339                     if (overlap == ALL_CP_CONTAINED) {\r
 340                         continue; // Irrelevant string.\r
 341                     }\r
 342                     String string = strings.get(i);\r
 343 \r
 344                     int length16 = string.length();\r
 345 \r
 346                     // Try to match this string at pos-overlap..pos.\r
 347                     if (overlap >= LONG_SPAN) {\r
 348                         overlap = length16;\r
 349                         // While contained: No point matching fully inside the code point span.\r
 350                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code\r
 351                                                                           // point.\r
 352                     }\r
 353                     if (overlap > spanLength) {\r
 354                         overlap = spanLength;\r
 355                     }\r
 356                     int inc = length16 - overlap; // Keep overlap+inc==length16.\r
 357                     for (;;) {\r
 358                         if (inc > rest) {\r
 359                             break;\r
 360                         }\r
 361                         // Try to match if the increment is not listed already.\r
 362                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {\r
 363                             if (inc == rest) {\r
 364                                 return length; // Reached the end of the string.\r
 365                             }\r
 366                             offsets.addOffset(inc);\r
 367                         }\r
 368                         if (overlap == 0) {\r
 369                             break;\r
 370                         }\r
 371                         --overlap;\r
 372                         ++inc;\r
 373                     }\r
 374                 }\r
 375             } else /* SIMPLE */{\r
 376                 int maxInc = 0, maxOverlap = 0;\r
 377                 for (i = 0; i < stringsLength; ++i) {\r
 378                     int overlap = spanLengths[i];\r
 379                     // For longest match, we do need to try to match even an all-contained string\r
 380                     // to find the match from the earliest start.\r
 381 \r
 382                     String string = strings.get(i);\r
 383 \r
 384                     int length16 = string.length();\r
 385 \r
 386                     // Try to match this string at pos-overlap..pos.\r
 387                     if (overlap >= LONG_SPAN) {\r
 388                         overlap = length16;\r
 389                         // Longest match: Need to match fully inside the code point span\r
 390                         // to find the match from the earliest start.\r
 391                     }\r
 392                     if (overlap > spanLength) {\r
 393                         overlap = spanLength;\r
 394                     }\r
 395                     int inc = length16 - overlap; // Keep overlap+inc==length16.\r
 396                     for (;;) {\r
 397                         if (inc > rest || overlap < maxOverlap) {\r
 398                             break;\r
 399                         }\r
 400                         // Try to match if the string is longer or starts earlier.\r
 401                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)\r
 402                                 && matches16CPB(s, pos - overlap, length, string, length16)) {\r
 403                             maxInc = inc; // Longest match from earliest start.\r
 404                             maxOverlap = overlap;\r
 405                             break;\r
 406                         }\r
 407                         --overlap;\r
 408                         ++inc;\r
 409                     }\r
 410                 }\r
 411 \r
 412                 if (maxInc != 0 || maxOverlap != 0) {\r
 413                     // Longest-match algorithm, and there was a string match.\r
 414                     // Simply continue after it.\r
 415                     pos += maxInc;\r
 416                     rest -= maxInc;\r
 417                     if (rest == 0) {\r
 418                         return length; // Reached the end of the string.\r
 419                     }\r
 420                     spanLength = 0; // Match strings from after a string match.\r
 421                     continue;\r
 422                 }\r
 423             }\r
 424             // Finished trying to match all strings at pos.\r
 425 \r
 426             if (spanLength != 0 || pos == 0) {\r
 427                 // The position is after an unlimited code point span (spanLength!=0),\r
 428                 // not after a string match.\r
 429                 // The only position where spanLength==0 after a span is pos==0.\r
 430                 // Otherwise, an unlimited code point span is only tried again when no\r
 431                 // strings match, and if such a non-initial span fails we stop.\r
 432                 if (offsets.isEmpty()) {\r
 433                     return pos - start; // No strings matched after a span.\r
 434                 }\r
 435                 // Match strings from after the next string match.\r
 436             } else {\r
 437                 // The position is after a string match (or a single code point).\r
 438                 if (offsets.isEmpty()) {\r
 439                     // No more strings matched after a previous string match.\r
 440                     // Try another code point span from after the last string match.\r
 441                     spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED);\r
 442                     if (spanLength == rest || // Reached the end of the string, or\r
 443                             spanLength == 0 // neither strings nor span progressed.\r
 444                     ) {\r
 445                         return pos + spanLength - start;\r
 446                     }\r
 447                     pos += spanLength;\r
 448                     rest -= spanLength;\r
 449                     continue; // spanLength>0: Match strings from after a span.\r
 450                 } else {\r
 451                     // Try to match only one code point from after a string match if some\r
 452                     // string matched beyond it, so that we try all possible positions\r
 453                     // and don't overshoot.\r
 454                     spanLength = spanOne(spanSet, s, pos, rest);\r
 455                     if (spanLength > 0) {\r
 456                         if (spanLength == rest) {\r
 457                             return length; // Reached the end of the string.\r
 458                         }\r
 459                         // Match strings after this code point.\r
 460                         // There cannot be any increments below it because UnicodeSet strings\r
 461                         // contain multiple code points.\r
 462                         pos += spanLength;\r
 463                         rest -= spanLength;\r
 464                         offsets.shift(spanLength);\r
 465                         spanLength = 0;\r
 466                         continue; // Match strings from after a single code point.\r
 467                     }\r
 468                     // Match strings from after the next string match.\r
 469                 }\r
 470             }\r
 471             int minOffset = offsets.popMinimum();\r
 472             pos += minOffset;\r
 473             rest -= minOffset;\r
 474             spanLength = 0; // Match strings from after a string match.\r
 475         }\r
 476     }\r
 477 \r
 478     /**\r
 479      * Span a string backwards.\r
 480      * \r
 481      * @param s The string to be spanned\r
 482      * @param spanCondition The span condition\r
 483      * @return The string index which starts the span (i.e. inclusive).\r
 484      * @draft ICU 4.4\r
 485      */\r
 486     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {\r
 487         if (spanCondition == SpanCondition.NOT_CONTAINED) {\r
 488             return spanNotBack(s, length);\r
 489         }\r
 490         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);\r
 491         if (pos == 0) {\r
 492             return 0;\r
 493         }\r
 494         int spanLength = length - pos;\r
 495 \r
 496         // Consider strings; they may overlap with the span.\r
 497         int initSize = 0;\r
 498         if (spanCondition == SpanCondition.CONTAINED) {\r
 499             // Use offset list to try all possibilities.\r
 500             initSize = maxLength16;\r
 501         }\r
 502         offsets.setMaxLength(initSize);\r
 503         int i, stringsLength = strings.size();\r
 504         int spanBackLengthsOffset = 0;\r
 505         if (all) {\r
 506             spanBackLengthsOffset = stringsLength;\r
 507         }\r
 508         for (;;) {\r
 509             if (spanCondition == SpanCondition.CONTAINED) {\r
 510                 for (i = 0; i < stringsLength; ++i) {\r
 511                     int overlap = spanLengths[spanBackLengthsOffset + i];\r
 512                     if (overlap == ALL_CP_CONTAINED) {\r
 513                         continue; // Irrelevant string.\r
 514                     }\r
 515                     String string = strings.get(i);\r
 516 \r
 517                     int length16 = string.length();\r
 518 \r
 519                     // Try to match this string at pos-(length16-overlap)..pos-length16.\r
 520                     if (overlap >= LONG_SPAN) {\r
 521                         overlap = length16;\r
 522                         // While contained: No point matching fully inside the code point span.\r
 523                         int len1 = 0;\r
 524                         len1 = string.offsetByCodePoints(0, 1);\r
 525                         overlap -= len1; // Length of the string minus the first code point.\r
 526                     }\r
 527                     if (overlap > spanLength) {\r
 528                         overlap = spanLength;\r
 529                     }\r
 530                     int dec = length16 - overlap; // Keep dec+overlap==length16.\r
 531                     for (;;) {\r
 532                         if (dec > pos) {\r
 533                             break;\r
 534                         }\r
 535                         // Try to match if the decrement is not listed already.\r
 536                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {\r
 537                             if (dec == pos) {\r
 538                                 return 0; // Reached the start of the string.\r
 539                             }\r
 540                             offsets.addOffset(dec);\r
 541                         }\r
 542                         if (overlap == 0) {\r
 543                             break;\r
 544                         }\r
 545                         --overlap;\r
 546                         ++dec;\r
 547                     }\r
 548                 }\r
 549             } else /* SIMPLE */{\r
 550                 int maxDec = 0, maxOverlap = 0;\r
 551                 for (i = 0; i < stringsLength; ++i) {\r
 552                     int overlap = spanLengths[spanBackLengthsOffset + i];\r
 553                     // For longest match, we do need to try to match even an all-contained string\r
 554                     // to find the match from the latest end.\r
 555 \r
 556                     String string = strings.get(i);\r
 557 \r
 558                     int length16 = string.length();\r
 559 \r
 560                     // Try to match this string at pos-(length16-overlap)..pos-length16.\r
 561                     if (overlap >= LONG_SPAN) {\r
 562                         overlap = length16;\r
 563                         // Longest match: Need to match fully inside the code point span\r
 564                         // to find the match from the latest end.\r
 565                     }\r
 566                     if (overlap > spanLength) {\r
 567                         overlap = spanLength;\r
 568                     }\r
 569                     int dec = length16 - overlap; // Keep dec+overlap==length16.\r
 570                     for (;;) {\r
 571                         if (dec > pos || overlap < maxOverlap) {\r
 572                             break;\r
 573                         }\r
 574                         // Try to match if the string is longer or ends later.\r
 575                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)\r
 576                                 && matches16CPB(s, pos - dec, length, string, length16)) {\r
 577                             maxDec = dec; // Longest match from latest end.\r
 578                             maxOverlap = overlap;\r
 579                             break;\r
 580                         }\r
 581                         --overlap;\r
 582                         ++dec;\r
 583                     }\r
 584                 }\r
 585 \r
 586                 if (maxDec != 0 || maxOverlap != 0) {\r
 587                     // Longest-match algorithm, and there was a string match.\r
 588                     // Simply continue before it.\r
 589                     pos -= maxDec;\r
 590                     if (pos == 0) {\r
 591                         return 0; // Reached the start of the string.\r
 592                     }\r
 593                     spanLength = 0; // Match strings from before a string match.\r
 594                     continue;\r
 595                 }\r
 596             }\r
 597             // Finished trying to match all strings at pos.\r
 598 \r
 599             if (spanLength != 0 || pos == length) {\r
 600                 // The position is before an unlimited code point span (spanLength!=0),\r
 601                 // not before a string match.\r
 602                 // The only position where spanLength==0 before a span is pos==length.\r
 603                 // Otherwise, an unlimited code point span is only tried again when no\r
 604                 // strings match, and if such a non-initial span fails we stop.\r
 605                 if (offsets.isEmpty()) {\r
 606                     return pos; // No strings matched before a span.\r
 607                 }\r
 608                 // Match strings from before the next string match.\r
 609             } else {\r
 610                 // The position is before a string match (or a single code point).\r
 611                 if (offsets.isEmpty()) {\r
 612                     // No more strings matched before a previous string match.\r
 613                     // Try another code point span from before the last string match.\r
 614                     int oldPos = pos;\r
 615                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);\r
 616                     spanLength = oldPos - pos;\r
 617                     if (pos == 0 || // Reached the start of the string, or\r
 618                             spanLength == 0 // neither strings nor span progressed.\r
 619                     ) {\r
 620                         return pos;\r
 621                     }\r
 622                     continue; // spanLength>0: Match strings from before a span.\r
 623                 } else {\r
 624                     // Try to match only one code point from before a string match if some\r
 625                     // string matched beyond it, so that we try all possible positions\r
 626                     // and don't overshoot.\r
 627                     spanLength = spanOneBack(spanSet, s, pos);\r
 628                     if (spanLength > 0) {\r
 629                         if (spanLength == pos) {\r
 630                             return 0; // Reached the start of the string.\r
 631                         }\r
 632                         // Match strings before this code point.\r
 633                         // There cannot be any decrements below it because UnicodeSet strings\r
 634                         // contain multiple code points.\r
 635                         pos -= spanLength;\r
 636                         offsets.shift(spanLength);\r
 637                         spanLength = 0;\r
 638                         continue; // Match strings from before a single code point.\r
 639                     }\r
 640                     // Match strings from before the next string match.\r
 641                 }\r
 642             }\r
 643             pos -= offsets.popMinimum();\r
 644             spanLength = 0; // Match strings from before a string match.\r
 645         }\r
 646     }\r
 647 \r
 648     /*\r
 649      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)\r
 650      * \r
 651      * Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there\r
 652      * is in the set, then return with the current position. + If a set string matches at the current position, then\r
 653      * return with the current position.\r
 654      * \r
 655      * Optimized implementation:\r
 656      * \r
 657      * (Same assumption as for span() above.)\r
 658      * \r
 659      * Create and cache a spanNotSet which contains all of the single code points of the original set but none of its\r
 660      * strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for\r
 661      * spanNotBack().)\r
 662      * \r
 663      * - Loop:\r
 664      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).\r
 665      *   + If the current code point is in the original set, then return the current position.\r
 666      *   + If any set string matches at the current position, then return the current position.\r
 667      *   + If there is no match at the current position, neither for the code point\r
 668      * there nor for any set string, then skip this code point and continue the loop. This happens for\r
 669      * set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set\r
 670      * string.\r
 671      *\r
 672      * @return the length of the span\r
 673      */\r
 674     private int spanNot(CharSequence s, int start, int length) {\r
 675         int pos = start, rest = length;\r
 676         int i, stringsLength = strings.size();\r
 677         do {\r
 678             // Span until we find a code point from the set,\r
 679             // or a code point that starts or ends some string.\r
 680             i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED);\r
 681             if (i == rest) {\r
 682                 return length; // Reached the end of the string.\r
 683             }\r
 684             pos += i;\r
 685             rest -= i;\r
 686 \r
 687             // Check whether the current code point is in the original set,\r
 688             // without the string starts and ends.\r
 689             int cpLength = spanOne(spanSet, s, pos, rest);\r
 690             if (cpLength > 0) {\r
 691                 return pos - start; // There is a set element at pos.\r
 692             }\r
 693 \r
 694             // Try to match the strings at pos.\r
 695             for (i = 0; i < stringsLength; ++i) {\r
 696                 if (spanLengths[i] == ALL_CP_CONTAINED) {\r
 697                     continue; // Irrelevant string.\r
 698                 }\r
 699                 String string = strings.get(i);\r
 700 \r
 701                 int length16 = string.length();\r
 702                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {\r
 703                     return pos - start; // There is a set element at pos.\r
 704                 }\r
 705             }\r
 706 \r
 707             // The span(while not contained) ended on a string start/end which is\r
 708             // not in the original set. Skip this code point and continue.\r
 709             // cpLength<0\r
 710             pos -= cpLength;\r
 711             rest += cpLength;\r
 712         } while (rest != 0);\r
 713         return length; // Reached the end of the string.\r
 714     }\r
 715 \r
 716     private int spanNotBack(CharSequence s, int length) {\r
 717         int pos = length;\r
 718         int i, stringsLength = strings.size();\r
 719         do {\r
 720             // Span until we find a code point from the set,\r
 721             // or a code point that starts or ends some string.\r
 722             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);\r
 723             if (pos == 0) {\r
 724                 return 0; // Reached the start of the string.\r
 725             }\r
 726 \r
 727             // Check whether the current code point is in the original set,\r
 728             // without the string starts and ends.\r
 729             int cpLength = spanOneBack(spanSet, s, pos);\r
 730             if (cpLength > 0) {\r
 731                 return pos; // There is a set element at pos.\r
 732             }\r
 733 \r
 734             // Try to match the strings at pos.\r
 735             for (i = 0; i < stringsLength; ++i) {\r
 736                 // Use spanLengths rather than a spanLengths pointer because\r
 737                 // it is easier and we only need to know whether the string is irrelevant\r
 738                 // which is the same in either array.\r
 739                 if (spanLengths[i] == ALL_CP_CONTAINED) {\r
 740                     continue; // Irrelevant string.\r
 741                 }\r
 742                 String string = strings.get(i);\r
 743 \r
 744                 int length16 = string.length();\r
 745                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {\r
 746                     return pos; // There is a set element at pos.\r
 747                 }\r
 748             }\r
 749 \r
 750             // The span(while not contained) ended on a string start/end which is\r
 751             // not in the original set. Skip this code point and continue.\r
 752             // cpLength<0\r
 753             pos += cpLength;\r
 754         } while (pos != 0);\r
 755         return 0; // Reached the start of the string.\r
 756     }\r
 757 \r
 758     static short makeSpanLengthByte(int spanLength) {\r
 759         // 0xfe==UnicodeSetStringSpan::LONG_SPAN\r
 760         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;\r
 761     }\r
 762 \r
 763     // Compare strings without any argument checks. Requires length>0.\r
 764     private static boolean matches16(CharSequence s, int start, final String t, int length) {\r
 765         int end = start + length;\r
 766         while (length-- > 0) {\r
 767             if (s.charAt(--end) != t.charAt(length)) {\r
 768                 return false;\r
 769             }\r
 770         }\r
 771         return true;\r
 772     }\r
 773 \r
 774     /**\r
 775      * Compare 16-bit Unicode strings (which may be malformed UTF-16)\r
 776      * at code point boundaries.\r
 777      * That is, each edge of a match must not be in the middle of a surrogate pair.\r
 778      * @param start   The start index of s.\r
 779      * @param slength The length of s from start.\r
 780      * @param tlength The length of t.\r
 781      */\r
 782     static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) {\r
 783         return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) &&\r
 784                               com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0)))\r
 785                 && !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) &&\r
 786                                        com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength)))\r
 787                 && matches16(s, start, t, tlength);\r
 788     }\r
 789 \r
 790     // Does the set contain the next code point?\r
 791     // If so, return its length; otherwise return its negative length.\r
 792     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {\r
 793         char c = s.charAt(start);\r
 794         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {\r
 795             char c2 = s.charAt(start + 1);\r
 796             if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {\r
 797                 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);\r
 798                 return set.contains(supplementary) ? 2 : -2;\r
 799             }\r
 800         }\r
 801         return set.contains(c) ? 1 : -1;\r
 802     }\r
 803 \r
 804     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {\r
 805         char c = s.charAt(length - 1);\r
 806         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {\r
 807             char c2 = s.charAt(length - 2);\r
 808             if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {\r
 809                 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);\r
 810                 return set.contains(supplementary) ? 2 : -2;\r
 811             }\r
 812         }\r
 813         return set.contains(c) ? 1 : -1;\r
 814     }\r
 815 \r
 816 \r
 817     /*\r
 818      * Helper class for UnicodeSetStringSpan.\r
 819      *\r
 820      * List of offsets from the current position from where to try matching a code point or a string. Store offsets rather\r
 821      * than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in\r
 822      * spanBack()).\r
 823      * \r
 824      * Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that\r
 825      * is, there are normally no gaps of hundreds or thousands of offset values.\r
 826      * \r
 827      * The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the\r
 828      * list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the\r
 829      * list.\r
 830      * \r
 831      * Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH\r
 832      * to account for "long" single code points.\r
 833      * \r
 834      * Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single\r
 835      * integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when\r
 836      * lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings.\r
 837      */\r
 838     static class OffsetList {\r
 839         private boolean[] list;\r
 840         private int length;\r
 841         private int start;\r
 842 \r
 843         public OffsetList() {\r
 844             list = new boolean[16];  // default size\r
 845         }\r
 846 \r
 847         public void setMaxLength(int maxLength) {\r
 848             if (maxLength > list.length) {\r
 849                 list = new boolean[maxLength];\r
 850             }\r
 851             clear();\r
 852         }\r
 853 \r
 854         public void clear() {\r
 855             for (int i = list.length; i-- > 0;) {\r
 856                 list[i] = false;\r
 857             }\r
 858             start = length = 0;\r
 859         }\r
 860 \r
 861         public boolean isEmpty() {\r
 862             return (length == 0);\r
 863         }\r
 864 \r
 865         // Reduce all stored offsets by delta, used when the current position\r
 866         // moves by delta.\r
 867         // There must not be any offsets lower than delta.\r
 868         // If there is an offset equal to delta, it is removed.\r
 869         // delta=[1..maxLength]\r
 870         public void shift(int delta) {\r
 871             int i = start + delta;\r
 872             if (i >= list.length) {\r
 873                 i -= list.length;\r
 874             }\r
 875             if (list[i]) {\r
 876                 list[i] = false;\r
 877                 --length;\r
 878             }\r
 879             start = i;\r
 880         }\r
 881 \r
 882         // Add an offset. The list must not contain it yet.\r
 883         // offset=[1..maxLength]\r
 884         public void addOffset(int offset) {\r
 885             int i = start + offset;\r
 886             if (i >= list.length) {\r
 887                 i -= list.length;\r
 888             }\r
 889             list[i] = true;\r
 890             ++length;\r
 891         }\r
 892 \r
 893         // offset=[1..maxLength]\r
 894         public boolean containsOffset(int offset) {\r
 895             int i = start + offset;\r
 896             if (i >= list.length) {\r
 897                 i -= list.length;\r
 898             }\r
 899             return list[i];\r
 900         }\r
 901 \r
 902         // Find the lowest stored offset from a non-empty list, remove it,\r
 903         // and reduce all other offsets by this minimum.\r
 904         // Returns [1..maxLength].\r
 905         public int popMinimum() {\r
 906             // Look for the next offset in list[start+1..list.length-1].\r
 907             int i = start, result;\r
 908             while (++i < list.length) {\r
 909                 if (list[i]) {\r
 910                     list[i] = false;\r
 911                     --length;\r
 912                     result = i - start;\r
 913                     start = i;\r
 914                     return result;\r
 915                 }\r
 916             }\r
 917             // i==list.length\r
 918 \r
 919             // Wrap around and look for the next offset in list[0..start].\r
 920             // Since the list is not empty, there will be one.\r
 921             result = list.length - start;\r
 922             i = 0;\r
 923             while (!list[i]) {\r
 924                 ++i;\r
 925             }\r
 926             list[i] = false;\r
 927             --length;\r
 928             start = i;\r
 929             return result += i;\r
 930         }\r
 931     }\r
 932 }\r