2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.text.CharacterIterator;
\r
11 import java.text.StringCharacterIterator;
\r
12 import java.util.Locale;
\r
14 import com.ibm.icu.impl.CharacterIteratorWrapper;
\r
15 import com.ibm.icu.impl.Norm2AllModes;
\r
16 import com.ibm.icu.impl.Normalizer2Impl;
\r
17 import com.ibm.icu.lang.UCharacter;
\r
18 import com.ibm.icu.util.ULocale;
\r
22 * <code>StringSearch</code> is the concrete subclass of
\r
23 * <code>SearchIterator</code> that provides language-sensitive text searching
\r
24 * based on the comparison rules defined in a {@link RuleBasedCollator} object.
\r
27 * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
\r
28 * algorithm that has been adapted to work with the large character set of
\r
29 * Unicode. Refer to
\r
30 * <a href="http://www.icu-project.org/docs/papers/efficient_text_searching_in_java.html">
\r
31 * "Efficient Text Searching in Java"</a>, published in the
\r
32 * <i>Java Report</i> on February, 1999, for further information on the
\r
36 * Users are also strongly encouraged to read the section on
\r
37 * <a href="http://www.icu-project.org/userguide/searchString.html">
\r
38 * String Search</a> and
\r
39 * <a href="http://www.icu-project.org/userguide/Collate_Intro.html">
\r
40 * Collation</a> in the user guide before attempting to use this class.
\r
43 * String searching becomes a little complicated when accents are encountered at
\r
44 * match boundaries. If a match is found and it has preceding or trailing
\r
45 * accents not part of the match, the result returned will include the
\r
46 * preceding accents up to the first base character, if the pattern searched
\r
47 * for starts with an accent. Likewise,
\r
48 * if the pattern ends with an accent, all trailing accents up to the first
\r
49 * base character will be included in the result.
\r
52 * For example, if a match is found in target text "a\u0325\u0300" for
\r
54 * "a\u0325", the result returned by StringSearch will be the index 0 and
\r
55 * length 3 <0, 3>. If a match is found in the target
\r
56 * "a\u0325\u0300"
\r
57 * for the pattern "\u0300", then the result will be index 1 and length 2
\r
61 * In the case where the decomposition mode is on for the RuleBasedCollator,
\r
62 * all matches that start or end with an accent will have their results include
\r
63 * preceding or following accents respectively. For example, if pattern "a" is
\r
64 * looked for in the target text "á\u0325", the result will be
\r
65 * index 0 and length 2 <0, 2>.
\r
68 * The StringSearch class provides two options to handle accent matching
\r
72 * Let S' be the sub-string of a text string S between the offsets start and
\r
73 * end <start, end>.
\r
75 * A pattern string P matches a text string S at the offsets <start,
\r
80 * option 1. P matches some canonical equivalent string of S'. Suppose the
\r
81 * RuleBasedCollator used for searching has a collation strength of
\r
82 * TERTIARY, all accents are non-ignorable. If the pattern
\r
83 * "a\u0300" is searched in the target text
\r
84 * "a\u0325\u0300",
\r
85 * a match will be found, since the target text is canonically
\r
86 * equivalent to "a\u0300\u0325"
\r
87 * option 2. P matches S' and if P starts or ends with a combining mark,
\r
88 * there exists no non-ignorable combining mark before or after S'
\r
89 * in S respectively. Following the example above, the pattern
\r
90 * "a\u0300" will not find a match in "a\u0325\u0300",
\r
92 * there exists a non-ignorable accent '\u0325' in the middle of
\r
93 * 'a' and '\u0300'. Even with a target text of
\r
94 * "a\u0300\u0325" a match will not be found because of the
\r
95 * non-ignorable trailing accent \u0325.
\r
97 * Option 2. will be the default mode for dealing with boundary accents unless
\r
98 * specified via the API setCanonical(boolean).
\r
99 * One restriction is to be noted for option 1. Currently there are no
\r
100 * composite characters that consist of a character with combining class > 0
\r
101 * before a character with combining class == 0. However, if such a character
\r
102 * exists in the future, the StringSearch may not work correctly with option 1
\r
103 * when such characters are encountered.
\r
106 * <tt>SearchIterator</tt> provides APIs to specify the starting position
\r
107 * within the text string to be searched, e.g. <tt>setIndex</tt>,
\r
108 * <tt>preceding</tt> and <tt>following</tt>. Since the starting position will
\r
109 * be set as it is specified, please take note that there are some dangerous
\r
110 * positions which the search may render incorrect results:
\r
112 * <li> The midst of a substring that requires decomposition.
\r
113 * <li> If the following match is to be found, the position should not be the
\r
114 * second character which requires to be swapped with the preceding
\r
115 * character. Vice versa, if the preceding match is to be found,
\r
116 * position to search from should not be the first character which
\r
117 * requires to be swapped with the next character. E.g certain Thai and
\r
118 * Lao characters require swapping.
\r
119 * <li> If a following pattern match is to be found, any position within a
\r
120 * contracting sequence except the first will fail. Vice versa if a
\r
121 * preceding pattern match is to be found, an invalid starting point
\r
122 * would be any character within a contracting sequence except the last.
\r
126 * Though collator attributes will be taken into consideration while
\r
127 * performing matches, there are no APIs provided in StringSearch for setting
\r
128 * and getting the attributes. These attributes can be set by getting the
\r
129 * collator from <tt>getCollator</tt> and using the APIs in
\r
130 * <tt>com.ibm.icu.text.Collator</tt>. To update StringSearch to the new
\r
131 * collator attributes, <tt>reset()</tt> or
\r
132 * <tt>setCollator(RuleBasedCollator)</tt> has to be called.
\r
136 * <a href="http://www.icu-project.org/userguide/searchString.html">
\r
137 * String Search</a> user guide and the <code>SearchIterator</code>
\r
138 * documentation for more information and examples of use.
\r
141 * This class is not subclassable
\r
143 * @see SearchIterator
\r
144 * @see RuleBasedCollator
\r
145 * @author Laura Werner, synwee
\r
148 // internal notes: all methods do not guarantee the correct status of the
\r
149 // characteriterator. the caller has to maintain the original index position
\r
150 // if necessary. methods could change the index position as it deems fit
\r
151 public final class StringSearch extends SearchIterator
\r
154 // public constructors --------------------------------------------------
\r
157 * Initializes the iterator to use the language-specific rules defined in
\r
158 * the argument collator to search for argument pattern in the argument
\r
159 * target text. The argument breakiter is used to define logical matches.
\r
160 * See super class documentation for more details on the use of the target
\r
161 * text and BreakIterator.
\r
162 * @param pattern text to look for.
\r
163 * @param target target text to search for pattern.
\r
164 * @param collator RuleBasedCollator that defines the language rules
\r
165 * @param breakiter A {@link BreakIterator} that is used to determine the
\r
166 * boundaries of a logical match. This argument can be null.
\r
167 * @exception IllegalArgumentException thrown when argument target is null,
\r
169 * @see BreakIterator
\r
170 * @see RuleBasedCollator
\r
171 * @see SearchIterator
\r
174 public StringSearch(String pattern, CharacterIterator target,
\r
175 RuleBasedCollator collator, BreakIterator breakiter)
\r
177 super(target, breakiter);
\r
178 m_textBeginOffset_ = targetText.getBeginIndex();
\r
179 m_textLimitOffset_ = targetText.getEndIndex();
\r
180 m_collator_ = collator;
\r
181 m_colEIter_ = m_collator_.getCollationElementIterator(target);
\r
182 m_utilColEIter_ = collator.getCollationElementIterator("");
\r
183 m_ceMask_ = getMask(m_collator_.getStrength());
\r
184 m_isCanonicalMatch_ = false;
\r
185 m_pattern_ = new Pattern(pattern);
\r
186 m_matchedIndex_ = DONE;
\r
187 m_charBreakIter_ = BreakIterator.getCharacterInstance(/*m_collator_.getLocale(ULocale.ACTUAL_LOCALE)*/);
\r
188 m_charBreakIter_.setText(target);
\r
193 * Initializes the iterator to use the language-specific rules defined in
\r
194 * the argument collator to search for argument pattern in the argument
\r
195 * target text. No BreakIterators are set to test for logical matches.
\r
196 * @param pattern text to look for.
\r
197 * @param target target text to search for pattern.
\r
198 * @param collator RuleBasedCollator that defines the language rules
\r
199 * @exception IllegalArgumentException thrown when argument target is null,
\r
201 * @see RuleBasedCollator
\r
202 * @see SearchIterator
\r
205 public StringSearch(String pattern, CharacterIterator target,
\r
206 RuleBasedCollator collator)
\r
208 this(pattern, target, collator, null/*BreakIterator.getCharacterInstance()*/);
\r
212 * Initializes the iterator to use the language-specific rules and
\r
213 * break iterator rules defined in the argument locale to search for
\r
214 * argument pattern in the argument target text.
\r
215 * See super class documentation for more details on the use of the target
\r
216 * text and BreakIterator.
\r
217 * @param pattern text to look for.
\r
218 * @param target target text to search for pattern.
\r
219 * @param locale locale to use for language and break iterator rules
\r
220 * @exception IllegalArgumentException thrown when argument target is null,
\r
221 * or of length 0. ClassCastException thrown if the collator for
\r
222 * the specified locale is not a RuleBasedCollator.
\r
223 * @see BreakIterator
\r
224 * @see RuleBasedCollator
\r
225 * @see SearchIterator
\r
228 public StringSearch(String pattern, CharacterIterator target, Locale locale)
\r
230 this(pattern, target, ULocale.forLocale(locale));
\r
234 * Initializes the iterator to use the language-specific rules and
\r
235 * break iterator rules defined in the argument locale to search for
\r
236 * argument pattern in the argument target text.
\r
237 * See super class documentation for more details on the use of the target
\r
238 * text and BreakIterator.
\r
239 * @param pattern text to look for.
\r
240 * @param target target text to search for pattern.
\r
241 * @param locale ulocale to use for language and break iterator rules
\r
242 * @exception IllegalArgumentException thrown when argument target is null,
\r
243 * or of length 0. ClassCastException thrown if the collator for
\r
244 * the specified locale is not a RuleBasedCollator.
\r
245 * @see BreakIterator
\r
246 * @see RuleBasedCollator
\r
247 * @see SearchIterator
\r
250 public StringSearch(String pattern, CharacterIterator target, ULocale locale)
\r
252 this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale),
\r
253 null/*BreakIterator.getCharacterInstance(locale)*/);
\r
257 * Initializes the iterator to use the language-specific rules and
\r
258 * break iterator rules defined in the default locale to search for
\r
259 * argument pattern in the argument target text.
\r
260 * See super class documentation for more details on the use of the target
\r
261 * text and BreakIterator.
\r
262 * @param pattern text to look for.
\r
263 * @param target target text to search for pattern.
\r
264 * @exception IllegalArgumentException thrown when argument target is null,
\r
265 * or of length 0. ClassCastException thrown if the collator for
\r
266 * the default locale is not a RuleBasedCollator.
\r
267 * @see BreakIterator
\r
268 * @see RuleBasedCollator
\r
269 * @see SearchIterator
\r
272 public StringSearch(String pattern, String target)
\r
274 this(pattern, new StringCharacterIterator(target),
\r
275 (RuleBasedCollator)Collator.getInstance(),
\r
276 null/*BreakIterator.getCharacterInstance()*/);
\r
279 // public getters -----------------------------------------------------
\r
283 * Gets the RuleBasedCollator used for the language rules.
\r
286 * Since StringSearch depends on the returned RuleBasedCollator, any
\r
287 * changes to the RuleBasedCollator result should follow with a call to
\r
288 * either StringSearch.reset() or
\r
289 * StringSearch.setCollator(RuleBasedCollator) to ensure the correct
\r
290 * search behaviour.
\r
292 * @return RuleBasedCollator used by this StringSearch
\r
293 * @see RuleBasedCollator
\r
294 * @see #setCollator
\r
297 public RuleBasedCollator getCollator()
\r
299 return m_collator_;
\r
303 * Returns the pattern for which StringSearch is searching for.
\r
304 * @return the pattern searched for
\r
307 public String getPattern()
\r
309 return m_pattern_.targetText;
\r
313 * Return the index in the target text where the iterator is currently
\r
315 * If the iteration has gone past the end of the target text or past
\r
316 * the beginning for a backwards search, {@link #DONE} is returned.
\r
317 * @return index in the target text where the iterator is currently
\r
321 public int getIndex()
\r
323 int result = m_colEIter_.getOffset();
\r
324 if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) {
\r
331 * Determines whether canonical matches (option 1, as described in the
\r
332 * class documentation) is set.
\r
333 * See setCanonical(boolean) for more information.
\r
334 * @see #setCanonical
\r
335 * @return true if canonical matches is set, false otherwise
\r
338 public boolean isCanonical()
\r
340 return m_isCanonicalMatch_;
\r
343 // public setters -----------------------------------------------------
\r
347 * Sets the RuleBasedCollator to be used for language-specific searching.
\r
350 * This method causes internal data such as Boyer-Moore shift tables
\r
351 * to be recalculated, but the iterator's position is unchanged.
\r
353 * @param collator to use for this StringSearch
\r
354 * @exception IllegalArgumentException thrown when collator is null
\r
355 * @see #getCollator
\r
358 public void setCollator(RuleBasedCollator collator)
\r
360 if (collator == null) {
\r
361 throw new IllegalArgumentException("Collator can not be null");
\r
363 m_collator_ = collator;
\r
364 m_ceMask_ = getMask(m_collator_.getStrength());
\r
365 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
\r
367 m_colEIter_.setCollator(m_collator_);
\r
368 m_utilColEIter_.setCollator(m_collator_);
\r
369 m_charBreakIter_ = BreakIterator.getCharacterInstance(/*collator.getLocale(ULocale.VALID_LOCALE)*/);
\r
370 m_charBreakIter_.setText(targetText);
\r
375 * Set the pattern to search for.
\r
378 * This method causes internal data such as Boyer-Moore shift tables
\r
379 * to be recalculated, but the iterator's position is unchanged.
\r
381 * @param pattern for searching
\r
383 * @exception IllegalArgumentException thrown if pattern is null or of
\r
387 public void setPattern(String pattern)
\r
389 if (pattern == null || pattern.length() <= 0) {
\r
390 throw new IllegalArgumentException(
\r
391 "Pattern to search for can not be null or of length 0");
\r
393 m_pattern_.targetText = pattern;
\r
398 * Set the target text to be searched. Text iteration will hence begin at
\r
399 * the start of the text string. This method is useful if you want to
\r
400 * re-use an iterator to search within a different body of text.
\r
401 * @param text new text iterator to look for match,
\r
402 * @exception IllegalArgumentException thrown when text is null or has
\r
407 public void setTarget(CharacterIterator text)
\r
409 super.setTarget(text);
\r
410 m_textBeginOffset_ = targetText.getBeginIndex();
\r
411 m_textLimitOffset_ = targetText.getEndIndex();
\r
412 m_colEIter_.setText(targetText);
\r
413 m_charBreakIter_.setText(targetText);
\r
418 * Sets the position in the target text which the next search will start
\r
419 * from to the argument. This method clears all previous states.
\r
422 * This method takes the argument position and sets the position in the
\r
423 * target text accordingly, without checking if position is pointing to a
\r
424 * valid starting point to begin searching.
\r
427 * Search positions that may render incorrect results are highlighted in
\r
428 * the class documentation.
\r
430 * @param position index to start next search from.
\r
431 * @exception IndexOutOfBoundsException thrown if argument position is out
\r
432 * of the target text range.
\r
436 public void setIndex(int position)
\r
438 super.setIndex(position);
\r
439 m_matchedIndex_ = DONE;
\r
440 m_colEIter_.setExactOffset(position);
\r
445 * Set the canonical match mode. See class documentation for details.
\r
446 * The default setting for this property is false.
\r
448 * @param allowCanonical flag indicator if canonical matches are allowed
\r
449 * @see #isCanonical
\r
452 public void setCanonical(boolean allowCanonical)
\r
454 m_isCanonicalMatch_ = allowCanonical;
\r
455 if (m_isCanonicalMatch_ == true) {
\r
456 if (m_canonicalPrefixAccents_ == null) {
\r
457 m_canonicalPrefixAccents_ = new StringBuilder();
\r
460 m_canonicalPrefixAccents_.delete(0,
\r
461 m_canonicalPrefixAccents_.length());
\r
463 if (m_canonicalSuffixAccents_ == null) {
\r
464 m_canonicalSuffixAccents_ = new StringBuilder();
\r
467 m_canonicalSuffixAccents_.delete(0,
\r
468 m_canonicalSuffixAccents_.length());
\r
473 // public miscellaneous methods -----------------------------------------
\r
477 * Resets the search iteration. All properties will be reset to the
\r
481 * Search will begin at the start of the target text if a forward iteration
\r
482 * is initiated before a backwards iteration. Otherwise if a
\r
483 * backwards iteration is initiated before a forwards iteration, the search
\r
484 * will begin at the end of the target text.
\r
487 * Canonical match option will be reset to false, ie an exact match.
\r
491 public void reset()
\r
493 // reset is setting the attributes that are already in string search,
\r
494 // hence all attributes in the collator should be retrieved without any
\r
497 m_isCanonicalMatch_ = false;
\r
498 m_ceMask_ = getMask(m_collator_.getStrength());
\r
499 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
\r
501 m_colEIter_.setCollator(m_collator_);
\r
502 m_colEIter_.reset();
\r
503 m_utilColEIter_.setCollator(m_collator_);
\r
506 // protected methods -----------------------------------------------------
\r
510 * Concrete method to provide the mechanism
\r
511 * for finding the next <b>forwards</b> match in the target text.
\r
512 * See super class documentation for its use.
\r
514 * @param start index in the target text at which the forwards search
\r
516 * @return the starting index of the next forwards match if found, DONE
\r
518 * @see #handlePrevious(int)
\r
522 protected int handleNext(int start)
\r
524 if (m_pattern_.m_CELength_ == 0) {
\r
526 if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
\r
527 m_matchedIndex_ = start;
\r
528 return m_matchedIndex_;
\r
531 targetText.setIndex(start);
\r
532 char ch = targetText.current();
\r
533 // ch can never be done, it is handled by next()
\r
534 char ch2 = targetText.next();
\r
535 if (ch2 == CharacterIterator.DONE) {
\r
536 m_matchedIndex_ = DONE;
\r
539 m_matchedIndex_ = targetText.getIndex();
\r
541 if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) {
\r
543 m_matchedIndex_ = targetText.getIndex();
\r
547 if (matchLength <= 0) {
\r
548 // we must have reversed direction after we reached the start
\r
549 // of the target text
\r
550 // see SearchIterator next(), it checks the bounds and returns
\r
551 // if it exceeds the range. It does not allow setting of
\r
553 if (start == m_textBeginOffset_) {
\r
554 m_matchedIndex_ = DONE;
\r
557 // for boundary check purposes. this will ensure that the
\r
558 // next match will not preceed the current offset
\r
559 // note search->matchedIndex will always be set to something
\r
561 m_matchedIndex_ = start - 1;
\r
565 // status checked below
\r
566 if (m_isCanonicalMatch_) {
\r
567 // can't use exact here since extra accents are allowed.
\r
568 handleNextCanonical(start);
\r
571 handleNextExact(start);
\r
574 if (m_matchedIndex_ == DONE) {
\r
575 targetText.setIndex(m_textLimitOffset_);
\r
578 targetText.setIndex(m_matchedIndex_);
\r
580 return m_matchedIndex_;
\r
585 * Concrete method to provide the mechanism
\r
586 * for finding the next <b>backwards</b> match in the target text.
\r
587 * See super class documentation for its use.
\r
589 * @param start index in the target text at which the backwards search
\r
591 * @return the starting index of the next backwards match if found, DONE
\r
593 * @see #handleNext(int)
\r
597 protected int handlePrevious(int start)
\r
599 if (m_pattern_.m_CELength_ == 0) {
\r
601 // start can never be DONE or 0, it is handled in previous
\r
602 targetText.setIndex(start);
\r
603 char ch = targetText.previous();
\r
604 if (ch == CharacterIterator.DONE) {
\r
605 m_matchedIndex_ = DONE;
\r
608 m_matchedIndex_ = targetText.getIndex();
\r
609 if (UTF16.isTrailSurrogate(ch)) {
\r
610 if (UTF16.isLeadSurrogate(targetText.previous())) {
\r
611 m_matchedIndex_ = targetText.getIndex();
\r
617 if (matchLength == 0) {
\r
618 // we must have reversed direction after we reached the end
\r
619 // of the target text
\r
620 // see SearchIterator next(), it checks the bounds and returns
\r
621 // if it exceeds the range. It does not allow setting of
\r
623 m_matchedIndex_ = DONE;
\r
625 if (m_isCanonicalMatch_) {
\r
626 // can't use exact here since extra accents are allowed.
\r
627 handlePreviousCanonical(start);
\r
630 handlePreviousExact(start);
\r
634 if (m_matchedIndex_ == DONE) {
\r
635 targetText.setIndex(m_textBeginOffset_);
\r
638 targetText.setIndex(m_matchedIndex_);
\r
640 return m_matchedIndex_;
\r
643 // private static inner classes ----------------------------------------
\r
645 private static class Pattern
\r
647 // protected methods -----------------------------------------------
\r
652 protected String targetText;
\r
654 * Array containing the collation elements of targetText
\r
656 protected int m_CE_[];
\r
658 * Number of collation elements in m_CE_
\r
660 protected int m_CELength_;
\r
662 * Flag indicator if targetText starts with an accent
\r
664 protected boolean m_hasPrefixAccents_;
\r
666 * Flag indicator if targetText ends with an accent
\r
668 protected boolean m_hasSuffixAccents_;
\r
670 * Default number of characters to shift for Boyer Moore
\r
672 protected int m_defaultShiftSize_;
\r
674 * Number of characters to shift for Boyer Moore, depending on the
\r
675 * source text to search
\r
677 protected char m_shift_[];
\r
679 * Number of characters to shift backwards for Boyer Moore, depending
\r
680 * on the source text to search
\r
682 protected char m_backShift_[];
\r
684 // protected constructors ------------------------------------------
\r
687 * Empty constructor
\r
689 protected Pattern(String pattern)
\r
691 targetText = pattern;
\r
692 m_CE_ = new int[INITIAL_ARRAY_SIZE_];
\r
694 m_hasPrefixAccents_ = false;
\r
695 m_hasSuffixAccents_ = false;
\r
696 m_defaultShiftSize_ = 1;
\r
697 m_shift_ = new char[MAX_TABLE_SIZE_];
\r
698 m_backShift_ = new char[MAX_TABLE_SIZE_];
\r
703 // private data members ------------------------------------------------
\r
706 * target text begin offset. Each targetText has a valid contiguous region
\r
707 * to iterate and this data member is the offset to the first such
\r
708 * character in the region.
\r
710 private int m_textBeginOffset_;
\r
712 * target text limit offset. Each targetText has a valid contiguous region
\r
713 * to iterate and this data member is the offset to 1 after the last such
\r
714 * character in the region.
\r
716 private int m_textLimitOffset_;
\r
718 * Upon completion of a search, m_matchIndex_ will store starting offset in
\r
719 * m_text for the match. The Value DONE is the default value.
\r
720 * If we are not at the start of the text or the end of the text and
\r
721 * m_matchedIndex_ is DONE it means that we can find any more matches in
\r
722 * that particular direction
\r
724 private int m_matchedIndex_;
\r
726 * Current pattern to search for
\r
728 private Pattern m_pattern_;
\r
730 * Collator whose rules are used to perform the search
\r
732 private RuleBasedCollator m_collator_;
\r
734 * The collation element iterator for the text source.
\r
736 private CollationElementIterator m_colEIter_;
\r
738 * Utility collation element, used throughout program for temporary
\r
741 private CollationElementIterator m_utilColEIter_;
\r
743 * The mask used on the collation elements to retrieve the valid strength
\r
746 private int m_ceMask_;
\r
748 * Buffer storing accents during a canonical search
\r
750 private StringBuilder m_canonicalPrefixAccents_;
\r
752 * Buffer storing accents during a canonical search
\r
754 private StringBuilder m_canonicalSuffixAccents_;
\r
756 * Flag to indicate if canonical search is to be done.
\r
757 * E.g looking for "a\u0300" in "a\u0318\u0300" will yield the match at 0.
\r
759 private boolean m_isCanonicalMatch_;
\r
761 * Character break iterator for boundary checking.
\r
763 private BreakIterator m_charBreakIter_;
\r
764 private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
\r
766 * Size of the shift tables
\r
768 private static final int MAX_TABLE_SIZE_ = 257;
\r
770 * Initial array size
\r
772 private static final int INITIAL_ARRAY_SIZE_ = 256;
\r
776 private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
\r
780 private static final int LAST_BYTE_MASK_ = 0xff;
\r
782 * Utility buffer for return values and temporary storage
\r
784 private int m_utilBuffer_[] = new int[2];
\r
786 * Unsigned 32-Bit Integer Mask
\r
788 private static final long UNSIGNED_32BIT_MASK = 0xffffffffL;
\r
790 // private methods -------------------------------------------------------
\r
793 * Hash a collation element from its full size (32 bits) down into a
\r
794 * value that can be used as an index into the shift tables. Right
\r
795 * now we do a modulus by the size of the hash table.
\r
796 * @param ce collation element
\r
797 * @return collapsed version of the collation element
\r
799 private static final int hash(int ce)
\r
801 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
\r
802 // well with the new collation where most of the latin 1 characters
\r
803 // are of the value xx000xxx. their hashes will most of the time be 0
\r
804 // to be discussed on the hash algo.
\r
805 return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_;
\r
808 private final char getFCD(int c) {
\r
809 return (char)m_nfcImpl_.getFCD16(c);
\r
812 * Gets the fcd value for a character at the argument index.
\r
813 * This method takes into accounts of the supplementary characters.
\r
814 * Note this method changes the offset in the character iterator.
\r
815 * @param str UTF16 string where character for fcd retrieval resides
\r
816 * @param offset position of the character whose fcd is to be retrieved
\r
817 * @return fcd value
\r
819 private final char getFCD(CharacterIterator str, int offset)
\r
821 char ch = str.setIndex(offset);
\r
822 int result = m_nfcImpl_.getFCD16FromSingleLead(ch);
\r
823 if (result != 0 && Character.isHighSurrogate(ch)) {
\r
824 char c2 = str.next();
\r
825 if (Character.isLowSurrogate(c2)) {
\r
826 result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2));
\r
831 return (char)result;
\r
834 * Gets the FCD value for the code point before the input offset.
\r
835 * Modifies the iterator's index.
\r
836 * @param iter text iterator
\r
837 * @param offset index after the character to test
\r
838 * @return FCD value for the character before offset
\r
840 private final int getFCDBefore(CharacterIterator iter, int offset) {
\r
842 iter.setIndex(offset);
\r
843 char c = iter.previous();
\r
844 if (UTF16.isSurrogate(c)) {
\r
845 if (Normalizer2Impl.UTF16Plus.isSurrogateLead(c)) {
\r
848 char lead = iter.previous();
\r
849 if (Character.isHighSurrogate(lead)) {
\r
850 result = m_nfcImpl_.getFCD16(Character.toCodePoint(lead, c));
\r
856 result = m_nfcImpl_.getFCD16FromSingleLead(c);
\r
861 * Gets the fcd value for a character at the argument index.
\r
862 * This method takes into accounts of the supplementary characters.
\r
863 * @param str UTF16 string where character for fcd retrieval resides
\r
864 * @param offset position of the character whose fcd is to be retrieved
\r
865 * @return fcd value
\r
867 private final char getFCD(String str, int offset)
\r
869 char ch = str.charAt(offset);
\r
870 int result = m_nfcImpl_.getFCD16FromSingleLead(ch);
\r
871 if (result != 0 && Character.isHighSurrogate(ch)) {
\r
873 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
\r
874 result = m_nfcImpl_.getFCD16(Character.toCodePoint(ch, c2));
\r
879 return (char)result;
\r
883 * Getting the modified collation elements taking into account the collation
\r
886 * @return the modified collation element
\r
888 private final int getCE(int ce)
\r
890 // note for tertiary we can't use the collator->tertiaryMask, that
\r
891 // is a preprocessed mask that takes into account case options. since
\r
892 // we are only concerned with exact matches, we don't need that.
\r
895 if (m_collator_.isAlternateHandlingShifted()) {
\r
896 // alternate handling here, since only the 16 most significant
\r
897 // digits is only used, we can safely do a compare without masking
\r
898 // if the ce is a variable, we mask and get only the primary values
\r
899 // no shifting to quartenary is required since all primary values
\r
900 // less than variabletop will need to be masked off anyway.
\r
901 if (((m_collator_.m_variableTopValue_ << 16) & UNSIGNED_32BIT_MASK) > (ce & UNSIGNED_32BIT_MASK)) {
\r
902 if (m_collator_.getStrength() == Collator.QUATERNARY) {
\r
903 ce = CollationElementIterator.primaryOrder(ce);
\r
906 ce = CollationElementIterator.IGNORABLE;
\r
915 * Appends a int to a int array, increasing the size of the array when
\r
916 * we are out of space.
\r
917 * @param offset in array to append to
\r
918 * @param value to append
\r
919 * @param array to append to
\r
920 * @return the array appended to, this could be a new and bigger array
\r
922 private static final int[] append(int offset, int value, int array[])
\r
924 if (offset >= array.length) {
\r
925 int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
\r
926 System.arraycopy(array, 0, temp, 0, array.length);
\r
929 array[offset] = value;
\r
934 * Initializing the ce table for a pattern. Stores non-ignorable collation
\r
935 * keys. Table size will be estimated by the size of the pattern text.
\r
936 * Table expansion will be perform as we go along. Adding 1 to ensure that
\r
937 * the table size definitely increases.
\r
938 * Internal method, status assumed to be a success.
\r
939 * @return total number of expansions
\r
941 private final int initializePatternCETable()
\r
943 m_utilColEIter_.setText(m_pattern_.targetText);
\r
947 int ce = m_utilColEIter_.next();
\r
949 while (ce != CollationElementIterator.NULLORDER) {
\r
950 int newce = getCE(ce);
\r
951 if (newce != CollationElementIterator.IGNORABLE) {
\r
952 m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_);
\r
955 result += m_utilColEIter_.getMaxExpansion(ce) - 1;
\r
956 ce = m_utilColEIter_.next();
\r
959 m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_);
\r
960 m_pattern_.m_CELength_ = offset;
\r
966 * Initializes the pattern struct.
\r
967 * Internal method, status assumed to be success.
\r
968 * @return expansionsize the total expansion size of the pattern
\r
// Initializes the pattern struct: records whether the pattern starts/ends
// with accents (FCD checks, skipped entirely at PRIMARY strength) and then
// delegates to initializePatternCETable(); returns its expansion count.
// NOTE(review): extraction-damaged listing — original lines 971, 975, 981
// (braces/else) are missing; verify against the original file.
970 private final int initializePattern()
972 if (m_collator_.getStrength() == Collator.PRIMARY) {
973 m_pattern_.m_hasPrefixAccents_ = false;
974 m_pattern_.m_hasSuffixAccents_ = false;
// prefix accent: FCD leading combining class of the first code point
976 m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0)
977 >> SECOND_LAST_BYTE_SHIFT_) != 0;
// suffix accent: FCD trailing combining class of the last code point
978 m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText.codePointBefore(
979 m_pattern_.targetText.length()))
980 & LAST_BYTE_MASK_) != 0;
982 // since initializePattern is an internal method status is a success.
983 return initializePatternCETable();
\r
987 * Initializing shift tables, with the default values.
\r
988 * If a corresponding default value is 0, the shift table is not set.
\r
989 * @param shift table for forwards shift
\r
990 * @param backshift table for backwards shift
\r
991 * @param cetable table containing pattern ce
\r
992 * @param cesize size of the pattern ces
\r
993 * @param expansionsize total size of the expansions
\r
994 * @param defaultforward the default forward value
\r
995 * @param defaultbackward the default backward value
\r
// Fills the Boyer-Moore forward (shift) and backward (backshift) tables
// from the pattern CE table, per the javadoc above.
// NOTE(review): extraction-damaged listing — original lines 998 (the
// `backshift` parameter), 1003, 1010, 1016, 1020, 1023, 1028 and the
// closing brace are missing; verify against the original file.
997 private final void setShiftTable(char shift[],
999 int cetable[], int cesize,
1000 int expansionsize,
1001 char defaultforward,
1002 char defaultbackward)
1004 // estimate the value to shift. to do that we estimate the smallest
1005 // number of characters to give the relevant ces, ie approximately
1006 // the number of ces minus their expansion, since expansions can come
1007 // from a character.
1008 for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
1009 shift[count] = defaultforward;
1011 cesize --; // down to the last index
1012 for (int count = 0; count < cesize; count ++) {
1013 // number of ces from right of array to the count
1014 int temp = defaultforward - count - 1;
1015 shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1;
// last CE of the pattern always shifts by one
1017 shift[hash(cetable[cesize])] = 1;
1018 // for ignorables we just shift by one. see test examples.
1019 shift[hash(0)] = 1;
1021 for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
1022 backshift[count] = defaultbackward;
1024 for (int count = cesize; count > 0; count --) {
1025 // the original value count does not seem to work
1026 backshift[hash(cetable[count])] = (char)(count > expansionsize ?
1027 count - expansionsize : 1);
1029 backshift[hash(cetable[0])] = 1;
1030 backshift[hash(0)] = 1;
\r
1034 * <p>Building of the pattern collation element list and the Boyer Moore
\r
1035 * StringSearch table.</p>
\r
1036 * <p>The canonical match will only be performed after the default match
\r
1038 * <p>For both cases we need to remember the size of the composed and
\r
1039 * decomposed versions of the string. Since the Boyer-Moore shift
\r
1040 * calculations shifts by a number of characters in the text and tries to
\r
1041 * match the pattern from that offset, the shift value can not be too large
\r
1042 * in case we miss some characters. To choose a right shift size, we
\r
1043 * estimate the NFC form of the pattern and use its size as a shift guide. The NFC

1044 * form should be the smallest possible representation of the pattern. Anyways,
\r
1045 * we'll err on the smaller shift size. Hence the calculation for
\r
1046 * minlength. Canonical match will be performed slightly differently. We'll
\r
1047 * split the pattern into 3 parts, the prefix accents (PA), the middle
\r
1048 * string bounded by the first and last base character (MS), the ending
\r
1049 * accents (EA). Matches will be done on MS first, and only when we match
\r
1050 * MS then some processing will be required for the prefix and end accents
\r
1051 * in order to determine if they match PA and EA. Hence the default shift
\r
1052 * values for the canonical match will take the size of either end's accent
\r
1053 * into consideration. Forwards search will take the end accents into
\r
1054 * consideration for the default shift values and the backwards search will
\r
1055 * take the prefix accents into consideration.</p>
\r
1056 * <p>If pattern has no non-ignorable ce, we return an illegal argument
\r
// Builds the pattern CE list and the Boyer-Moore shift tables (see javadoc
// above). The default shift is the CE count minus the expansion count,
// clamped to at least 1; a pattern with no non-ignorable CEs gets a
// default shift size of 0.
// NOTE(review): extraction-damaged listing — original lines 1060, 1069-1070
// (brace/else) and the method close are missing; verify against the
// original file.
1059 private final void initialize()
1061 int expandlength = initializePattern();
1062 if (m_pattern_.m_CELength_ > 0) {
1063 char minlength = (char)(m_pattern_.m_CELength_ > expandlength
1064 ? m_pattern_.m_CELength_ - expandlength : 1);
1065 m_pattern_.m_defaultShiftSize_ = minlength;
1066 setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_,
1067 m_pattern_.m_CE_, m_pattern_.m_CELength_,
1068 expandlength, minlength, minlength);
1071 m_pattern_.m_defaultShiftSize_ = 0;
\r
1076 * Determine whether the search text bounded by the offset start and end is
\r
1077 * one or more whole units of text as determined by the breakiterator in
\r
1079 * @param start target text start offset
\r
1080 * @param end target text end offset
\r
// Tests whether [start, end) is bounded by break-iterator boundaries, and
// additionally (in the visible tail) re-walks the CEs of the candidate span
// to compare against the pattern CE table.
// NOTE(review): extraction-damaged listing — original lines 1083, 1087,
// 1091-1092, 1100, 1105, 1108-1110, 1112-1114, 1119 and 1123+ are missing
// (braces, returns, loop bodies); verify against the original file.
1082 private final boolean isBreakUnit(int start, int end)
1084 if (breakIterator != null) {
1085 int startindex = breakIterator.first();
1086 int endindex = breakIterator.last();
1088 // out-of-range indexes are never boundary positions
1089 if (start < startindex || start > endindex || end < startindex
1090 || end > endindex) {
1093 // otherwise, we can use following() on the position before the
1094 // specified one and return true if the position we get back is the
1095 // one the user specified
1096 boolean result = (start == startindex
1097 || breakIterator.following(start - 1) == start)
1098 && (end == endindex
1099 || breakIterator.following(end - 1) == end);
1101 // iterates the individual ces
1102 m_utilColEIter_.setText(
1103 new CharacterIteratorWrapper(targetText), start);
1104 for (int count = 0; count < m_pattern_.m_CELength_;
1106 int ce = getCE(m_utilColEIter_.next());
1107 if (ce == CollationElementIterator.IGNORABLE) {
1111 if (ce != m_pattern_.m_CE_[count]) {
// skip trailing ignorables that still fall within the match end
1115 int nextce = m_utilColEIter_.next();
1116 while (m_utilColEIter_.getOffset() == end
1117 && getCE(nextce) == CollationElementIterator.IGNORABLE) {
1118 nextce = m_utilColEIter_.next();
1120 if (nextce != CollationElementIterator.NULLORDER
1121 && m_utilColEIter_.getOffset() == end) {
1122 // extra collation elements at the end of the match
\r
1132 * Getting the next base character offset if current offset is an accent,
\r
1133 * or the current offset if the current character contains a base character.
\r
1134 * accents the following base character will be returned
\r
1135 * @param text string
\r
1136 * @param textoffset current offset
\r
1137 * @param textlength length of text string
\r
1138 * @return the next base character or the current offset
\r
1139 * if the current character contains a base character.
\r
// Returns the offset of the next base character (per FCD leading combining
// class) at or after textoffset, or textoffset itself when it already sits
// on a base character or at the end of the text.
// NOTE(review): extraction-damaged listing — original lines 1142, 1145,
// 1148, 1151, 1157 and the loop/return tail past 1158 are missing; verify
// against the original file.
1141 private final int getNextBaseOffset(CharacterIterator text, int textoffset)
1143 if (textoffset >= text.getEndIndex()) {
1144 return textoffset;
1146 // iteration ends with reading CharacterIterator.DONE which has fcd==0
1147 char c = text.setIndex(textoffset);
// single-lead fast path: leading combining class 0 means base character
1149 if ((m_nfcImpl_.getFCD16FromSingleLead(c) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1150 return textoffset;
1152 char next = text.next();
1153 if (Character.isSurrogatePair(c, next)) {
1154 int fcd = m_nfcImpl_.getFCD16(Character.toCodePoint(c, next));
1155 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1156 return textoffset;
1158 next = text.next();
\r
1168 * Gets the next base character offset depending on the string search
\r
1170 * @param textoffset one offset away from the last character
\r
1172 * @return start index of the next base character or the current offset
\r
1173 * if the current character is contains a base character.
\r
// Convenience overload over the target text: only scans forward for a base
// character when the pattern has suffix accents and the preceding character
// has a non-zero FCD trailing combining class.
// NOTE(review): extraction-damaged listing — original lines 1176, 1180-1181
// and the method close are missing; verify against the original file.
1175 private final int getNextBaseOffset(int textoffset)
1177 if (m_pattern_.m_hasSuffixAccents_ && textoffset < m_textLimitOffset_) {
1178 if ((getFCDBefore(targetText, textoffset) & LAST_BYTE_MASK_) != 0) {
1179 return getNextBaseOffset(targetText, textoffset);
1182 return textoffset;
\r
1186 * Shifting the collation element iterator position forward to prepare for
\r
1187 * a following match. If the last character is a unsafe character, we'll
\r
1188 * only shift by 1 to capture contractions, normalization etc.
\r
1189 * Internal method, status assumed to be success.
\r
1190 * @param textoffset start text position to do search
\r
1191 * @param ce the text ce which failed the match.
\r
1192 * @param patternceindex index of the ce within the pattern ce buffer which
\r
1193 * failed the match
\r
1194 * @return final offset
\r
// Advances textoffset for the next forward match attempt: uses the
// Boyer-Moore shift table entry for the failing CE (reduced when the
// failure happened mid-pattern), or the default shift when the text ran
// out (NULLORDER), then normalizes to the next base character.
// NOTE(review): extraction-damaged listing — original lines 1197-1198,
// 1206, 1208-1209, 1211-1212 and the close are missing; verify against the
// original file.
1196 private int shiftForward(int textoffset, int ce, int patternceindex)
1199 if (ce != CollationElementIterator.NULLORDER) {
1200 int shift = m_pattern_.m_shift_[hash(ce)];
1201 // this is to adjust for characters in the middle of the
1202 // substring for matching that failed.
1203 int adjust = m_pattern_.m_CELength_ - patternceindex;
1204 if (adjust > 1 && shift >= adjust) {
1205 shift -= adjust - 1;
1207 textoffset += shift;
1210 textoffset += m_pattern_.m_defaultShiftSize_;
1213 textoffset = getNextBaseOffset(textoffset);
1214 // check for unsafe characters
1215 // * if it is the start or middle of a contraction: to be done after
1216 // an initial match is found
1217 // * thai or lao base consonant character: similar to contraction
1218 // * high surrogate character: similar to contraction
1219 // * next character is an accent: shift to the next base character
1220 return textoffset;
\r
1224 * Gets the offset to the next safe point in text.
\r
1225 * ie. not the middle of a contraction, swappable characters or
\r
1226 * supplementary characters.
\r
1227 * @param textoffset offset in string
\r
1228 * @param end offset in string
\r
1229 * @return offset to the next safe character
\r
// Walks forward from textoffset while the collator marks the current
// character unsafe (contraction/swappable/supplementary), stopping at
// `end`; returns the first safe offset.
// NOTE(review): extraction-damaged listing — original lines 1232, 1237 (the
// result increment) and 1239+ (return/close) are missing; verify against
// the original file.
1231 private final int getNextSafeOffset(int textoffset, int end)
1233 int result = textoffset; // first contraction character
1234 targetText.setIndex(result);
1235 while (result != end &&
1236 m_collator_.isUnsafe(targetText.current())) {
1238 targetText.setIndex(result);
\r
1244 * This checks for accents in the potential match started with a composite
\r
1246 * This is really painful... we have to check that composite character do
\r
1247 * not have any extra accents. We have to normalize the potential match and
\r
1248 * find the immediate decomposed character before the match.
\r
1249 * The first composite character would have been taken care of by the fcd
\r
1250 * checks in checkForwardExactMatch.
\r
1251 * This is the slow path after the fcd of the first character and
\r
1252 * the last character has been checked by checkForwardExactMatch and we
\r
1253 * determine that the potential match has extra non-ignorable preceding
\r
1255 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
\r
1256 * checkExtraMatchAccent should fail since there is a middle ring in
\r
1257 * \u01FA Note here that accents checking are slow and cautioned in the API
\r
1259 * Internal method, status assumed to be a success, caller should check
\r
1260 * status before calling this method
\r
1261 * @param start index of the potential unfriendly composite character
\r
1262 * @param end index of the potential unfriendly composite character
\r
1263 * @return true if there is non-ignorable accents before at the beginning
\r
1264 * of the match, false otherwise.
\r
// Slow-path check (see javadoc above) that a potential match starting in a
// composite character has no extra non-ignorable accents before the first
// pattern CE: decomposes the leading unsafe span and inspects the CE and
// combining class immediately before the first matching CE.
// NOTE(review): extraction-damaged listing — original lines 1267, 1271,
// 1275-1276, 1283-1284, 1291, 1297, 1299 and 1305+ (braces, `offset`
// declaration, return) are missing; verify against the original file.
1266 private final boolean checkExtraMatchAccents(int start, int end)
1268 boolean result = false;
1269 if (m_pattern_.m_hasPrefixAccents_) {
1270 targetText.setIndex(start);
// back up if we landed between a surrogate pair
1272 if (UTF16.isLeadSurrogate(targetText.next())) {
1273 if (!UTF16.isTrailSurrogate(targetText.next())) {
1274 targetText.previous();
1277 // we are only concerned with the first composite character
1278 String str = getString(targetText, start, end);
1279 if (Normalizer.quickCheck(str, Normalizer.NFD,0)
1280 == Normalizer.NO) {
1281 int safeoffset = getNextSafeOffset(start, end);
1282 if (safeoffset != end) {
1285 String decomp = Normalizer.decompose(
1286 str.substring(0, safeoffset - start), false);
1287 m_utilColEIter_.setText(decomp);
1288 int firstce = m_pattern_.m_CE_[0];
1289 boolean ignorable = true;
1290 int ce = CollationElementIterator.IGNORABLE;
// scan until the first pattern CE, noting any non-ignorable CE seen
1292 while (ce != firstce) {
1293 offset = m_utilColEIter_.getOffset();
1294 if (ce != firstce
1295 && ce != CollationElementIterator.IGNORABLE) {
1296 ignorable = false;
1298 ce = m_utilColEIter_.next();
1300 m_utilColEIter_.setExactOffset(offset); // back up 1 to the
1301 m_utilColEIter_.previous(); // right offset
1302 offset = m_utilColEIter_.getOffset();
// non-ignorable CE with a combining mark right before the match => fail
1303 result = !ignorable && (UCharacter.getCombiningClass(
1304 UTF16.charAt(decomp, offset)) != 0);
\r
1312 * Used by exact matches, checks if there are accents before the match.
\r
1313 * This is really painful... we have to check that composite characters at
\r
1314 * the start of the matches have to not have any extra accents.
\r
1315 * We check the FCD of the character first, if it starts with an accent and
\r
1316 * the first pattern ce does not match the first ce of the character, we
\r
1318 * Otherwise we try normalizing the first composite
\r
1319 * character and find the immediate decomposed character before the match to
\r
1320 * see if it is an non-ignorable accent.
\r
1321 * Now normalizing the first composite character is enough because we ensure
\r
1322 * that when the match is passed in here with extra beginning ces, the
\r
1323 * first or last ce that match has to occur within the first character.
\r
1324 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
\r
1325 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
\r
1326 * Note here that accents checking are slow and cautioned in the API docs.
\r
1327 * @param start offset
\r
1328 * @param end offset
\r
1329 * @return true if there are accents on either side of the match,
\r
// Exact-match guard (see javadoc above): detects non-ignorable accents
// preceding the match, both inside a leading composite character (via the
// normalization buffer / checkExtraMatchAccents) and immediately before
// `start` in the surrounding text.
// NOTE(review): extraction-damaged listing — original lines 1333, 1343,
// 1345, 1348-1351, 1353-1354, 1356-1359, 1364 and 1369+ are missing
// (braces, returns, the fcd comparison tail); verify against the original
// file.
1332 private final boolean hasAccentsBeforeMatch(int start, int end)
1334 if (m_pattern_.m_hasPrefixAccents_) {
1335 // we have been iterating forwards previously
1336 boolean ignorable = true;
1337 int firstce = m_pattern_.m_CE_[0];
1338 m_colEIter_.setExactOffset(start);
1339 int ce = getCE(m_colEIter_.next());
1340 while (ce != firstce) {
1341 if (ce != CollationElementIterator.IGNORABLE) {
1342 ignorable = false;
1344 ce = getCE(m_colEIter_.next());
1346 if (!ignorable && m_colEIter_.isInBuffer()) {
1347 // within normalization buffer, discontiguous handled here
// leading-accent FCD check on the first character of the match
1352 boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_)
1355 return checkExtraMatchAccents(start, end);
// also inspect the character immediately before the match start
1360 if (start > m_textBeginOffset_) {
1361 targetText.setIndex(start);
1362 targetText.previous();
1363 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_)
1365 m_colEIter_.setExactOffset(start);
1366 ce = m_colEIter_.previous();
1367 if (ce != CollationElementIterator.NULLORDER
1368 && ce != CollationElementIterator.IGNORABLE) {
\r
1379 * Used by exact matches, checks if there are accents bounding the match.
\r
1380 * Note this is the initial boundary check. If the potential match
\r
1381 * starts or ends with composite characters, the accents in those
\r
1382 * characters will be determined later.
\r
1383 * Not doing backwards iteration here, since discontiguous contraction for
\r
1384 * backwards collation element iterator, use up too many characters.
\r
1385 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
\r
1386 * should fail since there is a acute at the end of \u01FA
\r
1387 * Note here that accents checking are slow and cautioned in the API docs.
\r
1388 * @param start offset of match
\r
1389 * @param end end offset of the match
\r
1390 * @return true if there are accents on either side of the match,
\r
// Exact-match guard (see javadoc above): detects non-ignorable accents
// trailing the match by re-walking the match CEs forward and checking for
// leftover non-ignorable CEs within `end`, plus an FCD check at `end`.
// NOTE(review): extraction-damaged listing — original lines 1394, 1402-1403,
// 1408-1409, 1413-1416, 1421-1422, 1426-1427 and 1429+ are missing (braces,
// `count` declaration/increment, returns); verify against the original
// file.
1393 private final boolean hasAccentsAfterMatch(int start, int end)
1395 if (m_pattern_.m_hasSuffixAccents_) {
1396 targetText.setIndex(end);
// step back over a surrogate pair boundary at the match end
1397 if (end > m_textBeginOffset_
1398 && UTF16.isTrailSurrogate(targetText.previous())) {
1399 if (targetText.getIndex() > m_textBeginOffset_ &&
1400 !UTF16.isLeadSurrogate(targetText.previous())) {
1401 targetText.next();
1404 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1405 int firstce = m_pattern_.m_CE_[0];
1406 m_colEIter_.setExactOffset(start);
// skip to the first CE of the match, then consume the pattern's CEs
1407 while (getCE(m_colEIter_.next()) != firstce) {
1410 while (count < m_pattern_.m_CELength_) {
1411 if (getCE(m_colEIter_.next())
1412 == CollationElementIterator.IGNORABLE) {
1417 //int ce = getCE(m_colEIter_.next());
1418 int ce = m_colEIter_.next();
1419 if (ce != CollationElementIterator.NULLORDER
1420 && ce != CollationElementIterator.IGNORABLE) {
1423 if (ce != CollationElementIterator.NULLORDER
1424 && ce != CollationElementIterator.IGNORABLE) {
1425 if (m_colEIter_.getOffset() <= end) {
1428 if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_)
\r
1439 * Checks if the offset runs out of the text string range
\r
1440 * @param textstart offset of the first character in the range
\r
1441 * @param textlimit limit offset of the text string range
\r
1442 * @param offset to test
\r
1443 * @return true if offset is out of bounds, false otherwise
\r
// True when `offset` lies outside the inclusive range
// [textstart, textlimit].
// NOTE(review): extraction-damaged listing — original lines 1446-1447 (the
// `offset` parameter and opening brace) and the close are missing; verify
// against the original file.
1445 private static final boolean isOutOfBounds(int textstart, int textlimit,
1448 return offset < textstart || offset > textlimit;
\r
1452 * Checks for identical match
\r
1453 * @param strsrch string search data
\r
1454 * @param start offset of possible match
\r
1455 * @param end offset of possible match
\r
1456 * @return true if identical match is found
\r
// At IDENTICAL strength only: compares the NFD forms of the matched text
// span and the pattern for exact string equality; trivially passes at any
// other strength.
// NOTE(review): extraction-damaged listing — original lines 1459, 1461-1463,
// 1468, 1473 and the close are missing (braces, the early `return true`);
// verify against the original file.
1458 private final boolean checkIdentical(int start, int end)
1460 if (m_collator_.getStrength() != Collator.IDENTICAL) {
1464 String textstr = getString(targetText, start, end - start);
1465 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
1466 == Normalizer.NO) {
1467 textstr = Normalizer.decompose(textstr, false);
1469 String patternstr = m_pattern_.targetText;
1470 if (Normalizer.quickCheck(patternstr, Normalizer.NFD,0)
1471 == Normalizer.NO) {
1472 patternstr = Normalizer.decompose(patternstr, false);
1474 return textstr.equals(patternstr);
\r
1478 * Checks to see if the match is repeated
\r
1479 * @param start new match start index
\r
1480 * @param limit new match limit index
\r
1481 * @return true if the the match is repeated, false otherwise
\r
// True when [start, limit) repeats or overlaps the previous match: any
// overlap counts when overlapping mode is off; only full containment of
// the previous match counts when it is on.
// NOTE(review): extraction-damaged listing — original lines 1484, 1486-1487,
// 1494-1495 and the close are missing; verify against the original file.
1483 private final boolean checkRepeatedMatch(int start, int limit)
1485 if (m_matchedIndex_ == DONE) {
1488 int end = limit - 1; // last character in the match
1489 int lastmatchend = m_matchedIndex_ + matchLength - 1;
1490 if (!isOverlapping()) {
1491 return (start >= m_matchedIndex_ && start <= lastmatchend)
1492 || (end >= m_matchedIndex_ && end <= lastmatchend)
1493 || (start <= m_matchedIndex_ && end >= lastmatchend);
1496 return start <= m_matchedIndex_ && end >= lastmatchend;
\r
1500 * Checks match for contraction.
\r
1501 * If the match ends with a partial contraction we fail.
\r
1502 * If the match starts too far off (because of backwards iteration) we try
\r
1503 * to chip off the extra characters depending on whether a breakiterator
\r
1505 * Temporary utility buffer used to return modified start and end.
\r
1506 * @param start offset of potential match, to be modified if necessary
\r
1507 * @param end offset of potential match, to be modified if necessary
\r
1508 * @return true if match passes the contraction test, false otherwise.
\r
// Contraction check for a forward exact match (see javadoc above): when a
// character adjacent to either match boundary is collator-unsafe, replays
// the CEs from `start` (skipping buffered CEs left by setOffset) and
// re-verifies them against the pattern; writes possibly-adjusted start/end
// into m_utilBuffer_[0]/[1].
// NOTE(review): extraction-damaged listing — original lines 1511, 1514,
// 1518, 1523, 1530, 1541, 1543, 1545-1547, 1551-1552, 1555, 1557, 1559,
// 1563-1567 and 1570+ are missing (`endchar`/`temp`/`count` declarations,
// braces, returns); verify against the original file.
1510 private final boolean checkNextExactContractionMatch(int start, int end)
1512 // This part checks if either ends of the match contains potential
1513 // contraction. If so we'll have to iterate through them
1515 if (end < m_textLimitOffset_) {
1516 targetText.setIndex(end);
1517 endchar = targetText.current();
1519 char poststartchar = 0;
1520 if (start + 1 < m_textLimitOffset_) {
1521 targetText.setIndex(start + 1);
1522 poststartchar = targetText.current();
1524 if (m_collator_.isUnsafe(endchar)
1525 || m_collator_.isUnsafe(poststartchar)) {
1526 // expansion prefix, what's left to iterate
1527 int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
1528 boolean hasBufferedCE = bufferedCEOffset > 0;
1529 m_colEIter_.setExactOffset(start);
1531 while (bufferedCEOffset > 0) {
1532 // getting rid of the redundant ce, caused by setOffset.
1533 // since backward contraction/expansion may have extra ces if
1534 // we are in the normalization buffer, hasAccentsBeforeMatch
1535 // would have taken care of it.
1536 // E.g. the character \u01FA will have an expansion of 3, but
1537 // if we are only looking for acute and ring \u030A and \u0301,
1538 // we'll have to skip the first ce in the expansion buffer.
1539 m_colEIter_.next();
1540 if (m_colEIter_.getOffset() != temp) {
1542 temp = m_colEIter_.getOffset();
1544 bufferedCEOffset --;
// re-verify the candidate's CEs against the pattern CE table
1548 while (count < m_pattern_.m_CELength_) {
1549 int ce = getCE(m_colEIter_.next());
1550 if (ce == CollationElementIterator.IGNORABLE) {
1553 if (hasBufferedCE && count == 0
1554 && m_colEIter_.getOffset() != temp) {
1556 temp = m_colEIter_.getOffset();
1558 if (ce != m_pattern_.m_CE_[count]) {
1560 end = getNextBaseOffset(end);
1561 m_utilBuffer_[0] = start;
1562 m_utilBuffer_[1] = end;
// fast path: no unsafe boundary characters, pass the range through
1568 m_utilBuffer_[0] = start;
1569 m_utilBuffer_[1] = end;
\r
1575 * Checks and sets the match information if found.
\r
1578 * <li> the potential match does not repeat the previous match
\r
1579 * <li> boundaries are correct
\r
1580 * <li> exact matches has no extra accents
\r
1581 * <li> identical matches
\r
1582 * <li> potential match does not end in the middle of a contraction
\r
1584 * Otherwise the offset will be shifted to the next character.
\r
1585 * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
\r
1586 * more fitting result value.
\r
1587 * Uses the temporary utility buffer for storing the modified textoffset.
\r
1588 * @param textoffset offset in the collation element text.
\r
1589 * @return true if the match is valid, false otherwise
\r
// Validates a candidate forward exact match (see javadoc above): runs the
// contraction check, then rejects on repeated match, bad break boundaries,
// surrounding accents, or failed identical check; on success records
// m_matchedIndex_ and matchLength. Adjusted offsets travel through
// m_utilBuffer_.
// NOTE(review): extraction-damaged listing — original lines 1592, 1597-1599,
// 1608, 1611-1613, 1616-1617 and 1621+ are missing (braces, `return false`
// / `return true` statements); verify against the original file.
1591 private final boolean checkNextExactMatch(int textoffset)
1593 int start = m_colEIter_.getOffset();
1594 if (!checkNextExactContractionMatch(start, textoffset)) {
1595 // returns the modified textoffset
1596 m_utilBuffer_[0] = m_utilBuffer_[1];
1600 start = m_utilBuffer_[0];
1601 textoffset = m_utilBuffer_[1];
1602 // this totally matches, however we need to check if it is repeating
1603 if (!isBreakUnit(start, textoffset)
1604 || checkRepeatedMatch(start, textoffset)
1605 || hasAccentsBeforeMatch(start, textoffset)
1606 || !checkIdentical(start, textoffset)
1607 || hasAccentsAfterMatch(start, textoffset)) {
1609 textoffset = getNextBaseOffset(textoffset);
1610 m_utilBuffer_[0] = textoffset;
1614 if (m_collator_.getStrength() == Collator.PRIMARY) {
1615 textoffset = checkBreakBoundary(textoffset);
1618 // totally match, we will get rid of the ending ignorables.
1619 m_matchedIndex_ = start;
1620 matchLength = textoffset - start;
\r
1625 * Getting the previous base character offset, or the current offset if the
\r
1626 * current character is a base character
\r
1627 * @param text the source text to work on
\r
1628 * @param textoffset one offset after the current character
\r
1629 * @return the offset of the next character after the base character or the
\r
1630 * first composed character with accents
\r
// Walks backwards from textoffset to the previous base character
// (FCD leading combining class 0), handling surrogate pairs; returns the
// current offset unchanged when it already follows a base character.
// NOTE(review): extraction-damaged listing — original lines 1633-1634,
// 1636, 1642-1644, 1650-1652, 1655-1657 and the close are missing (the
// `textoffset` parameter line, braces, loop structure); verify against the
// original file.
1632 private final int getPreviousBaseOffset(CharacterIterator text,
1635 if (textoffset > m_textBeginOffset_) {
1637 int result = textoffset;
1638 text.setIndex(result);
// step back over a full surrogate pair if present
1639 if (UTF16.isTrailSurrogate(text.previous())) {
1640 if (text.getIndex() != text.getBeginIndex() &&
1641 !UTF16.isLeadSurrogate(text.previous())) {
1645 textoffset = text.getIndex();
1646 char fcd = getFCD(text, textoffset);
1647 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1648 if ((fcd & LAST_BYTE_MASK_) != 0) {
1649 return textoffset;
1653 if (textoffset == m_textBeginOffset_) {
1654 return m_textBeginOffset_;
1658 return textoffset;
\r
1662 * Getting the indexes of the accents that are not blocked in the argument
\r
1664 * @param accents accents in nfd.
\r
1665 * @param accentsindex array to store the indexes of accents in accents that
\r
1667 * @return the length of populated accentsindex
\r
// Records into accentsindex the start index of each run of accents sharing
// a combining class (unblocked groups) within the NFD accent buffer;
// terminates the index list with the buffer length. Returns the populated
// count per the javadoc above.
// NOTE(review): extraction-damaged listing — original lines 1671-1672,
// 1674-1675, 1682-1683, 1685-1690 and 1692+ are missing (`index`/`cclass`/
// `result` declarations, increments, return); verify against the original
// file.
1669 private int getUnblockedAccentIndex(StringBuilder accents,
1670 int accentsindex[])
1673 int length = accents.length();
1676 while (index < length) {
1677 int codepoint = UTF16.charAt(accents, index);
1678 int tempclass = UCharacter.getCombiningClass(codepoint);
// a new combining class starts a new unblocked accent group
1679 if (tempclass != cclass) {
1680 cclass = tempclass;
1681 accentsindex[result] = index;
1684 if (UCharacter.isSupplementary(codepoint)) {
// sentinel entry: end of the accent buffer
1691 accentsindex[result] = length;
\r
1696 * Appends 3 StringBuilder/CharacterIterator together into a destination
\r
1698 * @param source1 string buffer
\r
1699 * @param source2 character iterator
\r
1700 * @param start2 start of the character iterator to merge
\r
1701 * @param end2 end of the character iterator to merge
\r
1702 * @param source3 string buffer
\r
1703 * @return appended string buffer
\r
// Concatenates source1, the [start2, end2) slice of the character
// iterator, and source3 into a new StringBuilder; null/empty buffers are
// skipped.
// NOTE(review): extraction-damaged listing — original lines 1709, 1713,
// 1717-1718 (presumably the iterator advance and loop close) and 1721+
// (return) are missing — as shown the while loop has no visible
// source2.next(); verify against the original file.
1705 private static final StringBuilder merge(StringBuilder source1,
1706 CharacterIterator source2,
1707 int start2, int end2,
1708 StringBuilder source3)
1710 StringBuilder result = new StringBuilder();
1711 if (source1 != null && source1.length() != 0) {
1712 result.append(source1);
1714 source2.setIndex(start2);
1715 while (source2.getIndex() < end2) {
1716 result.append(source2.current());
1719 if (source3 != null && source3.length() != 0) {
1720 result.append(source3);
\r
1726 * Running through a collation element iterator to see if the contents
\r
1727 * matches pattern in string search data
\r
1728 * @param coleiter collation element iterator to test
\r
1729 * @return true if a match if found, false otherwise
\r
// Runs the given collation element iterator against the pattern CE table,
// skipping ignorables; per the javadoc above, true when every pattern CE
// is matched.
// NOTE(review): extraction-damaged listing — original lines 1732, 1734,
// 1738-1739, 1741-1743 and 1745+ are missing (`offset` declaration and
// increment, returns); verify against the original file.
1731 private final boolean checkCollationMatch(CollationElementIterator coleiter)
1733 int patternceindex = m_pattern_.m_CELength_;
1735 while (patternceindex > 0) {
1736 int ce = getCE(coleiter.next());
1737 if (ce == CollationElementIterator.IGNORABLE) {
1740 if (ce != m_pattern_.m_CE_[offset]) {
1744 patternceindex --;
\r
1750 * Rearranges the front accents to try matching.
\r
1751 * Prefix accents in the text will be grouped according to their combining
\r
1752 * class and the groups will be mixed and matched to try find the perfect
\r
1753 * match with the pattern.
\r
1754 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
1755 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
1756 * substrings "\u030A", "\u0301", "\u0325", "\u030A\u0301",
\r
1757 * "\u030A\u0325", "\u0301\u0325".
\r
1758 * step 2: check if any of the generated substrings matches the pattern.
\r
1759 * Internal method, status is assumed to be success, caller has to check
\r
1760 * status before calling this method.
\r
1761 * @param start first offset of the accents to start searching
\r
1762 * @param end start of the last accent set
\r
1763 * @return DONE if a match is not found, otherwise return the starting
\r
1764 * offset of the match. Note this start includes all preceding
\r
// Canonical-match prefix rearrangement (see javadoc above): decomposes the
// leading accent run, enumerates every subset of unblocked accent groups
// via the `count` bitmask, merges each candidate prefix with the rest of
// the match, and tests it with checkCollationMatch.
// NOTE(review): extraction-damaged listing — original lines 1768,
// 1771-1773, 1777, 1784, 1786, 1794 (`int k = 0`?), 1797, 1804, 1806-1808,
// 1812 and 1817+ are missing (braces, returns, loop counters); verify
// against the original file.
1767 private int doNextCanonicalPrefixMatch(int start, int end)
1769 if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
1770 // die... failed at a base character
1774 start = targetText.getIndex(); // index changed by fcd
1775 int offset = getNextBaseOffset(targetText, start);
1776 start = getPreviousBaseOffset(start);
1778 StringBuilder accents = new StringBuilder();
1779 String accentstr = getString(targetText, start, offset - start);
1780 // normalizing the offensive string
1781 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
1782 == Normalizer.NO) {
1783 accentstr = Normalizer.decompose(accentstr, false);
1785 accents.append(accentstr);
1787 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
1788 int accentsize = getUnblockedAccentIndex(accents, accentsindex);
// bitmask over accent groups: (2 << (n-1)) - 1 == all groups selected
1789 int count = (2 << (accentsize - 1)) - 1;
1790 while (count > 0) {
1791 // copy the base characters
1792 m_canonicalPrefixAccents_.delete(0,
1793 m_canonicalPrefixAccents_.length());
1795 for (; k < accentsindex[0]; k ++) {
1796 m_canonicalPrefixAccents_.append(accents.charAt(k));
1798 // forming all possible canonical rearrangement by dropping
1799 // sets of accents
1800 for (int i = 0; i <= accentsize - 1; i ++) {
1801 int mask = 1 << (accentsize - i - 1);
1802 if ((count & mask) != 0) {
1803 for (int j = accentsindex[i]; j < accentsindex[i + 1];
1805 m_canonicalPrefixAccents_.append(accents.charAt(j));
1809 StringBuilder match = merge(m_canonicalPrefixAccents_,
1810 targetText, offset, end,
1811 m_canonicalSuffixAccents_);
1813 // if status is a failure, ucol_setText does nothing.
1814 // run the collator iterator through this match
1815 m_utilColEIter_.setText(match.toString());
1816 if (checkCollationMatch(m_utilColEIter_)) {
\r
1825 * Gets the offset to the safe point in text before textoffset.
\r
1826 * ie. not the middle of a contraction, swappable characters or
\r
1827 * supplementary characters.
\r
1828 * @param start offset in string
\r
1829 * @param textoffset offset in string
\r
1830 * @return offset to the previous safe character
\r
// Walks backwards from textoffset while characters are collator-unsafe,
// bounded below by `start`; returns the offset of the safe point (the
// first contraction character itself counts as unsafe).
// NOTE(review): extraction-damaged listing — original lines 1833, 1838 and
// 1842+ (braces, return) are missing; verify against the original file.
1832 private final int getPreviousSafeOffset(int start, int textoffset)
1834 int result = textoffset; // first contraction character
1835 targetText.setIndex(textoffset);
1836 while (result >= start && m_collator_.isUnsafe(targetText.previous())) {
1837 result = targetText.getIndex();
1839 if (result != start) {
1840 // the first contraction character is considered unsafe here
1841 result = targetText.getIndex(); // originally result --;
\r
1847 * Take the rearranged end accents and tries matching. If match failed at
\r
1848 * a separate preceding set of accents (separated from the rearranged ones by
\r
1849 * at least a base character) then we rearrange the preceding accents and
\r
1850 * tries matching again.
\r
1851 * We allow skipping of the ends of the accent set if the ces do not match.
\r
1852 * However if the failure is found before the accent set, it fails.
\r
1853 * Internal method, status assumed to be success, caller has to check
\r
1854 * status before calling this method.
\r
1855 * @param textoffset of the start of the rearranged accent
\r
1856 * @return DONE if a match is not found, otherwise return the starting
\r
1857 * offset of the match. Note this start includes all preceding
\r
// Canonical-match suffix pass (see javadoc above): builds a "safe text"
// from the unsafe tail of the target plus the rearranged suffix accents,
// walks its CEs backwards against the pattern, falls back to the main
// iterator when the safe buffer is exhausted, and on mismatch before the
// accent set retries via doNextCanonicalPrefixMatch.
// NOTE(review): extraction-damaged listing — original lines 1861, 1865,
// 1870, 1874-1875, 1877-1878, 1883, 1886, 1892-1893, 1897, 1900-1902,
// 1910-1913, 1915-1916, 1919, 1923-1926, 1928-1930, 1932, 1938-1939, 1941,
// 1944-1946 are missing (braces, DONE returns, isSafe bookkeeping); verify
// against the original file.
1860 private int doNextCanonicalSuffixMatch(int textoffset)
1862 int safelength = 0;
1863 StringBuilder safetext;
1864 int safeoffset = m_textBeginOffset_;
1866 if (textoffset != m_textBeginOffset_
1867 && m_canonicalSuffixAccents_.length() > 0
1868 && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) {
1869 safeoffset = getPreviousSafeOffset(m_textBeginOffset_,
1871 safelength = textoffset - safeoffset;
1872 safetext = merge(null, targetText, safeoffset, textoffset,
1873 m_canonicalSuffixAccents_);
1876 safetext = m_canonicalSuffixAccents_;
1879 // if status is a failure, ucol_setText does nothing
1880 CollationElementIterator coleiter = m_utilColEIter_;
1881 coleiter.setText(safetext.toString());
1882 // status checked in loop below
1884 int ceindex = m_pattern_.m_CELength_ - 1;
1885 boolean isSafe = true; // indication flag for position in safe zone
// match pattern CEs backwards, from the last CE to the first
1887 while (ceindex >= 0) {
1888 int textce = coleiter.previous();
1889 if (textce == CollationElementIterator.NULLORDER) {
1890 // check if we have passed the safe buffer
1891 if (coleiter == m_colEIter_) {
// safe buffer exhausted: switch to the main iterator at safeoffset
1894 coleiter = m_colEIter_;
1895 if (safetext != m_canonicalSuffixAccents_) {
1896 safetext.delete(0, safetext.length());
1898 coleiter.setExactOffset(safeoffset);
1899 // status checked at the start of the loop
1903 textce = getCE(textce);
1904 if (textce != CollationElementIterator.IGNORABLE
1905 && textce != m_pattern_.m_CE_[ceindex]) {
1906 // do the beginning stuff
1907 int failedoffset = coleiter.getOffset();
1908 if (isSafe && failedoffset >= safelength) {
1909 // alas... no hope. failed at rearranged accent set
// translate safe-buffer offset back into a target-text offset
1914 failedoffset += safeoffset;
1917 // try rearranging the front accents
1918 int result = doNextCanonicalPrefixMatch(failedoffset,
1920 if (result != DONE) {
1921 // if status is a failure, ucol_setOffset does nothing
1922 m_colEIter_.setExactOffset(result);
1927 if (textce == m_pattern_.m_CE_[ceindex]) {
1931 // set offset here
1933 int result = coleiter.getOffset();
1934 // sets the text iterator with the correct expansion and offset
1935 int leftoverces = coleiter.m_CEBufferOffset_;
1936 if (result >= safelength) {
1937 result = textoffset;
1940 result += safeoffset;
1942 m_colEIter_.setExactOffset(result);
1943 m_colEIter_.m_CEBufferOffset_ = leftoverces;
1947 return coleiter.getOffset();
\r
1951 * Tries out the substring and sees if it can be a canonical match.
\r
1952 * This will try normalizing the end accents and arranging them into
\r
1953 * canonical equivalents and check their corresponding ces with the pattern
\r
1955 * Suffix accents in the text will be grouped according to their combining
\r
1956 * class and the groups will be mixed and matched to try to find the perfect
\r
1957 * match with the pattern.
\r
1958 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
1959 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
1961 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
1963 * step 2: check if any of the generated substrings matches the pattern.
\r
1964 * @param textoffset end offset in the collation element text that ends with
\r
1965 * the accents to be rearranged
\r
1966 * @return true if the match is valid, false otherwise
\r
1968 private boolean doNextCanonicalMatch(int textoffset)
\r
1970 int offset = m_colEIter_.getOffset();
\r
1971 targetText.setIndex(textoffset);
\r
1972 if (UTF16.isTrailSurrogate(targetText.previous())
\r
1973 && targetText.getIndex() > m_textBeginOffset_) {
\r
1974 if (!UTF16.isLeadSurrogate(targetText.previous())) {
\r
1975 targetText.next();
\r
1978 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
\r
1979 if (m_pattern_.m_hasPrefixAccents_) {
\r
1980 offset = doNextCanonicalPrefixMatch(offset, textoffset);
\r
1981 if (offset != DONE) {
\r
1982 m_colEIter_.setExactOffset(offset);
\r
1989 if (!m_pattern_.m_hasSuffixAccents_) {
\r
1993 StringBuilder accents = new StringBuilder();
\r
1994 // offset to the last base character in substring to search
\r
1995 int baseoffset = getPreviousBaseOffset(targetText, textoffset);
\r
1996 // normalizing the offensive string
\r
1997 String accentstr = getString(targetText, baseoffset,
\r
1998 textoffset - baseoffset);
\r
1999 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
2000 == Normalizer.NO) {
\r
2001 accentstr = Normalizer.decompose(accentstr, false);
\r
2003 accents.append(accentstr);
\r
2004 // status checked in loop below
\r
2006 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2007 int size = getUnblockedAccentIndex(accents, accentsindex);
\r
2009 // 2 power n - 1 plus the full set of accents
\r
2010 int count = (2 << (size - 1)) - 1;
\r
2011 while (count > 0) {
\r
2012 m_canonicalSuffixAccents_.delete(0,
\r
2013 m_canonicalSuffixAccents_.length());
\r
2014 // copy the base characters
\r
2015 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2016 m_canonicalSuffixAccents_.append(accents.charAt(k));
\r
2018 // forming all possible canonical rearrangement by dropping
\r
2019 // sets of accents
\r
2020 for (int i = 0; i <= size - 1; i ++) {
\r
2021 int mask = 1 << (size - i - 1);
\r
2022 if ((count & mask) != 0) {
\r
2023 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2025 m_canonicalSuffixAccents_.append(accents.charAt(j));
\r
2029 offset = doNextCanonicalSuffixMatch(baseoffset);
\r
2030 if (offset != DONE) {
\r
2031 return true; // match found
\r
2039 * Gets the previous base character offset depending on the string search
\r
2041 * @param strsrch string search data
\r
2042 * @param textoffset current offset, current character
\r
2043 * @return the offset of the next character after this base character or
\r
2044 * itself if it is a composed character with accents
\r
2046 private final int getPreviousBaseOffset(int textoffset)
\r
2048 if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) {
\r
2049 int offset = textoffset;
\r
2050 if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
\r
2051 return getPreviousBaseOffset(targetText, textoffset);
\r
2054 return textoffset;
\r
2058 * Checks match for contraction.
\r
2059 * If the match ends with a partial contraction we fail.
\r
2060 * If the match starts too far off (because of backwards iteration) we try
\r
2061 * to chip off the extra characters.
\r
2062 * Uses the temporary util buffer for return values of the modified start
\r
2064 * @param start offset of potential match, to be modified if necessary
\r
2065 * @param end offset of potential match, to be modified if necessary
\r
2066 * @return true if match passes the contraction test, false otherwise.
\r
2068 private boolean checkNextCanonicalContractionMatch(int start, int end)
\r
2070 // This part checks if either ends of the match contains potential
\r
2071 // contraction. If so we'll have to iterate through them
\r
2074 if (end < m_textLimitOffset_) {
\r
2075 targetText.setIndex(end);
\r
2076 echar = targetText.current();
\r
2078 if (start < m_textLimitOffset_) {
\r
2079 targetText.setIndex(start + 1);
\r
2080 schar = targetText.current();
\r
2082 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2083 int expansion = m_colEIter_.m_CEBufferOffset_;
\r
2084 boolean hasExpansion = expansion > 0;
\r
2085 m_colEIter_.setExactOffset(start);
\r
2087 while (expansion > 0) {
\r
2088 // getting rid of the redundant ce, caused by setOffset.
\r
2089 // since backward contraction/expansion may have extra ces if
\r
2090 // we are in the normalization buffer, hasAccentsBeforeMatch
\r
2091 // would have taken care of it.
\r
2092 // E.g. the character \u01FA will have an expansion of 3, but
\r
2093 // if we are only looking for acute and ring \u030A and \u0301,
\r
2094 // we'll have to skip the first ce in the expansion buffer.
\r
2095 m_colEIter_.next();
\r
2096 if (m_colEIter_.getOffset() != temp) {
\r
2098 temp = m_colEIter_.getOffset();
\r
2104 while (count < m_pattern_.m_CELength_) {
\r
2105 int ce = getCE(m_colEIter_.next());
\r
2106 // status checked below, note that if status is a failure
\r
2107 // ucol_next returns UCOL_NULLORDER
\r
2108 if (ce == CollationElementIterator.IGNORABLE) {
\r
2111 if (hasExpansion && count == 0
\r
2112 && m_colEIter_.getOffset() != temp) {
\r
2114 temp = m_colEIter_.getOffset();
\r
2117 if (count == 0 && ce != m_pattern_.m_CE_[0]) {
\r
2118 // accents may have extra starting ces, this occurs when a
\r
2119 // pure accent pattern is matched without rearrangement
\r
2120 // text \u0325\u0300 and looking for \u0300
\r
2121 int expected = m_pattern_.m_CE_[0];
\r
2122 if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
\r
2123 ce = getCE(m_colEIter_.next());
\r
2124 while (ce != expected
\r
2125 && ce != CollationElementIterator.NULLORDER
\r
2126 && m_colEIter_.getOffset() <= end) {
\r
2127 ce = getCE(m_colEIter_.next());
\r
2131 if (ce != m_pattern_.m_CE_[count]) {
\r
2133 end = getNextBaseOffset(end);
\r
2134 m_utilBuffer_[0] = start;
\r
2135 m_utilBuffer_[1] = end;
\r
2141 m_utilBuffer_[0] = start;
\r
2142 m_utilBuffer_[1] = end;
\r
2147 * Checks and sets the match information if found.
\r
2150 * <li> the potential match does not repeat the previous match
\r
2151 * <li> boundaries are correct
\r
2152 * <li> potential match does not end in the middle of a contraction
\r
2153 * <li> identical matches
\r
2155 * Otherwise the offset will be shifted to the next character.
\r
2156 * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
\r
2157 * more fitting result value.
\r
2158 * Uses the temporary utility buffer for storing the modified textoffset.
\r
2159 * @param textoffset offset in the collation element text.
\r
2160 * @return true if the match is valid, false otherwise
\r
2162 private boolean checkNextCanonicalMatch(int textoffset)
\r
2164 // to ensure that the start and ends are not composite characters
\r
2165 // if we have a canonical accent match
\r
2166 if ((m_pattern_.m_hasSuffixAccents_
\r
2167 && m_canonicalSuffixAccents_.length() != 0) ||
\r
2168 (m_pattern_.m_hasPrefixAccents_
\r
2169 && m_canonicalPrefixAccents_.length() != 0)) {
\r
2170 m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset());
\r
2171 matchLength = textoffset - m_matchedIndex_;
\r
2175 int start = m_colEIter_.getOffset();
\r
2176 if (!checkNextCanonicalContractionMatch(start, textoffset)) {
\r
2177 // return the modified textoffset
\r
2178 m_utilBuffer_[0] = m_utilBuffer_[1];
\r
2181 start = m_utilBuffer_[0];
\r
2182 textoffset = m_utilBuffer_[1];
\r
2183 start = getPreviousBaseOffset(start);
\r
2184 // this totally matches, however we need to check if it is repeating
\r
2185 if (checkRepeatedMatch(start, textoffset)
\r
2186 || !isBreakUnit(start, textoffset)
\r
2187 || !checkIdentical(start, textoffset)) {
\r
2189 textoffset = getNextBaseOffset(targetText, textoffset);
\r
2190 m_utilBuffer_[0] = textoffset;
\r
2194 m_matchedIndex_ = start;
\r
2195 matchLength = textoffset - start;
\r
2200 * Shifting the collation element iterator position forward to prepare for
\r
2201 * a preceding match. If the first character is an unsafe character, we'll
\r
2202 * only shift by 1 to capture contractions, normalization etc.
\r
2203 * @param textoffset start text position to do search
\r
2204 * @param ce the text ce which failed the match.
\r
2205 * @param patternceindex index of the ce within the pattern ce buffer which
\r
2206 * failed the match
\r
2207 * @return final offset
\r
2209 private int reverseShift(int textoffset, int ce, int patternceindex)
\r
2211 if (isOverlapping()) {
\r
2212 if (textoffset != m_textLimitOffset_) {
\r
2216 textoffset -= m_pattern_.m_defaultShiftSize_;
\r
2220 if (ce != CollationElementIterator.NULLORDER) {
\r
2221 int shift = m_pattern_.m_backShift_[hash(ce)];
\r
2223 // this is to adjust for characters in the middle of the substring
\r
2224 // for matching that failed.
\r
2225 int adjust = patternceindex;
\r
2226 if (adjust > 1 && shift > adjust) {
\r
2227 shift -= adjust - 1;
\r
2229 textoffset -= shift;
\r
2232 textoffset -= m_pattern_.m_defaultShiftSize_;
\r
2236 textoffset = getPreviousBaseOffset(textoffset);
\r
2237 return textoffset;
\r
2241 * Checks match for contraction.
\r
2242 * If the match starts with a partial contraction we fail.
\r
2243 * Uses the temporary utility buffer to return the modified start and end.
\r
2244 * @param start offset of potential match, to be modified if necessary
\r
2245 * @param end offset of potential match, to be modified if necessary
\r
2246 * @return true if match passes the contraction test, false otherwise.
\r
2248 private boolean checkPreviousExactContractionMatch(int start, int end)
\r
2250 // This part checks if either ends of the match contains potential
\r
2251 // contraction. If so we'll have to iterate through them
\r
2253 if (end < m_textLimitOffset_) {
\r
2254 targetText.setIndex(end);
\r
2255 echar = targetText.current();
\r
2258 if (start + 1 < m_textLimitOffset_) {
\r
2259 targetText.setIndex(start + 1);
\r
2260 schar = targetText.current();
\r
2262 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2263 // expansion suffix, what's left to iterate
\r
2264 int expansion = m_colEIter_.m_CEBufferSize_
\r
2265 - m_colEIter_.m_CEBufferOffset_;
\r
2266 boolean hasExpansion = expansion > 0;
\r
2267 m_colEIter_.setExactOffset(end);
\r
2269 while (expansion > 0) {
\r
2270 // getting rid of the redundant ce
\r
2271 // since forward contraction/expansion may have extra ces
\r
2272 // if we are in the normalization buffer, hasAccentsBeforeMatch
\r
2273 // would have taken care of it.
\r
2274 // E.g. the character \u01FA will have an expansion of 3, but if
\r
2275 // we are only looking for A ring A\u030A, we'll have to skip the
\r
2276 // last ce in the expansion buffer
\r
2277 m_colEIter_.previous();
\r
2278 if (m_colEIter_.getOffset() != temp) {
\r
2280 temp = m_colEIter_.getOffset();
\r
2285 int count = m_pattern_.m_CELength_;
\r
2286 while (count > 0) {
\r
2287 int ce = getCE(m_colEIter_.previous());
\r
2288 // status checked below, note that if status is a failure
\r
2289 // ucol_previous returns UCOL_NULLORDER
\r
2290 if (ce == CollationElementIterator.IGNORABLE) {
\r
2293 if (hasExpansion && count == 0
\r
2294 && m_colEIter_.getOffset() != temp) {
\r
2296 temp = m_colEIter_.getOffset();
\r
2298 if (ce != m_pattern_.m_CE_[count - 1]) {
\r
2300 start = getPreviousBaseOffset(targetText, start);
\r
2301 m_utilBuffer_[0] = start;
\r
2302 m_utilBuffer_[1] = end;
\r
2308 m_utilBuffer_[0] = start;
\r
2309 m_utilBuffer_[1] = end;
\r
2314 * Checks and sets the match information if found.
\r
2317 * <li> the current match does not repeat the last match
\r
2318 * <li> boundaries are correct
\r
2319 * <li> exact matches have no extra accents
\r
2320 * <li> identical matches
\r
2322 * Otherwise the offset will be shifted to the preceding character.
\r
2323 * Uses the temporary utility buffer to store the modified textoffset.
\r
2324 * @param textoffset offset in the collation element text. the returned value
\r
2325 * will be the truncated start offset of the match or the new start
\r
2327 * @return true if the match is valid, false otherwise
\r
2329 private final boolean checkPreviousExactMatch(int textoffset)
\r
2331 // to ensure that the start and ends are not composite characters
\r
2332 int end = m_colEIter_.getOffset();
\r
2333 if (!checkPreviousExactContractionMatch(textoffset, end)) {
\r
2336 textoffset = m_utilBuffer_[0];
\r
2337 end = m_utilBuffer_[1];
\r
2339 // this totally matches, however we need to check if it is repeating
\r
2341 if (checkRepeatedMatch(textoffset, end)
\r
2342 || !isBreakUnit(textoffset, end)
\r
2343 || hasAccentsBeforeMatch(textoffset, end)
\r
2344 || !checkIdentical(textoffset, end)
\r
2345 || hasAccentsAfterMatch(textoffset, end)) {
\r
2347 textoffset = getPreviousBaseOffset(targetText, textoffset);
\r
2348 m_utilBuffer_[0] = textoffset;
\r
2352 if (m_collator_.getStrength() == Collator.PRIMARY) {
\r
2353 end = checkBreakBoundary(end);
\r
2356 m_matchedIndex_ = textoffset;
\r
2357 matchLength = end - textoffset;
\r
2362 * Rearranges the end accents to try matching.
\r
2363 * Suffix accents in the text will be grouped according to their combining
\r
2364 * class and the groups will be mixed and matched to try to find the perfect
\r
2365 * match with the pattern.
\r
2366 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
2367 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
2369 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
2371 * step 2: check if any of the generated substrings matches the pattern.
\r
2372 * @param start offset of the first base character
\r
2373 * @param end start of the last accent set
\r
2374 * @return DONE if a match is not found, otherwise return the ending
\r
2375 * offset of the match. Note this start includes all following
\r
2378 private int doPreviousCanonicalSuffixMatch(int start, int end)
\r
2380 targetText.setIndex(end);
\r
2381 if (UTF16.isTrailSurrogate(targetText.previous())
\r
2382 && targetText.getIndex() > m_textBeginOffset_) {
\r
2383 if (!UTF16.isLeadSurrogate(targetText.previous())) {
\r
2384 targetText.next();
\r
2387 if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
\r
2388 // die... failed at a base character
\r
2391 end = getNextBaseOffset(targetText, end);
\r
2393 StringBuilder accents = new StringBuilder();
\r
2394 int offset = getPreviousBaseOffset(targetText, end);
\r
2395 // normalizing the offensive string
\r
2396 String accentstr = getString(targetText, offset, end - offset);
\r
2397 if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0)
\r
2398 == Normalizer.NO) {
\r
2399 accentstr = Normalizer.decompose(accentstr, false);
\r
2401 accents.append(accentstr);
\r
2403 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2404 int accentsize = getUnblockedAccentIndex(accents, accentsindex);
\r
2405 int count = (2 << (accentsize - 1)) - 1;
\r
2406 while (count > 0) {
\r
2407 m_canonicalSuffixAccents_.delete(0,
\r
2408 m_canonicalSuffixAccents_.length());
\r
2409 // copy the base characters
\r
2410 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2411 m_canonicalSuffixAccents_.append(accents.charAt(k));
\r
2413 // forming all possible canonical rearrangement by dropping
\r
2414 // sets of accents
\r
2415 for (int i = 0; i <= accentsize - 1; i ++) {
\r
2416 int mask = 1 << (accentsize - i - 1);
\r
2417 if ((count & mask) != 0) {
\r
2418 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2420 m_canonicalSuffixAccents_.append(accents.charAt(j));
\r
2424 StringBuilder match = merge(m_canonicalPrefixAccents_, targetText,
\r
2426 m_canonicalSuffixAccents_);
\r
2427 // run the collator iterator through this match
\r
2428 // if status is a failure ucol_setText does nothing
\r
2429 m_utilColEIter_.setText(match.toString());
\r
2430 if (checkCollationMatch(m_utilColEIter_)) {
\r
2439 * Takes the rearranged start accents and tries matching. If the match failed at
\r
2440 * a separate following set of accents (separated from the rearranged ones by
\r
2441 * at least a base character) then we rearrange the preceding accents and
\r
2442 * try matching again.
\r
2443 * We allow skipping of the ends of the accent set if the ces do not match.
\r
2444 * However if the failure is found before the accent set, it fails.
\r
2445 * Internal method, status assumed to be success, caller has to check
\r
2446 * status before calling this method.
\r
2447 * @param textoffset of the ends of the rearranged accent
\r
2448 * @return DONE if a match is not found, otherwise return the ending offset
\r
2449 * of the match. Note this start includes all following accents.
\r
2451 private int doPreviousCanonicalPrefixMatch(int textoffset)
\r
2453 // int safelength = 0;
\r
2454 StringBuilder safetext;
\r
2455 int safeoffset = textoffset;
\r
2457 if (textoffset > m_textBeginOffset_
\r
2458 && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt(
\r
2459 m_canonicalPrefixAccents_.length() - 1))) {
\r
2460 safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_);
\r
2461 //safelength = safeoffset - textoffset;
\r
2462 safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset,
\r
2463 safeoffset, null);
\r
2466 safetext = m_canonicalPrefixAccents_;
\r
2469 // if status is a failure, ucol_setText does nothing
\r
2470 CollationElementIterator coleiter = m_utilColEIter_;
\r
2471 coleiter.setText(safetext.toString());
\r
2472 // status checked in loop below
\r
2475 boolean isSafe = true; // safe zone indication flag for position
\r
2476 int prefixlength = m_canonicalPrefixAccents_.length();
\r
2478 while (ceindex < m_pattern_.m_CELength_) {
\r
2479 int textce = coleiter.next();
\r
2480 if (textce == CollationElementIterator.NULLORDER) {
\r
2481 // check if we have passed the safe buffer
\r
2482 if (coleiter == m_colEIter_) {
\r
2485 if (safetext != m_canonicalPrefixAccents_) {
\r
2486 safetext.delete(0, safetext.length());
\r
2488 coleiter = m_colEIter_;
\r
2489 coleiter.setExactOffset(safeoffset);
\r
2490 // status checked at the start of the loop
\r
2494 textce = getCE(textce);
\r
2495 if (textce != CollationElementIterator.IGNORABLE
\r
2496 && textce != m_pattern_.m_CE_[ceindex]) {
\r
2497 // do the beginning stuff
\r
2498 int failedoffset = coleiter.getOffset();
\r
2499 if (isSafe && failedoffset <= prefixlength) {
\r
2500 // alas... no hope. failed at rearranged accent set
\r
2505 failedoffset = safeoffset - failedoffset;
\r
2506 if (safetext != m_canonicalPrefixAccents_) {
\r
2507 safetext.delete(0, safetext.length());
\r
2511 // try rearranging the end accents
\r
2512 int result = doPreviousCanonicalSuffixMatch(textoffset,
\r
2514 if (result != DONE) {
\r
2515 // if status is a failure, ucol_setOffset does nothing
\r
2516 m_colEIter_.setExactOffset(result);
\r
2521 if (textce == m_pattern_.m_CE_[ceindex]) {
\r
2525 // set offset here
\r
2527 int result = coleiter.getOffset();
\r
2528 // sets the text iterator here with the correct expansion and offset
\r
2529 int leftoverces = coleiter.m_CEBufferSize_
\r
2530 - coleiter.m_CEBufferOffset_;
\r
2531 if (result <= prefixlength) {
\r
2532 result = textoffset;
\r
2535 result = textoffset + (safeoffset - result);
\r
2537 m_colEIter_.setExactOffset(result);
\r
2538 m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_
\r
2543 return coleiter.getOffset();
\r
2547 * Tries out the substring and sees if it can be a canonical match.
\r
2548 * This will try normalizing the starting accents and arranging them into
\r
2549 * canonical equivalents and check their corresponding ces with the pattern
\r
2551 * Prefix accents in the text will be grouped according to their combining
\r
2552 * class and the groups will be mixed and matched to try to find the perfect
\r
2553 * match with the pattern.
\r
2554 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
\r
2555 * step 1: split "\u030A\u0301" into 6 other type of potential accent
\r
2557 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
\r
2559 * step 2: check if any of the generated substrings matches the pattern.
\r
2560 * @param textoffset start offset in the collation element text that starts
\r
2561 * with the accents to be rearranged
\r
2562 * @return true if the match is valid, false otherwise
\r
2564 private boolean doPreviousCanonicalMatch(int textoffset)
\r
2566 int offset = m_colEIter_.getOffset();
\r
2567 if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
\r
2568 if (m_pattern_.m_hasSuffixAccents_) {
\r
2569 offset = doPreviousCanonicalSuffixMatch(textoffset, offset);
\r
2570 if (offset != DONE) {
\r
2571 m_colEIter_.setExactOffset(offset);
\r
2578 if (!m_pattern_.m_hasPrefixAccents_) {
\r
2582 StringBuilder accents = new StringBuilder();
\r
2583 // offset to the last base character in substring to search
\r
2584 int baseoffset = getNextBaseOffset(targetText, textoffset);
\r
2585 // normalizing the offensive string
\r
2586 String textstr = getString(targetText, textoffset,
\r
2587 baseoffset - textoffset);
\r
2588 if (Normalizer.quickCheck(textstr, Normalizer.NFD,0)
\r
2589 == Normalizer.NO) {
\r
2590 textstr = Normalizer.decompose(textstr, false);
\r
2592 accents.append(textstr);
\r
2593 // status checked in loop
\r
2595 int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
\r
2596 int size = getUnblockedAccentIndex(accents, accentsindex);
\r
2598 // 2 power n - 1 plus the full set of accents
\r
2599 int count = (2 << (size - 1)) - 1;
\r
2600 while (count > 0) {
\r
2601 m_canonicalPrefixAccents_.delete(0,
\r
2602 m_canonicalPrefixAccents_.length());
\r
2603 // copy the base characters
\r
2604 for (int k = 0; k < accentsindex[0]; k ++) {
\r
2605 m_canonicalPrefixAccents_.append(accents.charAt(k));
\r
2607 // forming all possible canonical rearrangement by dropping
\r
2608 // sets of accents
\r
2609 for (int i = 0; i <= size - 1; i ++) {
\r
2610 int mask = 1 << (size - i - 1);
\r
2611 if ((count & mask) != 0) {
\r
2612 for (int j = accentsindex[i]; j < accentsindex[i + 1];
\r
2614 m_canonicalPrefixAccents_.append(accents.charAt(j));
\r
2618 offset = doPreviousCanonicalPrefixMatch(baseoffset);
\r
2619 if (offset != DONE) {
\r
2620 return true; // match found
\r
2628 * Checks match for contraction.
\r
2629 * If the match starts with a partial contraction we fail.
\r
2630 * Uses the temporary utility buffer to return the modified start and end.
\r
2631 * @param start offset of potential match, to be modified if necessary
\r
2632 * @param end offset of potential match, to be modified if necessary
\r
2633 * @return true if match passes the contraction test, false otherwise.
\r
2635 private boolean checkPreviousCanonicalContractionMatch(int start, int end)
\r
2638 // This part checks if either ends of the match contains potential
\r
2639 // contraction. If so we'll have to iterate through them
\r
2642 if (end < m_textLimitOffset_) {
\r
2643 targetText.setIndex(end);
\r
2644 echar = targetText.current();
\r
2646 if (start + 1 < m_textLimitOffset_) {
\r
2647 targetText.setIndex(start + 1);
\r
2648 schar = targetText.current();
\r
2650 if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
\r
2651 int expansion = m_colEIter_.m_CEBufferSize_
\r
2652 - m_colEIter_.m_CEBufferOffset_;
\r
2653 boolean hasExpansion = expansion > 0;
\r
2654 m_colEIter_.setExactOffset(end);
\r
2655 while (expansion > 0) {
\r
2656 // getting rid of the redundant ce
\r
2657 // since forward contraction/expansion may have extra ces
\r
2658 // if we are in the normalization buffer, hasAccentsBeforeMatch
\r
2659 // would have taken care of it.
\r
2660 // E.g. the character \u01FA will have an expansion of 3, but
\r
2661 // if we are only looking for A ring A\u030A, we'll have to
\r
2662 // skip the last ce in the expansion buffer
\r
2663 m_colEIter_.previous();
\r
2664 if (m_colEIter_.getOffset() != temp) {
\r
2666 temp = m_colEIter_.getOffset();
\r
2671 int count = m_pattern_.m_CELength_;
\r
2672 while (count > 0) {
\r
2673 int ce = getCE(m_colEIter_.previous());
\r
2674 // status checked below, note that if status is a failure
\r
2675 // previous() returns NULLORDER
\r
2676 if (ce == CollationElementIterator.IGNORABLE) {
\r
2679 if (hasExpansion && count == 0
\r
2680 && m_colEIter_.getOffset() != temp) {
\r
2682 temp = m_colEIter_.getOffset();
\r
2684 if (count == m_pattern_.m_CELength_
\r
2685 && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) {
\r
2686 // accents may have extra starting ces, this occurs when a
\r
2687 // pure accent pattern is matched without rearrangement
\r
2688 int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1];
\r
2689 targetText.setIndex(end);
\r
2690 if (UTF16.isTrailSurrogate(targetText.previous())) {
\r
2691 if (targetText.getIndex() > m_textBeginOffset_ &&
\r
2692 !UTF16.isLeadSurrogate(targetText.previous())) {
\r
2693 targetText.next();
\r
2696 end = targetText.getIndex();
\r
2697 if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) {
\r
2698 ce = getCE(m_colEIter_.previous());
\r
2699 while (ce != expected
\r
2700 && ce != CollationElementIterator.NULLORDER
\r
2701 && m_colEIter_.getOffset() <= start) {
\r
2702 ce = getCE(m_colEIter_.previous());
\r
2706 if (ce != m_pattern_.m_CE_[count - 1]) {
\r
2708 start = getPreviousBaseOffset(start);
\r
2709 m_utilBuffer_[0] = start;
\r
2710 m_utilBuffer_[1] = end;
\r
2716 m_utilBuffer_[0] = start;
\r
2717 m_utilBuffer_[1] = end;
\r
2722 * Checks and sets the match information if found.
\r
2725 * <li> the potential match does not repeat the previous match
\r
2726 * <li> boundaries are correct
\r
2727 * <li> potential match does not end in the middle of a contraction
\r
2728 * <li> identical matches
\r
2730 * Otherwise the offset will be shifted to the next character.
\r
2731 * Uses the temporary utility buffer for storing the modified textoffset.
\r
2732 * @param textoffset offset in the collation element text. the returned
\r
2733 * value will be the truncated start offset of the match or the
\r
2734 * new start search offset.
\r
2735 * @return true if the match is valid, false otherwise
\r
2737 private boolean checkPreviousCanonicalMatch(int textoffset)
\r
2739 // to ensure that the start and ends are not composite characters
\r
2740 // if we have a canonical accent match
\r
2741 if (m_pattern_.m_hasSuffixAccents_
\r
2742 && m_canonicalSuffixAccents_.length() != 0
\r
2743 || m_pattern_.m_hasPrefixAccents_
\r
2744 && m_canonicalPrefixAccents_.length() != 0) {
\r
2745 m_matchedIndex_ = textoffset;
\r
2746 matchLength = getNextBaseOffset(m_colEIter_.getOffset())
\r
2751 int end = m_colEIter_.getOffset();
\r
2752 if (!checkPreviousCanonicalContractionMatch(textoffset, end)) {
\r
2753 // storing the modified textoffset
\r
2756 textoffset = m_utilBuffer_[0];
\r
2757 end = m_utilBuffer_[1];
\r
2758 end = getNextBaseOffset(end);
\r
2759 // this totally matches, however we need to check if it is repeating
\r
2760 if (checkRepeatedMatch(textoffset, end)
\r
2761 || !isBreakUnit(textoffset, end)
\r
2762 || !checkIdentical(textoffset, end)) {
\r
2764 textoffset = getPreviousBaseOffset(textoffset);
\r
2765 m_utilBuffer_[0] = textoffset;
\r
2769 m_matchedIndex_ = textoffset;
\r
2770 matchLength = end - textoffset;
\r
2775 * Method that does the next exact match
\r
2776 * @param start the offset to start shifting from and performing the
\r
2777 * next exact match
\r
2779 private void handleNextExact(int start)
\r
2781 int textoffset = shiftForward(start,
\r
2782 CollationElementIterator.NULLORDER,
\r
2783 m_pattern_.m_CELength_);
\r
2784 int targetce = CollationElementIterator.IGNORABLE;
\r
2785 while (textoffset <= m_textLimitOffset_) {
\r
2786 m_colEIter_.setExactOffset(textoffset);
\r
2787 int patternceindex = m_pattern_.m_CELength_ - 1;
\r
2788 boolean found = false;
\r
2789 int lastce = CollationElementIterator.NULLORDER;
\r
2792 // finding the last pattern ce match, imagine composite
\r
2793 // characters. for example: search for pattern A in text \u00C0
\r
2794 // we'll have to skip \u0300 the grave first before we get to A
\r
2795 targetce = m_colEIter_.previous();
\r
2796 if (targetce == CollationElementIterator.NULLORDER) {
\r
2800 targetce = getCE(targetce);
\r
2801 if (targetce == CollationElementIterator.IGNORABLE &&
\r
2802 m_colEIter_.isInBuffer()) {
\r
2803 // this is for the text \u0315\u0300 that requires
\r
2804 // normalization and pattern \u0300, where \u0315 is ignorable
\r
2807 if (lastce == CollationElementIterator.NULLORDER
\r
2808 || lastce == CollationElementIterator.IGNORABLE) {
\r
2809 lastce = targetce;
\r
2811 if (targetce == m_pattern_.m_CE_[patternceindex]) {
\r
2812 // the first ce can be a contraction
\r
2816 if (m_colEIter_.m_CEBufferOffset_ <= 0) {
\r
2822 while (found && patternceindex > 0) {
\r
2823 lastce = targetce;
\r
2824 targetce = m_colEIter_.previous();
\r
2825 if (targetce == CollationElementIterator.NULLORDER) {
\r
2829 targetce = getCE(targetce);
\r
2830 if (targetce == CollationElementIterator.IGNORABLE) {
\r
2834 patternceindex --;
\r
2835 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
2838 targetce = lastce;
\r
2841 textoffset = shiftForward(textoffset, lastce, patternceindex);
\r
2842 // status checked at loop.
\r
2843 patternceindex = m_pattern_.m_CELength_;
\r
2847 if (checkNextExactMatch(textoffset)) {
\r
2848 // status checked in ucol_setOffset
\r
2851 textoffset = m_utilBuffer_[0];
\r
2853 setMatchNotFound();
\r
2857 * Method that does the next canonical match
\r
2858 * @param start the offset to start shifting from and performing the
\r
2859 * next canonical match
\r
2861 private void handleNextCanonical(int start)
\r
2863 boolean hasPatternAccents =
\r
2864 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
\r
2866 // shifting it check for setting offset
\r
2867 // if setOffset is called previously or there was no previous match, we
\r
2868 // leave the offset as it is.
\r
2869 int textoffset = shiftForward(start, CollationElementIterator.NULLORDER,
\r
2870 m_pattern_.m_CELength_);
\r
2871 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
\r
2872 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
\r
2873 int targetce = CollationElementIterator.IGNORABLE;
\r
2875 while (textoffset <= m_textLimitOffset_)
\r
2877 m_colEIter_.setExactOffset(textoffset);
\r
2878 int patternceindex = m_pattern_.m_CELength_ - 1;
\r
2879 boolean found = false;
\r
2880 int lastce = CollationElementIterator.NULLORDER;
\r
2883 // finding the last pattern ce match, imagine composite characters
\r
2884 // for example: search for pattern A in text \u00C0
\r
2885 // we'll have to skip \u0300 the grave first before we get to A
\r
2886 targetce = m_colEIter_.previous();
\r
2887 if (targetce == CollationElementIterator.NULLORDER) {
\r
2891 targetce = getCE(targetce);
\r
2892 if (lastce == CollationElementIterator.NULLORDER
\r
2893 || lastce == CollationElementIterator.IGNORABLE) {
\r
2894 lastce = targetce;
\r
2896 if (targetce == m_pattern_.m_CE_[patternceindex]) {
\r
2897 // the first ce can be a contraction
\r
2901 if (m_colEIter_.m_CEBufferOffset_ <= 0) {
\r
2907 while (found && patternceindex > 0) {
\r
2908 targetce = m_colEIter_.previous();
\r
2909 if (targetce == CollationElementIterator.NULLORDER) {
\r
2913 targetce = getCE(targetce);
\r
2914 if (targetce == CollationElementIterator.IGNORABLE) {
\r
2918 patternceindex --;
\r
2919 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
2922 // initializing the rearranged accent array
\r
2923 if (hasPatternAccents && !found) {
\r
2924 found = doNextCanonicalMatch(textoffset);
\r
2928 textoffset = shiftForward(textoffset, lastce, patternceindex);
\r
2929 // status checked at loop
\r
2930 patternceindex = m_pattern_.m_CELength_;
\r
2934 if (checkNextCanonicalMatch(textoffset)) {
\r
2937 textoffset = m_utilBuffer_[0];
\r
2939 setMatchNotFound();
\r
2943 * Method that does the previous exact match
\r
2944 * @param start the offset to start shifting from and performing the
\r
2945 * previous exact match
\r
2947 private void handlePreviousExact(int start)
\r
2949 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,
\r
2950 m_pattern_.m_CELength_);
\r
2951 while (textoffset >= m_textBeginOffset_)
\r
2953 m_colEIter_.setExactOffset(textoffset);
\r
2954 int patternceindex = 1;
\r
2955 int targetce = CollationElementIterator.IGNORABLE;
\r
2956 boolean found = false;
\r
2957 int firstce = CollationElementIterator.NULLORDER;
\r
2960 // finding the first pattern ce match, imagine composite
\r
2961 // characters. for example: search for pattern \u0300 in text
\r
2962 // \u00C0, we'll have to skip A first before we get to
\r
2963 // \u0300 the grave accent
\r
2964 targetce = m_colEIter_.next();
\r
2965 if (targetce == CollationElementIterator.NULLORDER) {
\r
2969 targetce = getCE(targetce);
\r
2970 if (firstce == CollationElementIterator.NULLORDER
\r
2971 || firstce == CollationElementIterator.IGNORABLE) {
\r
2972 firstce = targetce;
\r
2974 if (targetce == CollationElementIterator.IGNORABLE && m_collator_.getStrength() != Collator.PRIMARY) {
\r
2977 if (targetce == m_pattern_.m_CE_[0]) {
\r
2981 if (m_colEIter_.m_CEBufferOffset_ == -1
\r
2982 || m_colEIter_.m_CEBufferOffset_
\r
2983 == m_colEIter_.m_CEBufferSize_) {
\r
2984 // checking for accents in composite character
\r
2990 //targetce = firstce;
\r
2992 while (found && patternceindex < m_pattern_.m_CELength_) {
\r
2993 firstce = targetce;
\r
2994 targetce = m_colEIter_.next();
\r
2995 if (targetce == CollationElementIterator.NULLORDER) {
\r
2999 targetce = getCE(targetce);
\r
3000 if (targetce == CollationElementIterator.IGNORABLE) {
\r
3004 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
3005 patternceindex ++;
\r
3008 targetce = firstce;
\r
3011 textoffset = reverseShift(textoffset, targetce, patternceindex);
\r
3012 patternceindex = 0;
\r
3016 if (checkPreviousExactMatch(textoffset)) {
\r
3019 textoffset = m_utilBuffer_[0];
\r
3021 setMatchNotFound();
\r
3025 * Method that does the previous canonical match
\r
3026 * @param start the offset to start shifting from and performing the
\r
3027 * previous canonical match
\r
3029 private void handlePreviousCanonical(int start)
\r
3031 boolean hasPatternAccents =
\r
3032 m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
\r
3034 // shifting it check for setting offset
\r
3035 // if setOffset is called previously or there was no previous match, we
\r
3036 // leave the offset as it is.
\r
3037 int textoffset = reverseShift(start, CollationElementIterator.NULLORDER,
\r
3038 m_pattern_.m_CELength_);
\r
3039 m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
\r
3040 m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
\r
3042 while (textoffset >= m_textBeginOffset_)
\r
3044 m_colEIter_.setExactOffset(textoffset);
\r
3045 int patternceindex = 1;
\r
3046 int targetce = CollationElementIterator.IGNORABLE;
\r
3047 boolean found = false;
\r
3048 int firstce = CollationElementIterator.NULLORDER;
\r
3051 // finding the first pattern ce match, imagine composite
\r
3052 // characters. for example: search for pattern \u0300 in text
\r
3053 // \u00C0, we'll have to skip A first before we get to
\r
3054 // \u0300 the grave accent
\r
3055 targetce = m_colEIter_.next();
\r
3056 if (targetce == CollationElementIterator.NULLORDER) {
\r
3060 targetce = getCE(targetce);
\r
3061 if (firstce == CollationElementIterator.NULLORDER
\r
3062 || firstce == CollationElementIterator.IGNORABLE) {
\r
3063 firstce = targetce;
\r
3066 if (targetce == m_pattern_.m_CE_[0]) {
\r
3067 // the first ce can be a contraction
\r
3071 if (m_colEIter_.m_CEBufferOffset_ == -1
\r
3072 || m_colEIter_.m_CEBufferOffset_
\r
3073 == m_colEIter_.m_CEBufferSize_) {
\r
3074 // checking for accents in composite character
\r
3080 targetce = firstce;
\r
3082 while (found && patternceindex < m_pattern_.m_CELength_) {
\r
3083 targetce = m_colEIter_.next();
\r
3084 if (targetce == CollationElementIterator.NULLORDER) {
\r
3088 targetce = getCE(targetce);
\r
3089 if (targetce == CollationElementIterator.IGNORABLE) {
\r
3093 found = found && targetce == m_pattern_.m_CE_[patternceindex];
\r
3094 patternceindex ++;
\r
3097 // initializing the rearranged accent array
\r
3098 if (hasPatternAccents && !found) {
\r
3099 found = doPreviousCanonicalMatch(textoffset);
\r
3103 textoffset = reverseShift(textoffset, targetce, patternceindex);
\r
3104 patternceindex = 0;
\r
3108 if (checkPreviousCanonicalMatch(textoffset)) {
\r
3111 textoffset = m_utilBuffer_[0];
\r
3113 setMatchNotFound();
\r
3117 * Gets a substring out of a CharacterIterator
\r
3118 * @param text CharacterIterator
\r
3119 * @param start start offset
\r
3120 * @param length of substring
\r
3121 * @return substring from text starting at start and length length
\r
3123 private static final String getString(CharacterIterator text, int start,
\r
3126 StringBuilder result = new StringBuilder(length);
\r
3127 int offset = text.getIndex();
\r
3128 text.setIndex(start);
\r
3129 for (int i = 0; i < length; i ++) {
\r
3130 result.append(text.current());
\r
3133 text.setIndex(offset);
\r
3134 return result.toString();
\r
3138 * Getting the mask for collation strength
\r
3139 * @param strength collation strength
\r
3140 * @return collation element mask
\r
3142 private static final int getMask(int strength)
\r
3144 switch (strength)
\r
3146 case Collator.PRIMARY:
\r
3147 return RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3148 case Collator.SECONDARY:
\r
3149 return RuleBasedCollator.CE_SECONDARY_MASK_
\r
3150 | RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3152 return RuleBasedCollator.CE_TERTIARY_MASK_
\r
3153 | RuleBasedCollator.CE_SECONDARY_MASK_
\r
3154 | RuleBasedCollator.CE_PRIMARY_MASK_;
\r
3159 * Sets match not found
\r
3161 private void setMatchNotFound()
\r
3163 // this method resets the match result regardless of the error status.
\r
3164 m_matchedIndex_ = DONE;
\r
3165 setMatchLength(0);
\r
3169 * Check the boundaries of the match.
\r
3171 private int checkBreakBoundary(int end) {
\r
3172 if (!m_charBreakIter_.isBoundary(end)) {
\r
3173 end = m_charBreakIter_.following(end);
\r