jars/icu4j-52_1/main/classes/collate/src/com/ibm/icu/text/CollationElementIterator.java

   1 /**
   2 *******************************************************************************
   3 * Copyright (C) 1996-2013, International Business Machines Corporation and    *
   4 * others. All Rights Reserved.                                                *
   5 *******************************************************************************
   6 *
   7 *
   8 *******************************************************************************
   9 */
  10 package com.ibm.icu.text;
  11
  12 /***
  13  * import java.text.StringCharacterIterator;
  14  * import java.text.CharacterIterator;
  15  */
  16 import java.text.CharacterIterator;
  17 import java.util.MissingResourceException;
  18
  19 import com.ibm.icu.impl.CharacterIteratorWrapper;
  20 import com.ibm.icu.impl.ICUDebug;
  21 import com.ibm.icu.impl.Norm2AllModes;
  22 import com.ibm.icu.impl.Normalizer2Impl;
  23 import com.ibm.icu.impl.StringUCharacterIterator;
  24 import com.ibm.icu.impl.UCharacterProperty;
  25 import com.ibm.icu.lang.UCharacter;
  26
  27 /**
  28  * <p><code>CollationElementIterator</code> is an iterator created by
  29  * a RuleBasedCollator to walk through a string. The return result of
  30  * each iteration is a 32-bit collation element that defines the
  31  * ordering priority of the next character or sequence of characters
  32  * in the source string.</p>
  33  *
  34  * <p>For illustration, consider the following in Spanish:
  35  * <blockquote>
  36  * <pre>
  37  * "ca" -> the first collation element is collation_element('c') and second
  38  *         collation element is collation_element('a').
  39  *
  40  * Since "ch" in Spanish sorts as one entity, the below example returns one
  41  * collation element for the two characters 'c' and 'h'
  42  *
  43  * "cha" -> the first collation element is collation_element('ch') and second
  44  *          collation element is collation_element('a').
  45  * </pre>
  46  * </blockquote>
  47  * And in German,
  48  * <blockquote>
  49  * <pre>
  50  * Since the character '&#230;' is a composed character of 'a' and 'e', the
  51  * iterator returns two collation elements for the single character '&#230;'
  52  *
  53  * "&#230;b" -> the first collation element is collation_element('a'), the
  54  *              second collation element is collation_element('e'), and the
  55  *              third collation element is collation_element('b').
  56  * </pre>
  57  * </blockquote>
  58  * </p>
  59  *
  60  * <p>For collation ordering comparison, the collation element results
  61  * can not be compared simply by using basic arithmetric operators,
  62  * e.g. &lt;, == or &gt;, further processing has to be done. Details
  63  * can be found in the ICU
  64  * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
  65  * user guide</a>. An example of using the CollationElementIterator
  66  * for collation ordering comparison is the class
  67  * <a href=StringSearch.html> com.ibm.icu.text.StringSearch</a>.</p>
  68  *
  69  * <p>To construct a CollationElementIterator object, users
  70  * call the method getCollationElementIterator() on a
  71  * RuleBasedCollator that defines the desired sorting order.</p>
  72  *
  73  * <p> Example:
  74  * <blockquote>
  75  * <pre>
  76  *  String testString = "This is a test";
  77  *  RuleBasedCollator rbc = new RuleBasedCollator("&amp;a&lt;b");
  78  *  CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
  79  *  int primaryOrder = iterator.IGNORABLE;
  80  *  while (primaryOrder != iterator.NULLORDER) {
  81  *      int order = iterator.next();
  82  *      if (order != iterator.IGNORABLE &&
  83  *          order != iterator.NULLORDER) {
  84  *          // order is valid, not ignorable and we have not passed the end
  85  *          // of the iteration, we do something
  86  *          primaryOrder = CollationElementIterator.primaryOrder(order);
  87  *          System.out.println("Next primary order 0x" +
  88  *                             Integer.toHexString(primaryOrder));
  89  *      }
  90  *  }
  91  * </pre>
  92  * </blockquote>
  93  * </p>
  94  * <p>
  95  * The method next() returns the collation order of the next character based on
  96  * the comparison level of the collator. The method previous() returns the
  97  * collation order of the previous character based on the comparison level of
  98  * the collator. The Collation Element Iterator moves only in one direction
  99  * between calls to reset(), setOffset(), or setText(). That is, next() and
 100  * previous() can not be inter-used. Whenever previous() is to be called after
 101  * next() or vice versa, reset(), setOffset() or setText() has to be called first
 102  * to reset the status, shifting current position to either the end or the start of
 103  * the string (reset() or setText()), or the specified position (setOffset()).
 104  * Hence at the next call of next() or previous(), the first or last collation order,
 105  * or collation order at the specified position will be returned. If a change of
 106  * direction is done without one of these calls, the result is undefined.
 107  * </p>
 108  * <p>
 109  * This class is not subclassable.
 110  * </p>
 111  * @see Collator
 112  * @see RuleBasedCollator
 113  * @see StringSearch
 114  * @author Syn Wee Quek
 115  * @stable ICU 2.8
 116  */
 117 public final class CollationElementIterator
 118 {
 119
 120
 121     // public data members --------------------------------------------------
 122
 123     /**
 124      * <p>This constant is returned by the iterator in the methods
 125      * next() and previous() when the end or the beginning of the
 126      * source string has been reached, and there are no more valid
 127      * collation elements to return.</p>
 128      *
 129      * <p>See class documentation for an example of use.</p>
 130      * @stable ICU 2.8
 131      * @see #next
 132      * @see #previous */
 133     public final static int NULLORDER = 0xffffffff;
 134
 135     /**
 136      * <p>This constant is returned by the iterator in the methods
 137      * next() and previous() when a collation element result is to be
 138      * ignored.</p>
 139      *
 140      * <p>See class documentation for an example of use.</p>
 141      * @stable ICU 2.8
 142      * @see #next
 143      * @see #previous */
 144     public static final int IGNORABLE = 0;
 145
 146     // public methods -------------------------------------------------------
 147
 148     // public getters -------------------------------------------------------
 149
 150     /**
 151      * <p>Returns the character offset in the source string
 152      * corresponding to the next collation element. I.e., getOffset()
 153      * returns the position in the source string corresponding to the
 154      * collation element that will be returned by the next call to
 155      * next() or previous(). This value could be any of:
 156      * <ul>
 157      * <li> The index of the <b>first</b> character corresponding to
 158      * the next collation element. (This means that if
 159      * <code>setOffset(offset)</code> sets the index in the middle of
 160      * a contraction, <code>getOffset()</code> returns the index of
 161      * the first character in the contraction, which may not be equal
 162      * to the original offset that was set. Hence calling getOffset()
 163      * immediately after setOffset(offset) does not guarantee that the
 164      * original offset set will be returned.)
 165      * <li> If normalization is on, the index of the <b>immediate</b>
 166      * subsequent character, or composite character with the first
 167      * character, having a combining class of 0.
 168      * <li> The length of the source string, if iteration has reached
 169      * the end.
 170      *</ul>
 171      * </p>
 172      * @return The character offset in the source string corresponding to the
 173      *         collation element that will be returned by the next call to
 174      *         next() or previous().
 175      * @stable ICU 2.8
 176      */
 177     public int getOffset()
 178     {
 179         if (m_bufferOffset_ != -1) {
 180             if (m_isForwards_) {
 181                 return m_FCDLimit_;
 182             }
 183             return m_FCDStart_;
 184         }
 185         return m_source_.getIndex();
 186     }
 187
 188
 189     /**
 190      * <p> Returns the maximum length of any expansion sequence that ends with
 191      * the specified collation element. If there is no expansion with this
 192      * collation element as the last element, returns 1.
 193      * </p>
 194      * @param ce a collation element returned by previous() or next().
 195      * @return the maximum length of any expansion sequence ending
 196      *         with the specified collation element.
 197      * @stable ICU 2.8
 198      */
 199     public int getMaxExpansion(int ce)
 200     {
 201         int start = 0;
 202         int limit = m_collator_.m_expansionEndCE_.length;
 203         long unsignedce = ce & 0xFFFFFFFFl;
 204         while (start < limit - 1) {
 205             int mid = start + ((limit - start) >> 1);
 206             long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
 207             if (unsignedce <= midce) {
 208                 limit = mid;
 209             }
 210             else {
 211                 start = mid;
 212             }
 213         }
 214         int result = 1;
 215         if (m_collator_.m_expansionEndCE_[start] == ce) {
 216             result = m_collator_.m_expansionEndCEMaxSize_[start];
 217         }
 218         else if (limit < m_collator_.m_expansionEndCE_.length &&
 219                  m_collator_.m_expansionEndCE_[limit] == ce) {
 220             result = m_collator_.m_expansionEndCEMaxSize_[limit];
 221         }
 222         else if ((ce & 0xFFFF) == 0x00C0) {
 223             result = 2;
 224         }
 225         return result;
 226     }
 227
 228     // public other methods -------------------------------------------------
 229
 230     /**
 231      * <p> Resets the cursor to the beginning of the string. The next
 232      * call to next() or previous() will return the first and last
 233      * collation element in the string, respectively.</p>
 234      *
 235      * <p>If the RuleBasedCollator used by this iterator has had its
 236      * attributes changed, calling reset() will reinitialize the
 237      * iterator to use the new attributes.</p>
 238      *
 239      * @stable ICU 2.8
 240      */
 241     public void reset()
 242     {
 243         m_source_.setToStart();
 244         updateInternalState();
 245         m_direction = 0;    // initial state
 246     }
 247
 248     /**
 249      * <p>Get the next collation element in the source string.</p>
 250      *
 251      * <p>This iterator iterates over a sequence of collation elements
 252      * that were built from the string. Because there isn't
 253      * necessarily a one-to-one mapping from characters to collation
 254      * elements, this doesn't mean the same thing as "return the
 255      * collation element [or ordering priority] of the next character
 256      * in the string".</p>
 257      *
 258      * <p>This function returns the collation element that the
 259      * iterator is currently pointing to, and then updates the
 260      * internal pointer to point to the next element.</p>
 261      *
 262      * @return the next collation element or NULLORDER if the end of the
 263      *         iteration has been reached.
 264      * @stable ICU 2.8
 265      */
 266     public int next()
 267     {
 268         assert m_direction >= 0;
 269         m_direction = 1;
 270
 271         m_isForwards_ = true;
 272         if (m_CEBufferSize_ > 0) {
 273             if (m_CEBufferOffset_ < m_CEBufferSize_) {
 274                 // if there are expansions left in the buffer, we return it
 275                 return m_CEBuffer_[m_CEBufferOffset_ ++];
 276             }
 277             m_CEBufferSize_ = 0;
 278             m_CEBufferOffset_ = 0;
 279         }
 280
 281         int result = NULLORDER;
 282         char ch = 0;
 283         do {
 284             int ch_int = nextChar();
 285             if (ch_int == UCharacterIterator.DONE) {
 286                 return NULLORDER;
 287             }
 288             ch = (char)ch_int;
 289             if (m_collator_.m_isHiragana4_) {
 290                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
 291                  * based on whether the previous codepoint was Hiragana or Katakana.
 292                  */
 293                 m_isCodePointHiragana_ = (m_isCodePointHiragana_ && (ch >= 0x3099 && ch <= 0x309C)) ||
 294                                          ((ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d));
 295             }
 296
 297             if (ch <= 0xFF) {
 298                 // For latin-1 characters we never need to fall back to the UCA
 299                 // table because all of the UCA data is replicated in the
 300                 // latinOneMapping array.
 301                 // Except: Special CEs can result in CE_NOT_FOUND_,
 302                 // for example if the default entry for a prefix-special is "not found",
 303                 // and we do need to fall back to the UCA in such a case.
 304                 // TODO: It would be better if tailoring specials never resulted in "not found"
 305                 // unless the corresponding UCA result is also "not found".
 306                 // That would require a change in the ICU4J collator-from-rule builder.
 307                 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
 308             } else {
 309                 result = m_collator_.m_trie_.getLeadValue(ch);
 310             }
 311             if (!RuleBasedCollator.isSpecial(result)) {
 312                 return result;
 313             }
 314             if (result != CE_NOT_FOUND_) {
 315                 result = nextSpecial(m_collator_, result, ch);
 316             }
 317             if (result == CE_NOT_FOUND_) {
 318                 // couldn't find a good CE in the tailoring
 319                 if (RuleBasedCollator.UCA_ != null) {
 320                     result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
 321                     if (RuleBasedCollator.isSpecial(result)) {
 322                         // UCA also gives us a special CE
 323                         result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
 324                     }
 325                 }
 326                 if(result == CE_NOT_FOUND_) {
 327                     // maybe there is no UCA, unlikely in Java, but ported for consistency
 328                     result = nextImplicit(ch);
 329                 }
 330             }
 331         } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
 332
 333         return result;
 334     }
 335
 336     /**
 337      * <p>Get the previous collation element in the source string.</p>
 338      *
 339      * <p>This iterator iterates over a sequence of collation elements
 340      * that were built from the string. Because there isn't
 341      * necessarily a one-to-one mapping from characters to collation
 342      * elements, this doesn't mean the same thing as "return the
 343      * collation element [or ordering priority] of the previous
 344      * character in the string".</p>
 345      *
 346      * <p>This function updates the iterator's internal pointer to
 347      * point to the collation element preceding the one it's currently
 348      * pointing to and then returns that element, while next() returns
 349      * the current element and then updates the pointer.</p>
 350      *
 351      * @return the previous collation element, or NULLORDER when the start of
 352      *             the iteration has been reached.
 353      * @stable ICU 2.8
 354      */
 355     public int previous()
 356     {
 357         assert m_direction <= 0;
 358         m_direction = -1;
 359
 360         if (m_source_.getIndex() <= 0 && m_isForwards_) {
 361             // if iterator is new or reset, we can immediate perform  backwards
 362             // iteration even when the offset is not right.
 363             m_source_.setToLimit();
 364             updateInternalState();
 365         }
 366         m_isForwards_ = false;
 367         if (m_CEBufferSize_ > 0) {
 368             if (m_CEBufferOffset_ > 0) {
 369                 return m_CEBuffer_[-- m_CEBufferOffset_];
 370             }
 371             m_CEBufferSize_ = 0;
 372             m_CEBufferOffset_ = 0;
 373         }
 374
 375         int result = NULLORDER;
 376         char ch = 0;
 377         do {
 378             int ch_int = previousChar();
 379             if (ch_int == UCharacterIterator.DONE) {
 380                 return NULLORDER;
 381             }
 382             ch = (char)ch_int;
 383             if (m_collator_.m_isHiragana4_) {
 384                 m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
 385             }
 386             if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
 387                 result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
 388             }
 389             else {
 390                 if (ch <= 0xFF) {
 391                     result = m_collator_.m_trie_.getLatin1LinearValue(ch);
 392                 }
 393                 else {
 394                     result = m_collator_.m_trie_.getLeadValue(ch);
 395                 }
 396                 if (RuleBasedCollator.isSpecial(result)) {
 397                     result = previousSpecial(m_collator_, result, ch);
 398                 }
 399                 if (result == CE_NOT_FOUND_) {
 400                     if (!isBackwardsStart()
 401                         && m_collator_.isContractionEnd(ch)) {
 402                         result = CE_CONTRACTION_;
 403                     }
 404                     else {
 405                         if(RuleBasedCollator.UCA_ != null) {
 406                             result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
 407                         }
 408                     }
 409
 410                     if (RuleBasedCollator.isSpecial(result)) {
 411                         if(RuleBasedCollator.UCA_ != null) {
 412                             result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
 413                         }
 414                     }
 415                 }
 416             }
 417         } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
 418         if(result == CE_NOT_FOUND_) {
 419             result = previousImplicit(ch);
 420         }
 421         return result;
 422     }
 423
 424     /**
 425      * Return the primary order of the specified collation element,
 426      * i.e. the first 16 bits.  This value is unsigned.
 427      * @param ce the collation element
 428      * @return the element's 16 bits primary order.
 429      * @stable ICU 2.8
 430      */
 431     public final static int primaryOrder(int ce)
 432     {
 433         return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
 434             >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
 435     }
 436     /**
 437      * Return the secondary order of the specified collation element,
 438      * i.e. the 16th to 23th bits, inclusive.  This value is unsigned.
 439      * @param ce the collation element
 440      * @return the element's 8 bits secondary order
 441      * @stable ICU 2.8
 442      */
 443     public final static int secondaryOrder(int ce)
 444     {
 445         return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
 446             >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
 447     }
 448
 449     /**
 450      * Return the tertiary order of the specified collation element, i.e. the last
 451      * 8 bits.  This value is unsigned.
 452      * @param ce the collation element
 453      * @return the element's 8 bits tertiary order
 454      * @stable ICU 2.8
 455      */
 456     public final static int tertiaryOrder(int ce)
 457     {
 458         return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
 459     }
 460
 461     /**
 462      * <p> Sets the iterator to point to the collation element
 463      * corresponding to the character at the specified offset. The
 464      * value returned by the next call to next() will be the collation
 465      * element corresponding to the characters at offset.</p>
 466      *
 467      * <p>If offset is in the middle of a contracting character
 468      * sequence, the iterator is adjusted to the start of the
 469      * contracting sequence. This means that getOffset() is not
 470      * guaranteed to return the same value set by this method.</p>
 471      *
 472      * <p>If the decomposition mode is on, and offset is in the middle
 473      * of a decomposible range of source text, the iterator may not
 474      * return a correct result for the next forwards or backwards
 475      * iteration.  The user must ensure that the offset is not in the
 476      * middle of a decomposible range.</p>
 477      *
 478      * @param offset the character offset into the original source string to
 479      *        set. Note that this is not an offset into the corresponding
 480      *        sequence of collation elements.
 481      * @stable ICU 2.8
 482      */
 483     public void setOffset(int offset)
 484     {
 485         m_direction = 0;    // reset to initial state
 486
 487         m_source_.setIndex(offset);
 488         int ch_int = m_source_.current();
 489         char ch = (char)ch_int;
 490         if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
 491             // if it is unsafe we need to check if it is part of a contraction
 492             // or a surrogate character
 493             if (UTF16.isTrailSurrogate(ch)) {
 494                 // if it is a surrogate pair we move up one character
 495                 char prevch = (char)m_source_.previous();
 496                 if (!UTF16.isLeadSurrogate(prevch)) {
 497                     m_source_.setIndex(offset); // go back to the same index
 498                 }
 499             }
 500             else {
 501                 // could be part of a contraction
 502                 // backup to a safe point and iterate till we pass offset
 503                 while (m_source_.getIndex() > 0) {
 504                     if (!m_collator_.isUnsafe(ch)) {
 505                         break;
 506                     }
 507                     ch = (char)m_source_.previous();
 508                 }
 509                 updateInternalState();
 510                 int prevoffset = 0;
 511                 while (m_source_.getIndex() <= offset) {
 512                     prevoffset = m_source_.getIndex();
 513                     next();
 514                 }
 515                 m_source_.setIndex(prevoffset);
 516             }
 517         }
 518         updateInternalState();
 519         // direction code to prevent next and previous from returning a
 520         // character if we are already at the ends
 521         offset = m_source_.getIndex();
 522         if (offset == 0/* m_source_.getBeginIndex() */) {
 523             // preventing previous() from returning characters from the end of
 524             // the string again if we are at the beginning
 525             m_isForwards_ = false;
 526         }
 527         else if (offset == m_source_.getLength()) {
 528             // preventing next() from returning characters from the start of
 529             // the string again if we are at the end
 530             m_isForwards_ = true;
 531         }
 532     }
 533
 534     /**
 535      * <p>Set a new source string for iteration, and reset the offset
 536      * to the beginning of the text.</p>
 537      *
 538      * @param source the new source string for iteration.
 539      * @stable ICU 2.8
 540      */
 541     public void setText(String source)
 542     {
 543         m_srcUtilIter_.setText(source);
 544         m_source_ = m_srcUtilIter_;
 545         updateInternalState();
 546
 547         m_direction = 0;   // reset to initial state
 548     }
 549
 550     /**
 551      * <p>Set a new source string iterator for iteration, and reset the
 552      * offset to the beginning of the text.
 553      * </p>
 554      * <p>The source iterator's integrity will be preserved since a new copy
 555      * will be created for use.</p>
 556      * @param source the new source string iterator for iteration.
 557      * @stable ICU 2.8
 558      */
 559     public void setText(UCharacterIterator source)
 560     {
 561         m_srcUtilIter_.setText(source.getText());
 562         m_source_ = m_srcUtilIter_;
 563         updateInternalState();
 564
 565         m_direction = 0;   // reset to initial state
 566     }
 567
 568     /**
 569      * <p>Set a new source string iterator for iteration, and reset the
 570      * offset to the beginning of the text.
 571      * </p>
 572      * @param source the new source string iterator for iteration.
 573      * @stable ICU 2.8
 574      */
 575     public void setText(CharacterIterator source)
 576     {
 577         m_source_ = new CharacterIteratorWrapper(source);
 578         m_source_.setToStart();
 579         updateInternalState();
 580
 581         m_direction = 0;   // reset to initial state
 582     }
 583
 584     // public miscellaneous methods -----------------------------------------
 585
 586     /**
 587      * Tests that argument object is equals to this CollationElementIterator.
 588      * Iterators are equal if the objects uses the same RuleBasedCollator,
 589      * the same source text and have the same current position in iteration.
 590      * @param that object to test if it is equals to this
 591      *             CollationElementIterator
 592      * @stable ICU 2.8
 593      */
 594     public boolean equals(Object that)
 595     {
 596         if (that == this) {
 597             return true;
 598         }
 599         if (that instanceof CollationElementIterator) {
 600             CollationElementIterator thatceiter
 601                                               = (CollationElementIterator)that;
 602             if (!m_collator_.equals(thatceiter.m_collator_)) {
 603                 return false;
 604             }
 605             // checks the text
 606             return m_source_.getIndex() == thatceiter.m_source_.getIndex()
 607                    && m_source_.getText().equals(
 608                                             thatceiter.m_source_.getText());
 609         }
 610         return false;
 611     }
 612
 613     /**
 614      * Mock implementation of hashCode(). This implementation always returns a constant
 615      * value. When Java assertion is enabled, this method triggers an assertion failure.
 616      * @internal
 617      * @deprecated This API is ICU internal only.
 618      */
 619     public int hashCode() {
 620         assert false : "hashCode not designed";
 621         return 42;
 622     }
 623
 624     // package private constructors ------------------------------------------
 625
 626     private CollationElementIterator(RuleBasedCollator collator) {
 627         m_utilStringBuffer_ = new StringBuilder();
 628         m_collator_ = collator;
 629         m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
 630         m_buffer_ = new StringBuilder();
 631         m_utilSpecialBackUp_ = new Backup();
 632     }
 633
 634     /**
 635      * <p>CollationElementIterator constructor. This takes a source
 636      * string and a RuleBasedCollator. The iterator will walk through
 637      * the source string based on the rules defined by the
 638      * collator. If the source string is empty, NULLORDER will be
 639      * returned on the first call to next().</p>
 640      *
 641      * @param source the source string.
 642      * @param collator the RuleBasedCollator
 643      * @stable ICU 2.8
 644      */
 645     CollationElementIterator(String source, RuleBasedCollator collator)
 646     {
 647         this(collator);
 648         m_source_ = m_srcUtilIter_ = new StringUCharacterIterator(source);
 649         updateInternalState();
 650     }
 651
 652     /**
 653      * <p>CollationElementIterator constructor. This takes a source
 654      * character iterator and a RuleBasedCollator. The iterator will
 655      * walk through the source string based on the rules defined by
 656      * the collator. If the source string is empty, NULLORDER will be
 657      * returned on the first call to next().</p>
 658      *
 659      * @param source the source string iterator.
 660      * @param collator the RuleBasedCollator
 661      * @stable ICU 2.8
 662      */
 663     CollationElementIterator(CharacterIterator source,
 664                              RuleBasedCollator collator)
 665     {
 666         this(collator);
 667         m_srcUtilIter_ = new StringUCharacterIterator();
 668         m_source_ = new CharacterIteratorWrapper(source);
 669         updateInternalState();
 670     }
 671
 672     /**
 673      * <p>CollationElementIterator constructor. This takes a source
 674      * character iterator and a RuleBasedCollator. The iterator will
 675      * walk through the source string based on the rules defined by
 676      * the collator. If the source string is empty, NULLORDER will be
 677      * returned on the first call to next().</p>
 678      *
 679      * @param source the source string iterator.
 680      * @param collator the RuleBasedCollator
 681      * @stable ICU 2.8
 682      */
 683     CollationElementIterator(UCharacterIterator source,
 684                              RuleBasedCollator collator)
 685     {
 686         this(collator);
 687         m_srcUtilIter_ = new StringUCharacterIterator();
 688         m_srcUtilIter_.setText(source.getText());
 689         m_source_ = m_srcUtilIter_;
 690         updateInternalState();
 691     }
 692
 693     // package private data members -----------------------------------------
 694
 695     /**
 696      * true if current codepoint was Hiragana
 697      */
 698     boolean m_isCodePointHiragana_;
 699     /**
 700      * Position in the original string that starts with a non-FCD sequence
 701      */
 702     int m_FCDStart_;
 703     /**
 704      * This is the CE from CEs buffer that should be returned.
 705      * Initial value is 0.
 706      * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
 707      * backwards will end with m_CEBufferOffset_ == 0.
 708      * The next/previous after we reach the end/beginning of the m_CEBuffer_
 709      * will cause this value to be reset to 0.
 710      */
 711     int m_CEBufferOffset_;
 712
 713     /**
 714      * This is the position to which we have stored processed CEs.
 715      * Initial value is 0.
 716      * The next/previous after we reach the end/beginning of the m_CEBuffer_
 717      * will cause this value to be reset to 0.
 718      */
 719     int m_CEBufferSize_;
 720     static final int CE_NOT_FOUND_ = 0xF0000000;
 721     static final int CE_EXPANSION_TAG_ = 1;
 722     static final int CE_CONTRACTION_TAG_ = 2;
 723     /**
 724      * Collate Digits As Numbers (CODAN) implementation
 725      */
 726     static final int CE_DIGIT_TAG_ = 13;
 727
 728     // package private methods ----------------------------------------------
 729
 730     /**
 731      * Sets the collator used.
 732      * Internal use, all data members will be reset to the default values
 733      * @param collator to set
 734      */
 735     void setCollator(RuleBasedCollator collator)
 736     {
 737         m_collator_ = collator;
 738         updateInternalState();
 739     }
 740
 741     /**
 742      * <p>Sets the iterator to point to the collation element corresponding to
 743      * the specified character (the parameter is a CHARACTER offset in the
 744      * original string, not an offset into its corresponding sequence of
 745      * collation elements). The value returned by the next call to next()
 746      * will be the collation element corresponding to the specified position
 747      * in the text. Unlike the public method setOffset(int), this method does
 748      * not try to readjust the offset to the start of a contracting sequence.
 749      * getOffset() is guaranteed to return the same value as was passed to a
 750      * preceding call to setOffset().</p>
 751      * @param offset new character offset into the original text to set.
 752      */
 753     void setExactOffset(int offset)
 754     {
 755         m_source_.setIndex(offset);
 756         updateInternalState();
 757
 758         m_direction = 0;    // reset to initial state
 759     }
 760
 761     /**
 762      * Checks if iterator is in the buffer zone
 763      * @return true if iterator is in buffer zone, false otherwise
 764      */
 765     boolean isInBuffer()
 766     {
 767         return m_bufferOffset_ > 0;
 768     }
 769
 770
 771     /**
 772      * <p>Sets the iterator to point to the collation element corresponding to
 773      * the specified character (the parameter is a CHARACTER offset in the
 774      * original string, not an offset into its corresponding sequence of
 775      * collation elements). The value returned by the next call to next()
 776      * will be the collation element corresponding to the specified position
 777      * in the text. Unlike the public method setOffset(int), this method does
 778      * not try to readjust the offset to the start of a contracting sequence.
 779      * getOffset() is guaranteed to return the same value as was passed to a
 780      * preceding call to setOffset().</p>
 781      * </p>
 782      * @param source the new source string iterator for iteration.
 783      * @param offset to the source
 784      */
 785     void setText(UCharacterIterator source, int offset)
 786     {
 787         m_srcUtilIter_.setText(source.getText());
 788         m_source_ = m_srcUtilIter_;
 789         m_source_.setIndex(offset);
 790         updateInternalState();
 791
 792         m_direction = 0;   // reset to initial state
 793     }
 794
 795     // private inner class --------------------------------------------------
 796
 797     /**
 798      * Backup data class
 799      */
 800     private static final class Backup
 801     {
 802         // protected data members -------------------------------------------
 803
 804         /**
 805          * Backup non FCD sequence limit
 806          */
 807         protected int m_FCDLimit_;
 808         /**
 809          * Backup non FCD sequence start
 810          */
 811         protected int m_FCDStart_;
 812         /**
 813          * Backup if previous Codepoint is Hiragana quatenary
 814          */
 815         protected boolean m_isCodePointHiragana_;
 816         /**
 817          * Backup buffer position
 818          */
 819         protected int m_bufferOffset_;
 820         /**
 821          * Backup source iterator offset
 822          */
 823         protected int m_offset_;
 824         /**
 825          * Backup buffer contents
 826          */
 827         protected StringBuffer m_buffer_;
 828
 829         // protected constructor --------------------------------------------
 830
 831         /**
 832          * Empty constructor
 833          */
 834         protected Backup()
 835         {
 836             m_buffer_ = new StringBuffer();
 837         }
 838     }
 839     // end inner class ------------------------------------------------------
 840
 841     /**
 842      * Direction of travel
 843      */
 844     private boolean m_isForwards_;
 845     /**
 846      * Source string iterator
 847      */
 848     private UCharacterIterator m_source_;
 849     /**
 850      * This is position to the m_buffer_, -1 if iterator is not in m_buffer_
 851      */
 852     private int m_bufferOffset_;
 853     /**
 854      * Buffer for temporary storage of normalized characters, discontiguous
 855      * characters and Thai characters
 856      */
 857     private StringBuilder m_buffer_;
 858     /**
 859      * Position in the original string to continue forward FCD check from.
 860      */
 861     private int m_FCDLimit_;
 862     /**
 863      * The collator this iterator is based on
 864      */
 865     private RuleBasedCollator m_collator_;
 866     /**
 867      * true if Hiragana quatenary is on
 868      */
 869     //private boolean m_isHiragana4_;
 870     /**
 871      * CE buffer
 872      */
 873     private int m_CEBuffer_[];
 874     /**
 875      * In reality we should not have to deal with expansion sequences longer
 876      * then 16. However this value can be change if a bigger buffer is needed.
 877      * Note, if the size is change to too small a number, BIG trouble.
 878      * Reasonable small value is around 10, if there's no Arabic or other
 879      * funky collations that have long expansion sequence. This is the longest
 880      * expansion sequence this can handle without bombing out.
 881      */
 882     private static final int CE_BUFFER_INIT_SIZE_ = 512;
 883     /**
 884      * Backup storage for special processing inner cases
 885      */
 886     private Backup m_utilSpecialBackUp_;
 887     /**
 888      * Backup storage in special processing entry state
 889      */
 890     private Backup m_utilSpecialEntryBackUp_;
 891     /**
 892      * Backup storage in special processing discontiguous state
 893      */
 894     private Backup m_utilSpecialDiscontiguousBackUp_;
 895     /**
 896      * Utility
 897      */
 898     private StringUCharacterIterator m_srcUtilIter_;
 899     private StringBuilder m_utilStringBuffer_;
 900     private StringBuilder m_utilSkippedBuffer_;
 901     private CollationElementIterator m_utilColEIter_;
 902     private static final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
 903     private StringBuilder m_unnormalized_;
 904     private Normalizer2Impl.ReorderingBuffer m_n2Buffer_;
 905     /**
 906      * The first non-zero combining class character
 907      */
 908     private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
 909     /**
 910      * One character before the first character with leading non-zero combining
 911      * class
 912      */
 913     private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
 914     /**
 915      * Mask for the last byte
 916      */
 917     private static final int LAST_BYTE_MASK_ = 0xFF;
 918     /**
 919      * Shift value for the second last byte
 920      */
 921     private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
 922
 923     // special ce values and tags -------------------------------------------
 924
 925 //    private static final int CE_EXPANSION_ = 0xF1000000;
 926     private static final int CE_CONTRACTION_ = 0xF2000000;
 927     /**
 928      * Indicates the last ce has been consumed. Compare with NULLORDER.
 929      * NULLORDER is returned if error occurs.
 930      */
 931 /*    private static final int CE_NO_MORE_CES_ = 0x00010101;
 932     private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
 933     private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
 934     private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
 935 */
 936     private static final int CE_NOT_FOUND_TAG_ = 0;
 937     /**
 938      * Charset processing, not yet implemented
 939      */
 940     private static final int CE_CHARSET_TAG_ = 4;
 941     /**
 942      * AC00-D7AF
 943      */
 944     private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
 945     /**
 946      * D800-DBFF
 947      */
 948     private static final int CE_LEAD_SURROGATE_TAG_ = 7;
 949     /**
 950      * DC00-DFFF
 951      */
 952     private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
 953     /**
 954      * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
 955      */
 956     private static final int CE_CJK_IMPLICIT_TAG_ = 9;
 957     private static final int CE_IMPLICIT_TAG_ = 10;
 958     static final int CE_SPEC_PROC_TAG_ = 11;
 959     /**
 960      * This is a 3 byte primary with starting secondaries and tertiaries.
 961      * It fits in a single 32 bit CE and is used instead of expansion to save
 962      * space without affecting the performance (hopefully).
 963      */
 964     private static final int CE_LONG_PRIMARY_TAG_ = 12;
 965
 966 //    private static final int CE_CE_TAGS_COUNT = 14;
 967     private static final int CE_BYTE_COMMON_ = 0x05;
 968
 969     // end special ce values and tags ---------------------------------------
 970
 971     private static final int HANGUL_SBASE_ = 0xAC00;
 972     private static final int HANGUL_LBASE_ = 0x1100;
 973     private static final int HANGUL_VBASE_ = 0x1161;
 974     private static final int HANGUL_TBASE_ = 0x11A7;
 975     private static final int HANGUL_VCOUNT_ = 21;
 976     private static final int HANGUL_TCOUNT_ = 28;
 977
 978     // CJK stuff ------------------------------------------------------------
 979
 980 /*    private static final int CJK_BASE_ = 0x4E00;
 981     private static final int CJK_LIMIT_ = 0x9FFF+1;
 982     private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
 983     private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
 984     private static final int CJK_A_BASE_ = 0x3400;
 985     private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
 986     private static final int CJK_B_BASE_ = 0x20000;
 987     private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
 988     private static final int NON_CJK_OFFSET_ = 0x110000;
 989 */
 990     private static final boolean DEBUG  =  ICUDebug.enabled("collator");
 991
 992     // Field tracking the current direction. This field was added
 993     // just for making sure that reset()/setOffset()/setText() is called
 994     // before switching the iterator direction.
 995     // We used to allow changing direction without calling reset()/setOffset()
 996     // setText() in ICU4J, but the API specification was updated to match the
 997     // ICU4C's specification. The current implementation seems to handle
 998     // direction change (or not), but it will be completely replaced with
 999     // a new implementation not allowing this. Until then, we use this field
1000     // to trigger assertion and make sure our implementation is not depending on
1001     // the assumption. See ticket#9104.
1002     private byte m_direction = 0;   // -1: backward, 0: initial state, 1: forward
1003
1004     // private methods ------------------------------------------------------
1005
1006     /**
1007      * Reset the iterator internally
1008      */
1009     private void updateInternalState()
1010     {
1011         m_isCodePointHiragana_ = false;
1012         m_buffer_.setLength(0);
1013         m_bufferOffset_ = -1;
1014         m_CEBufferOffset_ = 0;
1015         m_CEBufferSize_ = 0;
1016         m_FCDLimit_ = -1;
1017         m_FCDStart_ = m_source_.getLength();
1018         //m_isHiragana4_ = m_collator_.m_isHiragana4_;
1019         m_isForwards_ = true;
1020     }
1021
1022     /**
1023      * Backup the current internal state
1024      * @param backup object to store the data
1025      */
1026     private void backupInternalState(Backup backup)
1027     {
1028         backup.m_offset_ = m_source_.getIndex();
1029         backup.m_FCDLimit_ = m_FCDLimit_;
1030         backup.m_FCDStart_ = m_FCDStart_;
1031         backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
1032         backup.m_bufferOffset_ = m_bufferOffset_;
1033         backup.m_buffer_.setLength(0);
1034         if (m_bufferOffset_ >= 0) {
1035             backup.m_buffer_.append(m_buffer_);
1036         }
1037     }
1038
1039     /**
1040      * Update the iterator internally with backed-up state
1041      * @param backup object that stored the data
1042      */
1043     private void updateInternalState(Backup backup)
1044     {
1045         m_source_.setIndex(backup.m_offset_);
1046         m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
1047         m_bufferOffset_ = backup.m_bufferOffset_;
1048         m_FCDLimit_ = backup.m_FCDLimit_;
1049         m_FCDStart_ = backup.m_FCDStart_;
1050         m_buffer_.setLength(0);
1051         if (m_bufferOffset_ >= 0) {
1052             m_buffer_.append(backup.m_buffer_);
1053         }
1054     }
1055
1056     /**
1057      * A fast combining class retrieval system.
1058      * @param ch UTF16 character
1059      * @return combining class of ch
1060      */
1061     private int getCombiningClass(int ch)
1062     {
1063         if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
1064             m_collator_.isUnsafe((char)ch) || ch > 0xFFFF
1065         ) {
1066             return m_nfcImpl_.getCC(m_nfcImpl_.getNorm16(ch));
1067         }
1068         return 0;
1069     }
1070
1071     /**
1072      * <p>Incremental normalization, this is an essential optimization.
1073      * Assuming FCD checks has been done, normalize the non-FCD characters into
1074      * the buffer.
1075      * Source offsets points to the current processing character.
1076      * </p>
1077      */
1078     private void normalize()
1079     {
1080         if (m_unnormalized_ == null) {
1081             m_unnormalized_ = new StringBuilder();
1082             m_n2Buffer_ = new Normalizer2Impl.ReorderingBuffer(m_nfcImpl_, m_buffer_, 10);
1083         } else {
1084             m_unnormalized_.setLength(0);
1085             m_n2Buffer_.remove();
1086         }
1087         int size = m_FCDLimit_ - m_FCDStart_;
1088         m_source_.setIndex(m_FCDStart_);
1089         for (int i = 0; i < size; i ++) {
1090             m_unnormalized_.append((char)m_source_.next());
1091         }
1092         m_nfcImpl_.decomposeShort(m_unnormalized_, 0, size, m_n2Buffer_);
1093     }
1094
1095     /**
1096      * <p>Incremental FCD check and normalization. Gets the next base character
1097      * position and determines if the in-between characters needs normalization.
1098      * </p>
1099      * <p>When entering, the state is known to be this:
1100      * <ul>
1101      * <li>We are working on source string, not the buffer.
1102      * <li>The leading combining class from the current character is 0 or the
1103      *     trailing combining class of the previous char was zero.
1104      * </ul>
1105      * Incoming source offsets points to the current processing character.
1106      * Return source offsets points to the current processing character.
1107      * </p>
1108      * @param ch current character (lead unit)
1109      * @param offset offset of ch +1
1110      * @return true if FCDCheck passes, false otherwise
1111      */
1112     private boolean FCDCheck(int ch, int offset)
1113     {
1114         boolean result = true;
1115
1116         // Get the trailing combining class of the current character.
1117         // If it's zero, we are OK.
1118         m_FCDStart_ = offset - 1;
1119         m_source_.setIndex(offset);
1120         // trie access
1121         int fcd;
1122         if (ch < 0x180) {
1123             fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
1124         } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
1125             if (Character.isHighSurrogate((char)ch)) {
1126                 int c2 = m_source_.next();
1127                 if (c2 < 0) {
1128                     fcd = 0;  // end of input
1129                 } else if (Character.isLowSurrogate((char)c2)) {
1130                     fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint((char)ch, (char)c2));
1131                 } else {
1132                     m_source_.moveIndex(-1);
1133                     fcd = 0;
1134                 }
1135             } else {
1136                 fcd = m_nfcImpl_.getFCD16FromNormData(ch);
1137             }
1138         } else {
1139             fcd = 0;
1140         }
1141
1142         int prevTrailCC = fcd & LAST_BYTE_MASK_;
1143
1144         if (prevTrailCC == 0) {
1145             offset = m_source_.getIndex();
1146         } else {
1147             // The current char has a non-zero trailing CC. Scan forward until
1148             // we find a char with a leading cc of zero.
1149             while (true) {
1150                 ch = m_source_.nextCodePoint();
1151                 if (ch < 0) {
1152                     offset = m_source_.getIndex();
1153                     break;
1154                 }
1155                 // trie access
1156                 fcd = m_nfcImpl_.getFCD16(ch);
1157                 int leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1158                 if (leadCC == 0) {
1159                     // this is a base character, we stop the FCD checks
1160                     offset = m_source_.getIndex() - Character.charCount(ch);
1161                     break;
1162                 }
1163
1164                 if (leadCC < prevTrailCC) {
1165                     result = false;
1166                 }
1167
1168                 prevTrailCC = fcd & LAST_BYTE_MASK_;
1169             }
1170         }
1171         m_FCDLimit_ = offset;
1172         m_source_.setIndex(m_FCDStart_ + 1);
1173         return result;
1174     }
1175
1176     /**
1177      * <p>Method tries to fetch the next character that is in fcd form.</p>
1178      * <p>Normalization is done if required.</p>
1179      * <p>Offsets are returned at the next character.</p>
1180      * @return next fcd character
1181      */
1182     private int nextChar()
1183     {
1184         int result;
1185
1186         // loop handles the next character whether it is in the buffer or not.
1187         if (m_bufferOffset_ < 0) {
1188             // we're working on the source and not normalizing. fast path.
1189             // note Thai pre-vowel reordering uses buffer too
1190             result = m_source_.next();
1191         }
1192         else {
1193             // we are in the buffer, buffer offset will never be 0 here
1194             if (m_bufferOffset_ >= m_buffer_.length()) {
1195                 // Null marked end of buffer, revert to the source string and
1196                 // loop back to top to try again to get a character.
1197                 m_source_.setIndex(m_FCDLimit_);
1198                 m_bufferOffset_ = -1;
1199                 m_buffer_.setLength(0);
1200                 return nextChar();
1201             }
1202             return m_buffer_.charAt(m_bufferOffset_ ++);
1203         }
1204         int startoffset = m_source_.getIndex();
1205         if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
1206             // Fast fcd safe path. trail combining class == 0.
1207             || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1208             || m_bufferOffset_ >= 0 || m_FCDLimit_ >= startoffset) {
1209             // skip the fcd checks
1210             return result;
1211         }
1212
1213         if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1214             // We need to peek at the next character in order to tell if we are
1215             // FCD
1216             int next = m_source_.current();
1217             if (next == UCharacterIterator.DONE
1218                 || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1219                 return result; // end of source string and if next character
1220                 // starts with a base character is always fcd.
1221             }
1222         }
1223
1224         // Need a more complete FCD check and possible normalization.
1225         if (!FCDCheck(result, startoffset)) {
1226             normalize();
1227             result = m_buffer_.charAt(0);
1228             m_bufferOffset_ = 1;
1229         }
1230         return result;
1231     }
1232
1233     /**
1234      * <p>Incremental normalization, this is an essential optimization.
1235      * Assuming FCD checks has been done, normalize the non-FCD characters into
1236      * the buffer.
1237      * Source offsets points to the current processing character.</p>
1238      */
1239     private void normalizeBackwards()
1240     {
1241         normalize();
1242         m_bufferOffset_ = m_buffer_.length();
1243     }
1244
1245     /**
1246      * <p>Incremental backwards FCD check and normalization. Gets the previous
1247      * base character position and determines if the in-between characters
1248      * needs normalization.
1249      * </p>
1250      * <p>When entering, the state is known to be this:
1251      * <ul>
1252      * <li>We are working on source string, not the buffer.
1253      * <li>The trailing combining class from the current character is 0 or the
1254      *     leading combining class of the next char was zero.
1255      * </ul>
1256      * Input source offsets points to the previous character.
1257      * Return source offsets points to the current processing character.
1258      * </p>
1259      * @param ch current character
1260      * @param offset current character offset
1261      * @return true if FCDCheck passes, false otherwise
1262      */
1263     private boolean FCDCheckBackwards(int ch, int offset)
1264     {
1265         int fcd;
1266         m_FCDLimit_ = offset + 1;
1267         m_source_.setIndex(offset);
1268         if (ch < 0x180) {
1269             fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
1270         } else if (!Character.isLowSurrogate((char)ch)) {
1271             if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
1272                 fcd = m_nfcImpl_.getFCD16FromNormData(ch);
1273             } else {
1274                 fcd = 0;
1275             }
1276         } else {
1277             int c2 = m_source_.previous();
1278             if (c2 < 0) {
1279                 fcd = 0;  // start of input
1280             } else if (Character.isHighSurrogate((char)c2)) {
1281                 ch = Character.toCodePoint((char)c2, (char)ch);
1282                 fcd = m_nfcImpl_.getFCD16FromNormData(ch);
1283                 --offset;
1284             } else {
1285                 m_source_.moveIndex(1);
1286                 fcd = 0;
1287             }
1288         }
1289
1290         // Scan backward until we find a char with a leading cc of zero.
1291         boolean result = true;
1292         if (fcd != 0) {
1293             int leadCC;
1294             for (;;) {
1295                 leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1296                 if (leadCC == 0 || (ch = m_source_.previousCodePoint()) < 0) {
1297                     offset = m_source_.getIndex();
1298                     break;
1299                 }
1300                 fcd = m_nfcImpl_.getFCD16(ch);
1301                 int prevTrailCC = fcd & LAST_BYTE_MASK_;
1302                 if (leadCC < prevTrailCC) {
1303                     result = false;
1304                 } else if (fcd == 0) {
1305                     offset = m_source_.getIndex() + Character.charCount(ch);
1306                     break;
1307                 }
1308             }
1309         }
1310
1311         // storing character with 0 lead fcd or the 1st accent with a base
1312         // character before it
1313         m_FCDStart_ = offset;
1314         m_source_.setIndex(m_FCDLimit_);
1315         return result;
1316     }
1317
1318     /**
1319      * <p>Method tries to fetch the previous character that is in fcd form.</p>
1320      * <p>Normalization is done if required.</p>
1321      * <p>Offsets are returned at the current character.</p>
1322      * @return previous fcd character
1323      */
1324     private int previousChar()
1325     {
1326         if (m_bufferOffset_ >= 0) {
1327             m_bufferOffset_ --;
1328             if (m_bufferOffset_ >= 0) {
1329                 return m_buffer_.charAt(m_bufferOffset_);
1330             }
1331             else {
1332                 // At the start of buffer, route back to string.
1333                 m_buffer_.setLength(0);
1334                 if (m_FCDStart_ == 0) {
1335                     m_FCDStart_ = -1;
1336                     m_source_.setIndex(0);
1337                     return UCharacterIterator.DONE;
1338                 }
1339                 else {
1340                     m_FCDLimit_ = m_FCDStart_;
1341                     m_source_.setIndex(m_FCDStart_);
1342                     return previousChar();
1343                 }
1344             }
1345         }
1346         int result = m_source_.previous();
1347         int startoffset = m_source_.getIndex();
1348         if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
1349             || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1350             || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
1351             return result;
1352         }
1353         int ch = m_source_.previous();
1354         if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1355             // if previous character is FCD
1356             m_source_.next();
1357             return result;
1358         }
1359         // Need a more complete FCD check and possible normalization.
1360         if (!FCDCheckBackwards(result, startoffset)) {
1361             normalizeBackwards();
1362             m_bufferOffset_ --;
1363             result = m_buffer_.charAt(m_bufferOffset_);
1364         }
1365         else {
1366             // fcd checks always reset m_source_ to the limit of the FCD
1367             m_source_.setIndex(startoffset);
1368         }
1369         return result;
1370     }
1371
1372     /**
1373      * Determines if it is at the start of source iteration
1374      * @return true if iterator at the start, false otherwise
1375      */
1376     private final boolean isBackwardsStart()
1377     {
1378         return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
1379             || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
1380     }
1381
1382     /**
1383      * Checks if iterator is at the end of its source string.
1384      * @return true if it is at the end, false otherwise
1385      */
1386     private final boolean isEnd()
1387     {
1388         if (m_bufferOffset_ >= 0) {
1389             if (m_bufferOffset_ != m_buffer_.length()) {
1390                 return false;
1391             }
1392             else {
1393                 // at end of buffer. check if fcd is at the end
1394                 return m_FCDLimit_ == m_source_.getLength();
1395             }
1396         }
1397         return m_source_.getLength() == m_source_.getIndex();
1398     }
1399
1400     /**
1401      * <p>Special CE management for surrogates</p>
1402      * <p>Lead surrogate is encountered. CE to be retrieved by using the
1403      * following code unit. If the next code unit is a trail surrogate, both
1404      * units will be combined to retrieve the CE,
1405      * otherwise we treat it like an unassigned code point.</p>
1406      * @param collator collator to use
1407      * @param ce current CE
1408      * @param trail character
1409      * @return next CE for the surrogate characters
1410      */
1411     private final int nextSurrogate(RuleBasedCollator collator, int ce,
1412                                     char trail)
1413     {
1414         if (!UTF16.isTrailSurrogate(trail)) {
1415             updateInternalState(m_utilSpecialBackUp_);
1416             return CE_NOT_FOUND_;
1417         }
1418         // TODO: CE contain the data from the previous CE + the mask.
1419         // It should at least be unmasked
1420         int result = collator.m_trie_.getTrailValue(ce, trail);
1421         if (result == CE_NOT_FOUND_) {
1422             updateInternalState(m_utilSpecialBackUp_);
1423         }
1424         return result;
1425     }
1426
1427     /**
1428      * Gets the CE expansion offset
1429      * @param collator current collator
1430      * @param ce ce to test
1431      * @return expansion offset
1432      */
1433     private int getExpansionOffset(RuleBasedCollator collator, int ce)
1434     {
1435         return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
1436     }
1437
1438
1439     /**
1440      * Gets the contraction ce offset
1441      * @param collator current collator
1442      * @param ce current ce
1443      * @return contraction offset
1444      */
1445     private int getContractionOffset(RuleBasedCollator collator, int ce)
1446     {
1447         return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
1448     }
1449
1450     /**
1451      * Checks if CE is a special tag CE
1452      * @param ce to check
1453      * @return true if CE is a special tag CE, false otherwise
1454      */
1455     private boolean isSpecialPrefixTag(int ce)
1456     {
1457         return RuleBasedCollator.isSpecial(ce) &&
1458             RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
1459     }
1460
1461     /**
1462      * <p>Special processing getting a CE that is preceded by a certain
1463      * prefix.</p>
1464      * <p>Used for optimizing Japanese length and iteration marks. When a
1465      * special processing tag is encountered, iterate backwards to see if
1466      * there's a match.</p>
1467      * <p>Contraction tables are used, prefix data is stored backwards in the
1468      * table.</p>
1469      * @param collator collator to use
1470      * @param ce current ce
1471      * @param entrybackup entry backup iterator status
1472      * @return next collation element
1473      */
1474     private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
1475                                   Backup entrybackup)
1476     {
1477         backupInternalState(m_utilSpecialBackUp_);
1478         updateInternalState(entrybackup);
1479         previousChar();
1480         // We want to look at the character where we entered
1481
1482         while (true) {
1483             // This loop will run once per source string character, for as
1484             // long as we are matching a potential contraction sequence
1485             // First we position ourselves at the begining of contraction
1486             // sequence
1487             int entryoffset = getContractionOffset(collator, ce);
1488             int offset = entryoffset;
1489             if (isBackwardsStart()) {
1490                 ce = collator.m_contractionCE_[offset];
1491                 break;
1492             }
1493             char previous = (char)previousChar();
1494             while (previous > collator.m_contractionIndex_[offset]) {
1495                 // contraction characters are ordered, skip smaller characters
1496                 offset ++;
1497             }
1498
1499             if (previous == collator.m_contractionIndex_[offset]) {
1500                 // Found the source string char in the table.
1501                 // Pick up the corresponding CE from the table.
1502                 ce = collator.m_contractionCE_[offset];
1503             }
1504             else {
1505                 // Source string char was not in the table, prefix not found
1506                 ce = collator.m_contractionCE_[entryoffset];
1507             }
1508
1509             if (!isSpecialPrefixTag(ce)) {
1510                 // The source string char was in the contraction table, and
1511                 // the corresponding CE is not a prefix CE. We found the
1512                 // prefix, break out of loop, this CE will end up being
1513                 // returned. This is the normal way out of prefix handling
1514                 // when the source actually contained the prefix.
1515                 break;
1516             }
1517         }
1518         if (ce != CE_NOT_FOUND_) {
1519             // we found something and we can merilly continue
1520             updateInternalState(m_utilSpecialBackUp_);
1521         }
1522         else { // prefix search was a failure, we have to backup all the way to
1523             // the start
1524             updateInternalState(entrybackup);
1525         }
1526         return ce;
1527     }
1528
1529     /**
1530      * Checks if the ce is a contraction tag
1531      * @param ce ce to check
1532      * @return true if ce is a contraction tag, false otherwise
1533      */
1534     private boolean isContractionTag(int ce)
1535     {
1536         return RuleBasedCollator.isSpecial(ce) &&
1537             RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
1538     }
1539
1540     /**
1541      * Method to copy skipped characters into the buffer and sets the fcd
1542      * position. To ensure that the skipped characters are considered later,
1543      * we need to place it in the appropriate position in the buffer and
1544      * reassign the source index. simple case if index reside in string,
1545      * simply copy to buffer and fcdposition = pos, pos = start of buffer.
1546      * if pos in normalization buffer, we'll insert the copy infront of pos
1547      * and point pos to the start of the buffer. why am i doing these copies?
1548      * well, so that the whole chunk of codes in the getNextCE,
1549      * ucol_prv_getSpecialCE does not require any changes, which will be
1550      * really painful.
1551      * @param skipped character buffer
1552      */
1553     private void setDiscontiguous(StringBuilder skipped)
1554     {
1555         if (m_bufferOffset_ >= 0) {
1556             m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
1557         }
1558         else {
1559             m_FCDLimit_ = m_source_.getIndex();
1560             m_buffer_.setLength(0);
1561             m_buffer_.append(skipped.toString());
1562         }
1563
1564         m_bufferOffset_ = 0;
1565     }
1566
1567     /**
1568      * Returns the current character for forward iteration
1569      * @return current character
1570      */
1571     private int currentChar()
1572     {
1573         if (m_bufferOffset_ < 0) {
1574             m_source_.previousCodePoint();
1575             return m_source_.nextCodePoint();
1576         }
1577
1578         // m_bufferOffset_ is never 0 in normal circumstances except after a
1579         // discontiguous contraction since it is always returned and moved
1580         // by 1 when we do nextChar()
1581         return UTF16.charAt(m_buffer_, m_bufferOffset_ - 1);
1582     }
1583
1584     /**
1585      * Method to get the discontiguous collation element within the source.
1586      * Note this function will set the position to the appropriate places.
1587      * Passed in character offset points to the second combining character
1588      * after the start character.
1589      * @param collator current collator used
1590      * @param entryoffset index to the start character in the contraction table
1591      * @return discontiguous collation element offset
1592      */
1593     private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
1594     {
1595         int offset = entryoffset;
1596         boolean multicontraction = false;
1597         // since it will be stuffed into this iterator and ran over again
1598         if (m_utilSkippedBuffer_ == null) {
1599             m_utilSkippedBuffer_ = new StringBuilder();
1600         }
1601         else {
1602             m_utilSkippedBuffer_.setLength(0);
1603         }
1604         int ch = currentChar();
1605         m_utilSkippedBuffer_.appendCodePoint(ch);
1606         int prevCC = 0;
1607         int cc = getCombiningClass(ch);
1608         // accent after the first character
1609         if (m_utilSpecialDiscontiguousBackUp_ == null) {
1610             m_utilSpecialDiscontiguousBackUp_ = new Backup();
1611         }
1612         backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1613         boolean prevWasLead = false;
1614         while (true) {
1615             // We read code units for contraction table matching
1616             // but have to get combining classes for code points
1617             // to figure out where to stop with discontiguous contraction.
1618             int ch_int = nextChar();
1619             char nextch = (char)ch_int;
1620             if (UTF16.isSurrogate(nextch)) {
1621                 if (prevWasLead) {
1622                     // trail surrogate of surrogate pair, keep previous and current cc
1623                     prevWasLead = false;
1624                 } else {
1625                     prevCC = cc;
1626                     cc = 0;  // default cc for an unpaired surrogate
1627                     prevWasLead = false;
1628                     if (Character.isHighSurrogate(nextch)) {
1629                         int trail = nextChar();
1630                         if (Character.isLowSurrogate((char)trail)) {
1631                             cc = getCombiningClass(Character.toCodePoint(nextch, (char)trail));
1632                             prevWasLead = true;
1633                         }
1634                         if (trail >= 0) {
1635                             previousChar();  // restore index after having peeked at the next code unit
1636                         }
1637                     }
1638                 }
1639             } else {
1640                 prevCC = cc;
1641                 cc = getCombiningClass(ch_int);
1642                 prevWasLead = false;
1643             }
1644             if (ch_int < 0 || cc == 0) {
1645                 // if there are no more accents to move around
1646                 // we don't have to shift previousChar, since we are resetting
1647                 // the offset later
1648                 if (multicontraction) {
1649                     if (ch_int >= 0) {
1650                         previousChar(); // backtrack
1651                     }
1652                     setDiscontiguous(m_utilSkippedBuffer_);
1653                     return collator.m_contractionCE_[offset];
1654                 }
1655                 break;
1656             }
1657
1658             offset ++; // skip the combining class offset
1659             while ((offset < collator.m_contractionIndex_.length) &&
1660                    (nextch > collator.m_contractionIndex_[offset])) {
1661                 offset ++;
1662             }
1663
1664             int ce = CE_NOT_FOUND_;
1665             if ( offset >= collator.m_contractionIndex_.length)  {
1666                 break;
1667             }
1668             if (nextch != collator.m_contractionIndex_[offset] || cc == prevCC) {
1669                     // unmatched or blocked character
1670                 if ( (m_utilSkippedBuffer_.length()!= 1) ||
1671                      ((m_utilSkippedBuffer_.charAt(0)!= nextch) &&
1672                       (m_bufferOffset_<0) )) { // avoid push to skipped buffer twice
1673                     m_utilSkippedBuffer_.append(nextch);
1674                 }
1675                 offset = entryoffset;  // Restore the offset before checking next character.
1676                 continue;
1677             }
1678             else {
1679                 ce = collator.m_contractionCE_[offset];
1680             }
1681
1682             if (ce == CE_NOT_FOUND_) {
1683                 break;
1684             }
1685             else if (isContractionTag(ce)) {
1686                 // this is a multi-contraction
1687                 offset = getContractionOffset(collator, ce);
1688                 if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
1689                     multicontraction = true;
1690                     backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1691                 }
1692             }
1693             else {
1694                 setDiscontiguous(m_utilSkippedBuffer_);
1695                 return ce;
1696             }
1697         }
1698
1699         updateInternalState(m_utilSpecialDiscontiguousBackUp_);
1700         // backup is one forward of the base character, we need to move back
1701         // one more
1702         previousChar();
1703         return collator.m_contractionCE_[entryoffset];
1704     }
1705
1706     /**
1707      * Gets the next contraction ce
1708      * @param collator collator to use
1709      * @param ce current ce
1710      * @return ce of the next contraction
1711      */
1712     private int nextContraction(RuleBasedCollator collator, int ce)
1713     {
1714         backupInternalState(m_utilSpecialBackUp_);
1715         int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
1716         while (true) {
1717             int entryoffset = getContractionOffset(collator, ce);
1718             int offset = entryoffset;
1719
1720             if (isEnd()) {
1721                 ce = collator.m_contractionCE_[offset];
1722                 if (ce == CE_NOT_FOUND_) {
1723                     // back up the source over all the chars we scanned going
1724                     // into this contraction.
1725                     ce = entryce;
1726                     updateInternalState(m_utilSpecialBackUp_);
1727                 }
1728                 break;
1729             }
1730
1731             // get the discontiguos maximum combining class
1732             int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
1733             // checks if all characters have the same combining class
1734             byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
1735             char ch = (char)nextChar();
1736             offset ++;
1737             while (ch > collator.m_contractionIndex_[offset]) {
1738                 // contraction characters are ordered, skip all smaller
1739                 offset ++;
1740             }
1741
1742             if (ch == collator.m_contractionIndex_[offset]) {
1743                 // Found the source string char in the contraction table.
1744                 //  Pick up the corresponding CE from the table.
1745                 ce = collator.m_contractionCE_[offset];
1746             }
1747             else {
1748                 // Source string char was not in contraction table.
1749                 // Unless it is a discontiguous contraction, we are done
1750                 int miss = ch;
1751                 // ticket 8484 - porting changes from C for 6101
1752                 // We test whether the next two char are surrogate pairs.
1753                 // This test is done if the iterator is not in the end.
1754                 // If there is no surrogate pair, the iterator
1755                 // goes back one if needed.
1756                 if(UTF16.isLeadSurrogate(ch) && !isEnd()) {
1757                     char surrNextChar = (char)nextChar();
1758                     if (UTF16.isTrailSurrogate(surrNextChar)) {
1759                         miss = UCharacterProperty.getRawSupplementary(ch, surrNextChar);
1760                     } else {
1761                         previousChar();
1762                     }
1763                 }
1764                 int sCC;
1765                 if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
1766                     || sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
1767                     isEnd()) {
1768                     // Contraction can not be discontiguous, back up by one
1769                     previousChar();
1770                     if(miss > 0xFFFF) {
1771                         previousChar();
1772                     }
1773                     ce = collator.m_contractionCE_[entryoffset];
1774                 }
1775                 else {
1776                     // Contraction is possibly discontiguous.
1777                     // find the next character if ch is not a base character
1778                     int ch_int = nextChar();
1779                     if (ch_int != UCharacterIterator.DONE) {
1780                         previousChar();
1781                     }
1782                     char nextch = (char)ch_int;
1783                     if (getCombiningClass(nextch) == 0) {
1784                         previousChar();
1785                         if(miss > 0xFFFF) {
1786                             previousChar();
1787                         }
1788                         // base character not part of discontiguous contraction
1789                         ce = collator.m_contractionCE_[entryoffset];
1790                     }
1791                     else {
1792                         ce = nextDiscontiguous(collator, entryoffset);
1793                     }
1794                 }
1795             }
1796
1797             if (ce == CE_NOT_FOUND_) {
1798                 // source did not match the contraction, revert back original
1799                 updateInternalState(m_utilSpecialBackUp_);
1800                 ce = entryce;
1801                 break;
1802             }
1803
1804             // source was a contraction
1805             if (!isContractionTag(ce)) {
1806                 break;
1807             }
1808
1809             // ccontinue looping to check for the remaining contraction.
1810             if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
1811                 // there are further contractions to be performed, so we store
1812                 // the so-far completed ce, so that if we fail in the next
1813                 // round we just return this one.
1814                 entryce = collator.m_contractionCE_[entryoffset];
1815                 backupInternalState(m_utilSpecialBackUp_);
1816                 if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
1817                     m_utilSpecialBackUp_.m_bufferOffset_ --;
1818                 }
1819                 else {
1820                     m_utilSpecialBackUp_.m_offset_ --;
1821                 }
1822             }
1823         }
1824         return ce;
1825     }
1826
1827     /**
1828      * Gets the next ce for long primaries, stuffs the rest of the collation
1829      * elements into the ce buffer
1830      * @param ce current ce
1831      * @return next ce
1832      */
1833     private int nextLongPrimary(int ce)
1834     {
1835         m_CEBuffer_[1] = ((ce & 0xFF) << 24)
1836             | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1837         m_CEBufferOffset_ = 1;
1838         m_CEBufferSize_ = 2;
1839         m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
1840             CE_BYTE_COMMON_;
1841         return m_CEBuffer_[0];
1842     }
1843
1844     /**
1845      * Gets the number of expansion
1846      * @param ce current ce
1847      * @return number of expansion
1848      */
1849     private int getExpansionCount(int ce)
1850     {
1851         return ce & 0xF;
1852     }
1853
1854     /**
1855      * Gets the next expansion ce and stuffs the rest of the collation elements
1856      * into the ce buffer
1857      * @param collator current collator
1858      * @param ce current ce
1859      * @return next expansion ce
1860      */
1861     private int nextExpansion(RuleBasedCollator collator, int ce)
1862     {
1863         // NOTE: we can encounter both continuations and expansions in an
1864         // expansion!
1865         // I have to decide where continuations are going to be dealt with
1866         int offset = getExpansionOffset(collator, ce);
1867         m_CEBufferSize_ = getExpansionCount(ce);
1868         m_CEBufferOffset_ = 1;
1869         m_CEBuffer_[0] = collator.m_expansion_[offset];
1870         if (m_CEBufferSize_ != 0) {
1871             // if there are less than 16 elements in expansion
1872             for (int i = 1; i < m_CEBufferSize_; i ++) {
1873                 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
1874             }
1875         }
1876         else {
1877             // ce are terminated
1878             m_CEBufferSize_ = 1;
1879             while (collator.m_expansion_[offset] != 0) {
1880                 m_CEBuffer_[m_CEBufferSize_ ++] =
1881                     collator.m_expansion_[++ offset];
1882             }
1883         }
1884         // in case of one element expansion, we
1885         // want to immediately return CEpos
1886         if (m_CEBufferSize_ == 1) {
1887             m_CEBufferSize_ = 0;
1888             m_CEBufferOffset_ = 0;
1889         }
1890         return m_CEBuffer_[0];
1891     }
1892
1893     /**
1894      * Gets the next digit ce
1895      * @param collator current collator
1896      * @param ce current collation element
1897      * @param cp current codepoint
1898      * @return next digit ce
1899      */
1900     private int nextDigit(RuleBasedCollator collator, int ce, int cp)
1901     {
1902         // We do a check to see if we want to collate digits as numbers;
1903         // if so we generate a custom collation key. Otherwise we pull out
1904         // the value stored in the expansion table.
1905
1906         if (m_collator_.m_isNumericCollation_){
1907             int collateVal = 0;
1908             int trailingZeroIndex = 0;
1909             boolean nonZeroValReached = false;
1910
1911             // I just need a temporary place to store my generated CEs.
1912             // icu4c uses a unsigned byte array, i'll use a stringbuffer here
1913             // to avoid dealing with the sign problems and array allocation
1914             // clear and set initial string buffer length
1915             m_utilStringBuffer_.setLength(3);
1916
1917             // We parse the source string until we hit a char that's NOT a
1918             // digit.
1919             // Use this u_charDigitValue. This might be slow because we have
1920             // to handle surrogates...
1921             int digVal = UCharacter.digit(cp);
1922             // if we have arrived here, we have already processed possible
1923             // supplementaries that trigered the digit tag -
1924             // all supplementaries are marked in the UCA.
1925             // We  pad a zero in front of the first element anyways.
1926             // This takes care of the (probably) most common case where
1927             // people are sorting things followed by a single digit
1928             int digIndx = 1;
1929             for (;;) {
1930                 // Make sure we have enough space.
1931                 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
1932                     m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
1933                                                   << 1);
1934                 }
1935                 // Skipping over leading zeroes.
1936                 if (digVal != 0 || nonZeroValReached) {
1937                     if (digVal != 0 && !nonZeroValReached) {
1938                         nonZeroValReached = true;
1939                     }
1940                     // We parse the digit string into base 100 numbers
1941                     // (this fits into a byte).
1942                     // We only add to the buffer in twos, thus if we are
1943                     // parsing an odd character, that serves as the
1944                     // 'tens' digit while the if we are parsing an even
1945                     // one, that is the 'ones' digit. We dumped the
1946                     // parsed base 100 value (collateVal) into a buffer.
1947                     // We multiply each collateVal by 2 (to give us room)
1948                     // and add 5 (to avoid overlapping magic CE byte
1949                     // values). The last byte we subtract 1 to ensure it is
1950                     // less than all the other bytes.
1951                     if (digIndx % 2 != 0) {
1952                         collateVal += digVal;
1953                         // This removes trailing zeroes.
1954                         if (collateVal == 0 && trailingZeroIndex == 0) {
1955                             trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
1956                         }
1957                         else if (trailingZeroIndex != 0) {
1958                             trailingZeroIndex = 0;
1959                         }
1960                         m_utilStringBuffer_.setCharAt(
1961                                             ((digIndx - 1) >>> 1) + 2,
1962                                             (char)((collateVal << 1) + 6));
1963                         collateVal = 0;
1964                     }
1965                     else {
1966                         // We drop the collation value into the buffer so if
1967                         // we need to do a "front patch" we don't have to
1968                         // check to see if we're hitting the last element.
1969
1970                         collateVal = digVal * 10;
1971                         if (collateVal == 0) {
1972                             if (trailingZeroIndex != 0) {
1973                                 trailingZeroIndex = (digIndx >>> 1) + 2;
1974                             }
1975                         } else {
1976                             trailingZeroIndex = 0;
1977                         }
1978
1979                         m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
1980                                                 (char)((collateVal << 1) + 6));
1981                     }
1982                     digIndx ++;
1983                 }
1984
1985                 // Get next character.
1986                 if (!isEnd()){
1987                     backupInternalState(m_utilSpecialBackUp_);
1988                     int char32 = nextChar();
1989                     char ch = (char)char32;
1990                     if (UTF16.isLeadSurrogate(ch)){
1991                         if (!isEnd()) {
1992                             char trail = (char)nextChar();
1993                             if (UTF16.isTrailSurrogate(trail)) {
1994                                char32 = UCharacterProperty.getRawSupplementary(
1995                                                                    ch, trail);
1996                             }
1997                             else {
1998                                 goBackOne();
1999                             }
2000                         }
2001                     }
2002
2003                     digVal = UCharacter.digit(char32);
2004                     if (digVal == -1) {
2005                         // Resetting position to point to the next unprocessed
2006                         // char. We overshot it when doing our test/set for
2007                         // numbers.
2008                         updateInternalState(m_utilSpecialBackUp_);
2009                         break;
2010                     }
2011                 }
2012                 else {
2013                     break;
2014                 }
2015             }
2016
2017             if (nonZeroValReached == false){
2018                 digIndx = 2;
2019                 m_utilStringBuffer_.setCharAt(2, (char)6);
2020             }
2021
2022             int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
2023                                              : (digIndx >>> 1) + 2;
2024
2025             if (digIndx % 2 != 0){
2026                 // We missed a value. Since digIndx isn't even, stuck too many
2027                 // values into the buffer (this is what we get for padding the
2028                 // first byte with a zero). "Front-patch" now by pushing all
2029                 // nybbles forward.
2030                 // Doing it this way ensures that at least 50% of the time
2031                 // (statistically speaking) we'll only be doing a single pass
2032                 // and optimizes for strings with single digits. I'm just
2033                 // assuming that's the more common case.
2034                 for (int i = 2; i < endIndex; i ++){
2035                     m_utilStringBuffer_.setCharAt(i,
2036                         (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
2037                                   % 10) * 10)
2038                                  + (((m_utilStringBuffer_.charAt(i + 1) - 6)
2039                                       >>> 1) / 10) << 1) + 6));
2040                 }
2041                 -- digIndx;
2042             }
2043
2044             // Subtract one off of the last byte.
2045             m_utilStringBuffer_.setCharAt(endIndex - 1,
2046                          (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
2047
2048             // We want to skip over the first two slots in the buffer.
2049             // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2050             // The second slot is for the sign/exponent byte:
2051             // 0x80 + (decimalPos/2) & 7f.
2052             m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2053             m_utilStringBuffer_.setCharAt(1,
2054                                      (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2055
2056             // Now transfer the collation key to our collIterate struct.
2057             // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
2058             ce = (((m_utilStringBuffer_.charAt(0) << 8)
2059                        // Primary weight
2060                        | m_utilStringBuffer_.charAt(1))
2061                                     << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2062                        //  Secondary weight
2063                        | (RuleBasedCollator.BYTE_COMMON_
2064                           << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2065                        | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
2066             int i = 2; // Reset the index into the buffer.
2067
2068             m_CEBuffer_[0] = ce;
2069             m_CEBufferSize_ = 1;
2070             m_CEBufferOffset_ = 1;
2071             while (i < endIndex)
2072             {
2073                 int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
2074                 if (i < endIndex) {
2075                     primWeight |= m_utilStringBuffer_.charAt(i ++);
2076                 }
2077                 m_CEBuffer_[m_CEBufferSize_ ++]
2078                     = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2079                       | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2080             }
2081             return ce;
2082         }
2083
2084         // no numeric mode, we'll just switch to whatever we stashed and
2085         // continue
2086         // find the offset to expansion table
2087         return collator.m_expansion_[getExpansionOffset(collator, ce)];
2088     }
2089
2090     /**
2091      * Gets the next implicit ce for codepoints
2092      * @param codepoint current codepoint
2093      * @return implicit ce
2094      */
2095     private int nextImplicit(int codepoint)
2096     {
2097         int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2098         m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
2099                          | 0x00000505;
2100         m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2101         m_CEBufferOffset_ = 1;
2102         m_CEBufferSize_ = 2;
2103         return m_CEBuffer_[0];
2104     }
2105
2106     /**
2107      * Returns the next ce associated with the following surrogate characters
2108      * @param ch current character
2109      * @return ce
2110      */
2111     private int nextSurrogate(char ch)
2112     {
2113         int ch_int = nextChar();
2114         char nextch = (char)ch_int;
2115         if (ch_int != CharacterIterator.DONE &&
2116             UTF16.isTrailSurrogate(nextch)) {
2117             int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
2118             return nextImplicit(codepoint);
2119         }
2120         if (nextch != CharacterIterator.DONE) {
2121             previousChar(); // reverts back to the original position
2122         }
2123         return CE_NOT_FOUND_; // treat like unassigned
2124     }
2125
2126     /**
2127      * Returns the next ce for a hangul character, this is an implicit
2128      * calculation
2129      * @param collator current collator
2130      * @param ch current character
2131      * @return hangul ce
2132      */
2133     private int nextHangul(RuleBasedCollator collator, char ch)
2134     {
2135         char L = (char)(ch - HANGUL_SBASE_);
2136
2137         // divide into pieces
2138         // do it in this order since some compilers can do % and / in one
2139         // operation
2140         char T = (char)(L % HANGUL_TCOUNT_);
2141         L /= HANGUL_TCOUNT_;
2142         char V = (char)(L % HANGUL_VCOUNT_);
2143         L /= HANGUL_VCOUNT_;
2144
2145         // offset them
2146         L += HANGUL_LBASE_;
2147         V += HANGUL_VBASE_;
2148         T += HANGUL_TBASE_;
2149
2150         // return the first CE, but first put the rest into the expansion
2151         // buffer
2152         m_CEBufferSize_ = 0;
2153         if (!m_collator_.m_isJamoSpecial_) { // FAST PATH
2154             m_CEBuffer_[m_CEBufferSize_ ++] =
2155                 collator.m_trie_.getLeadValue(L);
2156             m_CEBuffer_[m_CEBufferSize_ ++] =
2157                 collator.m_trie_.getLeadValue(V);
2158
2159             if (T != HANGUL_TBASE_) {
2160                 m_CEBuffer_[m_CEBufferSize_ ++] =
2161                     collator.m_trie_.getLeadValue(T);
2162             }
2163             m_CEBufferOffset_ = 1;
2164             return m_CEBuffer_[0];
2165         }
2166         else {
2167             // Jamo is Special
2168             // Since Hanguls pass the FCD check, it is guaranteed that we
2169             // won't be in the normalization buffer if something like this
2170             // happens
2171             // Move Jamos into normalization buffer
2172             m_buffer_.append(L);
2173             m_buffer_.append(V);
2174             if (T != HANGUL_TBASE_) {
2175                 m_buffer_.append(T);
2176             }
2177             m_bufferOffset_ = 0;
2178             m_FCDLimit_ = m_source_.getIndex();
2179             m_FCDStart_ = m_FCDLimit_ - 1;
2180             // Indicate where to continue in main input string after
2181             // exhausting the buffer
2182             return IGNORABLE;
2183         }
2184     }
2185
2186     /**
2187      * <p>Special CE management. Expansions, contractions etc...</p>
2188      * @param collator can be plain UCA
2189      * @param ce current ce
2190      * @param ch current character
2191      * @return next special ce
2192      */
2193     private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
2194     {
2195         int codepoint = ch;
2196         Backup entrybackup = m_utilSpecialEntryBackUp_;
2197         // this is to handle recursive looping
2198         if (entrybackup != null) {
2199             m_utilSpecialEntryBackUp_ = null;
2200         }
2201         else {
2202             entrybackup = new Backup();
2203         }
2204         backupInternalState(entrybackup);
2205         try { // forces it to assign m_utilSpecialEntryBackup_
2206             while (true) {
2207                 // This loop will repeat only in the case of contractions,
2208                 // surrogate
2209                 switch(RuleBasedCollator.getTag(ce)) {
2210                 case CE_NOT_FOUND_TAG_:
2211                     // impossible case for icu4j
2212                     return ce;
2213                 case RuleBasedCollator.CE_SURROGATE_TAG_:
2214                     if (isEnd()) {
2215                         return CE_NOT_FOUND_;
2216                     }
2217                     backupInternalState(m_utilSpecialBackUp_);
2218                     char trail = (char)nextChar();
2219                     ce = nextSurrogate(collator, ce, trail);
2220                     // calculate the supplementary code point value,
2221                     // if surrogate was not tailored we go one more round
2222                     codepoint =
2223                         UCharacterProperty.getRawSupplementary(ch, trail);
2224                     break;
2225                 case CE_SPEC_PROC_TAG_:
2226                     ce = nextSpecialPrefix(collator, ce, entrybackup);
2227                     break;
2228                 case CE_CONTRACTION_TAG_:
2229                     ce = nextContraction(collator, ce);
2230                     break;
2231                 case CE_LONG_PRIMARY_TAG_:
2232                     return nextLongPrimary(ce);
2233                 case CE_EXPANSION_TAG_:
2234                     return nextExpansion(collator, ce);
2235                 case CE_DIGIT_TAG_:
2236                     ce = nextDigit(collator, ce, codepoint);
2237                     break;
2238                     // various implicits optimization
2239                 case CE_CJK_IMPLICIT_TAG_:
2240                     // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2241                     return nextImplicit(codepoint);
2242                 case CE_IMPLICIT_TAG_: // everything that is not defined
2243                     return nextImplicit(codepoint);
2244                 case CE_TRAIL_SURROGATE_TAG_:
2245                     return CE_NOT_FOUND_; // DC00-DFFF broken surrogate, treat like unassigned
2246                 case CE_LEAD_SURROGATE_TAG_:  // D800-DBFF
2247                     return nextSurrogate(ch);
2248                 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2249                     return nextHangul(collator, ch);
2250                 case CE_CHARSET_TAG_:
2251                                     // not yet implemented probably after 1.8
2252                     return CE_NOT_FOUND_;
2253                 default:
2254                     ce = IGNORABLE;
2255                     // synwee todo, throw exception or something here.
2256                 }
2257                 if (!RuleBasedCollator.isSpecial(ce)) {
2258                     break;
2259                 }
2260             }
2261         }
2262         finally {
2263             m_utilSpecialEntryBackUp_ = entrybackup;
2264         }
2265         return ce;
2266     }
2267
2268     /**
2269      * Special processing is getting a CE that is preceded by a certain prefix.
2270      * Currently this is only needed for optimizing Japanese length and
2271      * iteration marks. When we encouter a special processing tag, we go
2272      * backwards and try to see if we have a match. Contraction tables are used
2273      * - so the whole process is not unlike contraction. prefix data is stored
2274      * backwards in the table.
2275      * @param collator current collator
2276      * @param ce current ce
2277      * @return previous ce
2278      */
2279     private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
2280     {
2281         backupInternalState(m_utilSpecialBackUp_);
2282         while (true) {
2283             // position ourselves at the begining of contraction sequence
2284             int offset = getContractionOffset(collator, ce);
2285             int entryoffset = offset;
2286             if (isBackwardsStart()) {
2287                 ce = collator.m_contractionCE_[offset];
2288                 break;
2289             }
2290             char prevch = (char)previousChar();
2291             while (prevch > collator.m_contractionIndex_[offset]) {
2292                 // since contraction codepoints are ordered, we skip all that
2293                 // are smaller
2294                 offset ++;
2295             }
2296             if (prevch == collator.m_contractionIndex_[offset]) {
2297                 ce = collator.m_contractionCE_[offset];
2298             }
2299             else {
2300                 // if there is a completely ignorable code point in the middle
2301                 // of a prefix, we need to act as if it's not there assumption:
2302                 // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
2303                 // zero)
2304                 // lone surrogates cannot be set to zero as it would break
2305                 // other processing
2306                 int isZeroCE = collator.m_trie_.getLeadValue(prevch);
2307                 // it's easy for BMP code points
2308                 if (isZeroCE == 0) {
2309                     continue;
2310                 }
2311                 else if (UTF16.isTrailSurrogate(prevch)
2312                          || UTF16.isLeadSurrogate(prevch)) {
2313                     // for supplementary code points, we have to check the next one
2314                     // situations where we are going to ignore
2315                     // 1. beginning of the string: schar is a lone surrogate
2316                     // 2. schar is a lone surrogate
2317                     // 3. schar is a trail surrogate in a valid surrogate
2318                     //    sequence that is explicitly set to zero.
2319                     if (!isBackwardsStart()) {
2320                         char lead = (char)previousChar();
2321                         if (UTF16.isLeadSurrogate(lead)) {
2322                             isZeroCE = collator.m_trie_.getLeadValue(lead);
2323                             if (RuleBasedCollator.getTag(isZeroCE)
2324                                 == RuleBasedCollator.CE_SURROGATE_TAG_) {
2325                                 int finalCE = collator.m_trie_.getTrailValue(
2326                                                                       isZeroCE,
2327                                                                       prevch);
2328                                 if (finalCE == 0) {
2329                                     // this is a real, assigned completely
2330                                     // ignorable code point
2331                                     continue;
2332                                 }
2333                             }
2334                         }
2335                         else {
2336                             nextChar(); // revert to original offset
2337                             // lone surrogate, completely ignorable
2338                             continue;
2339                         }
2340                         nextChar(); // revert to original offset
2341                     }
2342                     else {
2343                          // lone surrogate at the beggining, completely ignorable
2344                          continue;
2345                     }
2346                 }
2347
2348                 // char was not in the table. prefix not found
2349                 ce = collator.m_contractionCE_[entryoffset];
2350             }
2351
2352             if (!isSpecialPrefixTag(ce)) {
2353                 // char was in the contraction table, and the corresponding ce
2354                 // is not a prefix ce.  We found the prefix, break out of loop,
2355                 // this ce will end up being returned.
2356                 break;
2357             }
2358         }
2359         updateInternalState(m_utilSpecialBackUp_);
2360         return ce;
2361     }
2362
2363     /**
2364      * Retrieves the previous contraction ce. To ensure that the backwards and
2365      * forwards iteration matches, we take the current region of most possible
2366      * match and pass it through the forward iteration. This will ensure that
2367      * the obstinate problem of overlapping contractions will not occur.
2368      * @param collator current collator
2369      * @param ce current ce
2370      * @param ch current character
2371      * @return previous contraction ce
2372      */
2373     private int previousContraction(RuleBasedCollator collator, int ce, char ch)
2374     {
2375         m_utilStringBuffer_.setLength(0);
2376         // since we might encounter normalized characters (from the thai
2377         // processing) we can't use peekCharacter() here.
2378         char prevch = (char)previousChar();
2379         boolean atStart = false;
2380         // TODO: address the comment above - maybe now we *can* use peekCharacter
2381         //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
2382         while (collator.isUnsafe(ch)) {
2383             m_utilStringBuffer_.insert(0, ch);
2384             ch = prevch;
2385             if (isBackwardsStart()) {
2386                 atStart = true;
2387                 break;
2388             }
2389             prevch = (char)previousChar();
2390         }
2391         if (!atStart) {
2392             // undo the previousChar() if we didn't reach the beginning
2393             nextChar();
2394         }
2395         // adds the initial base character to the string
2396         m_utilStringBuffer_.insert(0, ch);
2397
2398         // a new collation element iterator is used to simply things, since
2399         // using the current collation element iterator will mean that the
2400         // forward and backwards iteration will share and change the same
2401         // buffers. it is going to be painful.
2402         int originaldecomp = collator.getDecomposition();
2403         // for faster access, since string would have been normalized above
2404         collator.setDecomposition(Collator.NO_DECOMPOSITION);
2405         if (m_utilColEIter_ == null) {
2406             m_utilColEIter_ = new CollationElementIterator(
2407                                                 m_utilStringBuffer_.toString(),
2408                                                 collator);
2409         }
2410         else {
2411             m_utilColEIter_.m_collator_ = collator;
2412             m_utilColEIter_.setText(m_utilStringBuffer_.toString());
2413         }
2414         ce = m_utilColEIter_.next();
2415         m_CEBufferSize_ = 0;
2416         while (ce != NULLORDER) {
2417             if (m_CEBufferSize_ == m_CEBuffer_.length) {
2418                 try {
2419                     // increasing cebuffer size
2420                     int tempbuffer[] = new int[m_CEBuffer_.length + 50];
2421                     System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
2422                                      m_CEBuffer_.length);
2423                     m_CEBuffer_ = tempbuffer;
2424                 }
2425                 catch( MissingResourceException e)
2426                 {
2427                     throw e;
2428                 }
2429                 catch (Exception e) {
2430                     if(DEBUG){
2431                         e.printStackTrace();
2432                     }
2433                     return NULLORDER;
2434                 }
2435             }
2436             m_CEBuffer_[m_CEBufferSize_ ++] = ce;
2437             ce = m_utilColEIter_.next();
2438         }
2439         collator.setDecomposition(originaldecomp);
2440         m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2441         return m_CEBuffer_[m_CEBufferOffset_];
2442     }
2443
2444     /**
2445      * Returns the previous long primary ces
2446      * @param ce long primary ce
2447      * @return previous long primary ces
2448      */
2449     private int previousLongPrimary(int ce)
2450     {
2451         m_CEBufferSize_ = 0;
2452         m_CEBuffer_[m_CEBufferSize_ ++] =
2453             ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
2454         m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
2455             | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2456         m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2457         return m_CEBuffer_[m_CEBufferOffset_];
2458     }
2459
2460     /**
2461      * Returns the previous expansion ces
2462      * @param collator current collator
2463      * @param ce current ce
2464      * @return previous expansion ce
2465      */
2466     private int previousExpansion(RuleBasedCollator collator, int ce)
2467     {
2468         // find the offset to expansion table
2469         int offset = getExpansionOffset(collator, ce);
2470         m_CEBufferSize_ = getExpansionCount(ce);
2471         if (m_CEBufferSize_ != 0) {
2472             // less than 16 elements in expansion
2473             for (int i = 0; i < m_CEBufferSize_; i ++) {
2474                 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
2475             }
2476
2477         }
2478         else {
2479             // null terminated ces
2480             while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
2481                 m_CEBuffer_[m_CEBufferSize_] =
2482                     collator.m_expansion_[offset + m_CEBufferSize_];
2483                 m_CEBufferSize_ ++;
2484             }
2485         }
2486         m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2487         return m_CEBuffer_[m_CEBufferOffset_];
2488     }
2489
2490     /**
2491      * Getting the digit collation elements
2492      * @param collator
2493      * @param ce current collation element
2494      * @param ch current code point
2495      * @return digit collation element
2496      */
2497     private int previousDigit(RuleBasedCollator collator, int ce, char ch)
2498     {
2499         // We do a check to see if we want to collate digits as numbers; if so we generate
2500         //  a custom collation key. Otherwise we pull out the value stored in the expansion table.
2501         if (m_collator_.m_isNumericCollation_){
2502             int leadingZeroIndex = 0;
2503             int collateVal = 0;
2504             boolean nonZeroValReached = false;
2505
2506             // clear and set initial string buffer length
2507             m_utilStringBuffer_.setLength(3);
2508
2509             // We parse the source string until we hit a char that's NOT a digit
2510             // Use this u_charDigitValue. This might be slow because we have to
2511             // handle surrogates...
2512             int char32 = ch;
2513             if (UTF16.isTrailSurrogate(ch)) {
2514                 if (!isBackwardsStart()){
2515                     char lead = (char)previousChar();
2516                     if (UTF16.isLeadSurrogate(lead)) {
2517                         char32 = UCharacterProperty.getRawSupplementary(lead,
2518                                                                         ch);
2519                     }
2520                     else {
2521                         goForwardOne();
2522                     }
2523                 }
2524             }
2525             int digVal = UCharacter.digit(char32);
2526             int digIndx = 0;
2527             for (;;) {
2528                 // Make sure we have enough space.
2529                 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
2530                     m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
2531                                                   << 1);
2532                 }
2533                 // Skipping over "trailing" zeroes but we still add to digIndx.
2534                 if (digVal != 0 || nonZeroValReached) {
2535                     if (digVal != 0 && !nonZeroValReached) {
2536                         nonZeroValReached = true;
2537                     }
2538
2539                     // We parse the digit string into base 100 numbers (this
2540                     // fits into a byte).
2541                     // We only add to the buffer in twos, thus if we are
2542                     // parsing an odd character, that serves as the 'tens'
2543                     // digit while the if we are parsing an even one, that is
2544                     // the 'ones' digit. We dumped the parsed base 100 value
2545                     // (collateVal) into a buffer. We multiply each collateVal
2546                     // by 2 (to give us room) and add 5 (to avoid overlapping
2547                     // magic CE byte values). The last byte we subtract 1 to
2548                     // ensure it is less than all the other bytes.
2549                     // Since we're doing in this reverse we want to put the
2550                     // first digit encountered into the ones place and the
2551                     // second digit encountered into the tens place.
2552
2553                     if (digIndx % 2 != 0){
2554                         collateVal += digVal * 10;
2555
2556                         // This removes leading zeroes.
2557                         if (collateVal == 0 && leadingZeroIndex == 0) {
2558                            leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2559                         }
2560                         else if (leadingZeroIndex != 0) {
2561                             leadingZeroIndex = 0;
2562                         }
2563
2564                         m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
2565                                                 (char)((collateVal << 1) + 6));
2566                         collateVal = 0;
2567                     }
2568                     else {
2569                         collateVal = digVal;
2570                     }
2571                 }
2572                 digIndx ++;
2573
2574                 if (!isBackwardsStart()){
2575                     backupInternalState(m_utilSpecialBackUp_);
2576                     char32 = previousChar();
2577                     if (UTF16.isTrailSurrogate(ch)){
2578                         if (!isBackwardsStart()) {
2579                             char lead = (char)previousChar();
2580                             if (UTF16.isLeadSurrogate(lead)) {
2581                                 char32
2582                                     = UCharacterProperty.getRawSupplementary(
2583                                                                     lead, ch);
2584                             }
2585                             else {
2586                                 updateInternalState(m_utilSpecialBackUp_);
2587                             }
2588                         }
2589                     }
2590
2591                     digVal = UCharacter.digit(char32);
2592                     if (digVal == -1) {
2593                         updateInternalState(m_utilSpecialBackUp_);
2594                         break;
2595                     }
2596                 }
2597                 else {
2598                     break;
2599                 }
2600             }
2601
2602             if (nonZeroValReached == false) {
2603                 digIndx = 2;
2604                 m_utilStringBuffer_.setCharAt(2, (char)6);
2605             }
2606
2607             if (digIndx % 2 != 0) {
2608                 if (collateVal == 0 && leadingZeroIndex == 0) {
2609                     // This removes the leading 0 in a odd number sequence of
2610                     // numbers e.g. avery001
2611                     leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2612                 }
2613                 else {
2614                     // this is not a leading 0, we add it in
2615                     m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
2616                                                 (char)((collateVal << 1) + 6));
2617                     digIndx ++;
2618                 }
2619             }
2620
2621             int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
2622                                                : ((digIndx >>> 1) + 2) ;
2623             digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
2624             // Subtract one off of the last byte.
2625             // Really the first byte here, but it's reversed...
2626             m_utilStringBuffer_.setCharAt(2,
2627                                     (char)(m_utilStringBuffer_.charAt(2) - 1));
2628             // We want to skip over the first two slots in the buffer.
2629             // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2630             // The second slot is for the sign/exponent byte:
2631             // 0x80 + (decimalPos/2) & 7f.
2632             m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2633             m_utilStringBuffer_.setCharAt(1,
2634                                     (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2635
2636             // Now transfer the collation key to our collIterate struct.
2637             // The total size for our collation key is endIndx bumped up to the
2638             // next largest even value divided by two.
2639             m_CEBufferSize_ = 0;
2640             m_CEBuffer_[m_CEBufferSize_ ++]
2641                         = (((m_utilStringBuffer_.charAt(0) << 8)
2642                             // Primary weight
2643                             | m_utilStringBuffer_.charAt(1))
2644                               << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2645                             // Secondary weight
2646                             | (RuleBasedCollator.BYTE_COMMON_
2647                                << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2648                             // Tertiary weight.
2649                             | RuleBasedCollator.BYTE_COMMON_;
2650              int i = endIndex - 1; // Reset the index into the buffer.
2651              while (i >= 2) {
2652                 int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
2653                 if (i >= 2) {
2654                     primWeight |= m_utilStringBuffer_.charAt(i --);
2655                 }
2656                 m_CEBuffer_[m_CEBufferSize_ ++]
2657                     = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2658                       | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2659              }
2660              m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2661              return m_CEBuffer_[m_CEBufferOffset_];
2662          }
2663          else {
2664              return collator.m_expansion_[getExpansionOffset(collator, ce)];
2665          }
2666     }
2667
2668     /**
2669      * Returns previous hangul ces
2670      * @param collator current collator
2671      * @param ch current character
2672      * @return previous hangul ce
2673      */
2674     private int previousHangul(RuleBasedCollator collator, char ch)
2675     {
2676         char L = (char)(ch - HANGUL_SBASE_);
2677         // we do it in this order since some compilers can do % and / in one
2678         // operation
2679         char T = (char)(L % HANGUL_TCOUNT_);
2680         L /= HANGUL_TCOUNT_;
2681         char V = (char)(L % HANGUL_VCOUNT_);
2682         L /= HANGUL_VCOUNT_;
2683
2684         // offset them
2685         L += HANGUL_LBASE_;
2686         V += HANGUL_VBASE_;
2687         T += HANGUL_TBASE_;
2688
2689         m_CEBufferSize_ = 0;
2690         if (!m_collator_.m_isJamoSpecial_) {
2691             m_CEBuffer_[m_CEBufferSize_ ++] =
2692                 collator.m_trie_.getLeadValue(L);
2693             m_CEBuffer_[m_CEBufferSize_ ++] =
2694                 collator.m_trie_.getLeadValue(V);
2695             if (T != HANGUL_TBASE_) {
2696                 m_CEBuffer_[m_CEBufferSize_ ++] =
2697                     collator.m_trie_.getLeadValue(T);
2698             }
2699             m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2700             return m_CEBuffer_[m_CEBufferOffset_];
2701         }
2702         else {
2703             // Since Hanguls pass the FCD check, it is guaranteed that we won't
2704             // be in the normalization buffer if something like this happens
2705             // Move Jamos into normalization buffer
2706             m_buffer_.append(L);
2707             m_buffer_.append(V);
2708             if (T != HANGUL_TBASE_) {
2709                 m_buffer_.append(T);
2710             }
2711             m_bufferOffset_ = m_buffer_.length();
2712             m_FCDStart_ = m_source_.getIndex();
2713             m_FCDLimit_ = m_FCDStart_ + 1;
2714             return IGNORABLE;
2715         }
2716     }
2717
2718     /**
2719      * Gets implicit codepoint ces
2720      * @param codepoint current codepoint
2721      * @return implicit codepoint ces
2722      */
2723     private int previousImplicit(int codepoint)
2724     {
2725         int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2726         m_CEBufferSize_ = 2;
2727         m_CEBufferOffset_ = 1;
2728         m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
2729                          | 0x00000505;
2730         m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2731         return m_CEBuffer_[1];
2732     }
2733
2734     /**
2735      * Gets the previous surrogate ce
2736      * @param ch current character
2737      * @return previous surrogate ce
2738      */
2739     private int previousSurrogate(char ch)
2740     {
2741         if (isBackwardsStart()) {
2742             // we are at the start of the string, wrong place to be at
2743             return CE_NOT_FOUND_;
2744         }
2745         char prevch = (char)previousChar();
2746         // Handles Han and Supplementary characters here.
2747         if (UTF16.isLeadSurrogate(prevch)) {
2748             return previousImplicit(
2749                           UCharacterProperty.getRawSupplementary(prevch, ch));
2750         }
2751         if (prevch != CharacterIterator.DONE) {
2752             nextChar();
2753         }
2754         return CE_NOT_FOUND_; // treat like unassigned
2755     }
2756
2757     /**
2758      * <p>Special CE management. Expansions, contractions etc...</p>
2759      * @param collator can be plain UCA
2760      * @param ce current ce
2761      * @param ch current character
2762      * @return previous special ce
2763      */
2764     private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
2765     {
2766         while(true) {
2767             // the only ces that loops are thai, special prefix and
2768             // contractions
2769             switch (RuleBasedCollator.getTag(ce)) {
2770             case CE_NOT_FOUND_TAG_:  // this tag always returns
2771                 return ce;
2772             case RuleBasedCollator.CE_SURROGATE_TAG_:  // unpaired lead surrogate
2773                 return CE_NOT_FOUND_;
2774             case CE_SPEC_PROC_TAG_:
2775                 ce = previousSpecialPrefix(collator, ce);
2776                 break;
2777             case CE_CONTRACTION_TAG_:
2778                 // may loop for first character e.g. "0x0f71" for english
2779                 if (isBackwardsStart()) {
2780                     // start of string or this is not the end of any contraction
2781                     ce = collator.m_contractionCE_[
2782                                             getContractionOffset(collator, ce)];
2783                     break;
2784                 }
2785                 return previousContraction(collator, ce, ch); // else
2786             case CE_LONG_PRIMARY_TAG_:
2787                 return previousLongPrimary(ce);
2788             case CE_EXPANSION_TAG_: // always returns
2789                 return previousExpansion(collator, ce);
2790             case CE_DIGIT_TAG_:
2791                 ce = previousDigit(collator, ce, ch);
2792                 break;
2793             case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2794                 return previousHangul(collator, ch);
2795             case CE_LEAD_SURROGATE_TAG_:  // D800-DBFF
2796                 return CE_NOT_FOUND_; // broken surrogate sequence, treat like unassigned
2797             case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
2798                 return previousSurrogate(ch);
2799             case CE_CJK_IMPLICIT_TAG_:
2800                 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2801                 return previousImplicit(ch);
2802             case CE_IMPLICIT_TAG_: // everything that is not defined
2803                 // UCA is filled with these. Tailorings are NOT_FOUND
2804                 return previousImplicit(ch);
2805             case CE_CHARSET_TAG_: // this tag always returns
2806                 return CE_NOT_FOUND_;
2807             default: // this tag always returns
2808                 ce = IGNORABLE;
2809             }
2810             if (!RuleBasedCollator.isSpecial(ce)) {
2811                 break;
2812             }
2813         }
2814         return ce;
2815     }
2816
2817 //    /**
2818 //     * Gets a character from the source string at a given offset.
2819 //     * Handles both normal and iterative cases.
2820 //     * No error checking and does not access the normalization buffer
2821 //     * - caller beware!
2822 //     * @param offset offset from current position which character is to be
2823 //     *               retrieved
2824 //     * @return character at current position + offset
2825 //     */
2826 //    private char peekCharacter(int offset)
2827 //    {
2828 //        if (offset != 0) {
2829 //            int currentoffset = m_source_.getIndex();
2830 //            m_source_.setIndex(currentoffset + offset);
2831 //            char result = (char)m_source_.current();
2832 //            m_source_.setIndex(currentoffset);
2833 //            return result;
2834 //        }
2835 //        else {
2836 //            return (char)m_source_.current();
2837 //        }
2838 //    }
2839
2840     /**
2841      * Moves back 1 position in the source string. This is slightly less
2842      * complicated than previousChar in that it doesn't normalize while
2843      * moving back. Boundary checks are not performed.
2844      * This method is to be used with caution, with the assumption that
2845      * moving back one position will not exceed the source limits.
2846      * Use only with nextChar() and never call this API twice in a row without
2847      * nextChar() in the middle.
2848      */
2849     private void goBackOne()
2850     {
2851         if (m_bufferOffset_ >= 0) {
2852             m_bufferOffset_ --;
2853         }
2854         else {
2855             m_source_.setIndex(m_source_.getIndex() - 1);
2856         }
2857     }
2858
2859     /**
2860      * Moves forward 1 position in the source string. This is slightly less
2861      * complicated than nextChar in that it doesn't normalize while
2862      * moving back. Boundary checks are not performed.
2863      * This method is to be used with caution, with the assumption that
2864      * moving back one position will not exceed the source limits.
2865      * Use only with previousChar() and never call this API twice in a row
2866      * without previousChar() in the middle.
2867      */
2868     private void goForwardOne()
2869     {
2870         if (m_bufferOffset_ < 0) {
2871             // we're working on the source and not normalizing. fast path.
2872             // note Thai pre-vowel reordering uses buffer too
2873             m_source_.setIndex(m_source_.getIndex() + 1);
2874         }
2875         else {
2876             // we are in the buffer, buffer offset will never be 0 here
2877             m_bufferOffset_ ++;
2878         }
2879     }
2880 }