jars/icu4j-52_1/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java

   1 /**
   2  *******************************************************************************
   3  * Copyright (C) 1996-2013, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8
   9 import java.io.DataInputStream;
  10 import java.io.IOException;
  11 import java.nio.ByteBuffer;
  12 import java.text.CharacterIterator;
  13 import java.text.ParseException;
  14 import java.util.Arrays;
  15 import java.util.HashMap;
  16 import java.util.HashSet;
  17 import java.util.Map;
  18 import java.util.MissingResourceException;
  19 import java.util.Set;
  20 import java.util.concurrent.locks.Lock;
  21 import java.util.concurrent.locks.ReentrantLock;
  22
  23 import com.ibm.icu.impl.BOCU;
  24 import com.ibm.icu.impl.ICUDebug;
  25 import com.ibm.icu.impl.ICUResourceBundle;
  26 import com.ibm.icu.impl.ImplicitCEGenerator;
  27 import com.ibm.icu.impl.IntTrie;
  28 import com.ibm.icu.impl.StringUCharacterIterator;
  29 import com.ibm.icu.impl.Trie;
  30 import com.ibm.icu.impl.TrieIterator;
  31 import com.ibm.icu.impl.Utility;
  32 import com.ibm.icu.lang.UCharacter;
  33 import com.ibm.icu.lang.UScript;
  34 import com.ibm.icu.util.Output;
  35 import com.ibm.icu.util.RangeValueIterator;
  36 import com.ibm.icu.util.ULocale;
  37 import com.ibm.icu.util.UResourceBundle;
  38 import com.ibm.icu.util.VersionInfo;
  39
  40 /**
  41  * <p>
  42  * RuleBasedCollator is a concrete subclass of Collator. It allows customization of the Collator via user-specified rule
  43  * sets. RuleBasedCollator is designed to be fully compliant to the <a
  44  * href="http://www.unicode.org/unicode/reports/tr10/">Unicode Collation Algorithm (UCA)</a> and conforms to ISO 14651.
  45  * </p>
  46  *
  47  * <p>
  48  * Users are strongly encouraged to read <a href="http://www.icu-project.org/userguide/Collate_Intro.html"> the users
  49  * guide</a> for more information about the collation service before using this class.
  50  * </p>
  51  *
  52  * <p>
  53  * Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class
  54  * Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the
  55  * argument locale. If a customized collation ordering ar attributes is required, use the RuleBasedCollator(String)
  56  * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on UCA, while
  57  * re-adjusting the attributes and orders of the characters in the specified rule accordingly.
  58  * </p>
  59  *
  60  * <p>
  61  * RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale
  62  * is not available, the orders eventually falls back to the <a href="http://www.unicode.org/unicode/reports/tr10/">UCA
  63  * collation order </a>.
  64  * </p>
  65  *
  66  * <p>
  67  * For information about the collation rule syntax and details about customization, please refer to the <a
  68  * href="http://www.icu-project.org/userguide/Collate_Customization.html"> Collation customization</a> section of the
  69  * user's guide.
  70  * </p>
  71  *
  72  * <p>
  73  * <strong>Note</strong> that there are some differences between the Collation rule syntax used in Java and ICU4J:
  74  *
  75  * <ul>
  76  * <li>According to the JDK documentation: <i>
  77  * <p>
  78  * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule is in force when a Thai vowel of the range
  79  * &#92;U0E40-&#92;U0E44 precedes a Thai consonant of the range &#92;U0E01-&#92;U0E2E OR a Lao vowel of the range
  80  * &#92;U0EC0-&#92;U0EC4 precedes a Lao consonant of the range &#92;U0E81-&#92;U0EAE then the vowel is placed after the
  81  * consonant for collation purposes.
  82  * </p>
  83  * <p>
  84  * If a rule is without the modifier '!', the Thai/Lao vowel-consonant swapping is not turned on.
  85  * </p>
  86  * </i>
  87  * <p>
  88  * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao vowel-consonant swapping, since the UCA clearly
  89  * states that it has to be supported to ensure a correct sorting order. If a '!' is encountered, it is ignored.
  90  * </p>
  91  * <li>As mentioned in the documentation of the base class Collator, compatibility decomposition mode is not supported.
  92  * </ul>
  93  * <p>
  94  * <strong>Examples</strong>
  95  * </p>
  96  * <p>
  97  * Creating Customized RuleBasedCollators: <blockquote>
  98  *
  99  * <pre>
 100  * String simple = "&amp; a &lt; b &lt; c &lt; d";
 101  * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
 102  *
 103  * String norwegian = "&amp; a , A &lt; b , B &lt; c , C &lt; d , D &lt; e , E "
 104  *                    + "&lt; f , F &lt; g , G &lt; h , H &lt; i , I &lt; j , "
 105  *                    + "J &lt; k , K &lt; l , L &lt; m , M &lt; n , N &lt; "
 106  *                    + "o , O &lt; p , P &lt; q , Q &lt r , R &lt s , S &lt; "
 107  *                    + "t , T &lt; u , U &lt; v , V &lt; w , W &lt; x , X "
 108  *                    + "&lt; y , Y &lt; z , Z &lt; &#92;u00E5 = a&#92;u030A "
 109  *                    + ", &#92;u00C5 = A&#92;u030A ; aa , AA &lt; &#92;u00E6 "
 110  *                    + ", &#92;u00C6 &lt; &#92;u00F8 , &#92;u00D8";
 111  * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
 112  * </pre>
 113  *
 114  * </blockquote>
 115  *
 116  * Concatenating rules to combine <code>Collator</code>s: <blockquote>
 117  *
 118  * <pre>
 119  * // Create an en_US Collator object
 120  * RuleBasedCollator en_USCollator = (RuleBasedCollator)
 121  *     Collator.getInstance(new Locale("en", "US", ""));
 122  * // Create a da_DK Collator object
 123  * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
 124  *     Collator.getInstance(new Locale("da", "DK", ""));
 125  * // Combine the two
 126  * // First, get the collation rules from en_USCollator
 127  * String en_USRules = en_USCollator.getRules();
 128  * // Second, get the collation rules from da_DKCollator
 129  * String da_DKRules = da_DKCollator.getRules();
 130  * RuleBasedCollator newCollator =
 131  *                             new RuleBasedCollator(en_USRules + da_DKRules);
 132  * // newCollator has the combined rules
 133  * </pre>
 134  *
 135  * </blockquote>
 136  *
 137  * Making changes to an existing RuleBasedCollator to create a new <code>Collator</code> object, by appending changes to
 138  * the existing rule: <blockquote>
 139  *
 140  * <pre>
 141  * // Create a new Collator object with additional rules
 142  * String addRules = "&amp; C &lt; ch, cH, Ch, CH";
 143  * RuleBasedCollator myCollator =
 144  *     new RuleBasedCollator(en_USCollator.getRules() + addRules);
 145  * // myCollator contains the new rules
 146  * </pre>
 147  *
 148  * </blockquote>
 149  *
 150  * How to change the order of non-spacing accents: <blockquote>
 151  *
 152  * <pre>
 153  * // old rule with main accents
 154  * String oldRules = "= &#92;u0301 ; &#92;u0300 ; &#92;u0302 ; &#92;u0308 "
 155  *                 + "; &#92;u0327 ; &#92;u0303 ; &#92;u0304 ; &#92;u0305 "
 156  *                 + "; &#92;u0306 ; &#92;u0307 ; &#92;u0309 ; &#92;u030A "
 157  *                 + "; &#92;u030B ; &#92;u030C ; &#92;u030D ; &#92;u030E "
 158  *                 + "; &#92;u030F ; &#92;u0310 ; &#92;u0311 ; &#92;u0312 "
 159  *                 + "&lt; a , A ; ae, AE ; &#92;u00e6 , &#92;u00c6 "
 160  *                 + "&lt; b , B &lt; c, C &lt; e, E &amp; C &lt; d , D";
 161  * // change the order of accent characters
 162  * String addOn = "&amp; &#92;u0300 ; &#92;u0308 ; &#92;u0302";
 163  * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
 164  * </pre>
 165  *
 166  * </blockquote>
 167  *
 168  * Putting in a new primary ordering before the default setting, e.g. sort English characters before or after Japanese
 169  * characters in the Japanese <code>Collator</code>: <blockquote>
 170  *
 171  * <pre>
 172  * // get en_US Collator rules
 173  * RuleBasedCollator en_USCollator
 174  *                        = (RuleBasedCollator)Collator.getInstance(Locale.US);
 175  * // add a few Japanese characters to sort before English characters
 176  * // suppose the last character before the first base letter 'a' in
 177  * // the English collation rule is &#92;u2212
 178  * String jaString = "& &#92;u2212 &lt &#92;u3041, &#92;u3042 &lt &#92;u3043, "
 179  *                   + "&#92;u3044";
 180  * RuleBasedCollator myJapaneseCollator
 181  *              = new RuleBasedCollator(en_USCollator.getRules() + jaString);
 182  * </pre>
 183  *
 184  * </blockquote>
 185  * </p>
 186  * <p>
 187  * This class is not subclassable
 188  * </p>
 189  *
 190  * @author Syn Wee Quek
 191  * @stable ICU 2.8
 192  */
 193 public final class RuleBasedCollator extends Collator {
 194     // public constructors ---------------------------------------------------
 195
 196     /**
 197      * <p>
 198      * Constructor that takes the argument rules for customization. The collator will be based on UCA, with the
 199      * attributes and re-ordering of the characters specified in the argument rules.
 200      * </p>
 201      * <p>
 202      * See the user guide's section on <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
 203      * Collation Customization</a> for details on the rule syntax.
 204      * </p>
 205      *
 206      * @param rules
 207      *            the collation rules to build the collation table from.
 208      * @exception ParseException
 209      *                and IOException thrown. ParseException thrown when argument rules have an invalid syntax.
 210      *                IOException thrown when an error occured while reading internal data.
 211      * @stable ICU 2.8
 212      */
 213     public RuleBasedCollator(String rules) throws Exception {
 214         checkUCA();
 215         if (rules == null) {
 216             throw new IllegalArgumentException("Collation rules can not be null");
 217         }
 218         init(rules);
 219     }
 220
 221     // public methods --------------------------------------------------------
 222
 223     /**
 224      * Clones the RuleBasedCollator
 225      *
 226      * @return a new instance of this RuleBasedCollator object
 227      * @stable ICU 2.8
 228      */
 229     public Object clone() throws CloneNotSupportedException {
 230         return clone(isFrozen());
 231     }
 232
 233     /**
 234      * Clones the RuleBasedCollator
 235      *
 236      * @param frozen should the clone be frozen or not
 237      * @return a new instance of this RuleBasedCollator object
 238      */
 239     private Object clone(boolean frozen) throws CloneNotSupportedException {
 240         //TODO: once buffer and threading issue is resolved have frozen clone just return itself
 241         RuleBasedCollator result = (RuleBasedCollator) super.clone();
 242         if (latinOneCEs_ != null) {
 243             result.m_reallocLatinOneCEs_ = true;
 244             result.m_ContInfo_ = new ContractionInfo();
 245         }
 246
 247         // since all collation data in the RuleBasedCollator do not change
 248         // we can safely assign the result.fields to this collator
 249         // except in cases where we can't
 250         result.collationBuffer = null;
 251         result.frozenLock = frozen ? new ReentrantLock() : null;
 252         return result;
 253     }
 254
 255     /**
 256      * Return a CollationElementIterator for the given String.
 257      *
 258      * @see CollationElementIterator
 259      * @stable ICU 2.8
 260      */
 261     public CollationElementIterator getCollationElementIterator(String source) {
 262         return new CollationElementIterator(source, this);
 263     }
 264
 265     /**
 266      * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be
 267      * preserved since a new copy will be created for use.
 268      *
 269      * @see CollationElementIterator
 270      * @stable ICU 2.8
 271      */
 272     public CollationElementIterator getCollationElementIterator(CharacterIterator source) {
 273         CharacterIterator newsource = (CharacterIterator) source.clone();
 274         return new CollationElementIterator(newsource, this);
 275     }
 276
 277     /**
 278      * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be
 279      * preserved since a new copy will be created for use.
 280      *
 281      * @see CollationElementIterator
 282      * @stable ICU 2.8
 283      */
 284     public CollationElementIterator getCollationElementIterator(UCharacterIterator source) {
 285         return new CollationElementIterator(source, this);
 286     }
 287
 288     // Freezable interface implementation -------------------------------------------------
 289
 290     /**
 291      * Determines whether the object has been frozen or not.
 292      * @stable ICU 4.8
 293      */
 294     public boolean isFrozen() {
 295         return frozenLock != null;
 296     }
 297
 298     /**
 299      * Freezes the collator.
 300      * @return the collator itself.
 301      * @stable ICU 4.8
 302      */
 303     public Collator freeze() {
 304         if (!isFrozen()) {
 305             frozenLock = new ReentrantLock();
 306         }
 307         return this;
 308     }
 309
 310     /**
 311      * Provides for the clone operation. Any clone is initially unfrozen.
 312      * @stable ICU 4.8
 313      */
 314     public RuleBasedCollator cloneAsThawed() {
 315         RuleBasedCollator clone = null;
 316         try {
 317             clone = (RuleBasedCollator) clone(false);
 318         } catch (CloneNotSupportedException e) {
 319             // Clone is implemented
 320         }
 321         return clone;
 322     }
 323
 324     // public setters --------------------------------------------------------
 325
 326     /**
 327      * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator
 328      * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a
 329      * correct JIS collation order, distinguishing between Katakana and Hiragana characters.
 330      *
 331      * This attribute is an implementation detail of the CLDR Japanese tailoring.
 332      * The implementation might change to use a different mechanism
 333      * to achieve the same Japanese sort order.
 334      * Since ICU 50, this attribute is not settable any more via API functions.
 335      *
 336      * @param flag
 337      *            true if Hiragana Quaternary mode is to be on, false otherwise
 338      * @see #setHiraganaQuaternaryDefault
 339      * @see #isHiraganaQuaternary
 340      * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation.
 341      */
 342     public void setHiraganaQuaternary(boolean flag) {
 343         if (isFrozen()) {
 344             throw new UnsupportedOperationException("Attempt to modify frozen object");
 345         }
 346     }
 347
 348     /**
 349      * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See
 350      * setHiraganaQuaternary(boolean) for more details.
 351      *
 352      * This attribute is an implementation detail of the CLDR Japanese tailoring.
 353      * The implementation might change to use a different mechanism
 354      * to achieve the same Japanese sort order.
 355      * Since ICU 50, this attribute is not settable any more via API functions.
 356      *
 357      * @see #setHiraganaQuaternary(boolean)
 358      * @see #isHiraganaQuaternary
 359      * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation.
 360      */
 361     public void setHiraganaQuaternaryDefault() {
 362         if (isFrozen()) {
 363             throw new UnsupportedOperationException("Attempt to modify frozen object");
 364         }
 365     }
 366
 367     /**
 368      * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. The
 369      * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case
 370      * characters first.
 371      *
 372      * @param upperfirst
 373      *            true to sort uppercase characters before lowercase characters, false to sort lowercase characters
 374      *            before uppercase characters
 375      * @see #isLowerCaseFirst
 376      * @see #isUpperCaseFirst
 377      * @see #setLowerCaseFirst
 378      * @see #setCaseFirstDefault
 379      * @stable ICU 2.8
 380      */
 381     public void setUpperCaseFirst(boolean upperfirst) {
 382         if (isFrozen()) {
 383             throw new UnsupportedOperationException("Attempt to modify frozen object");
 384         }
 385
 386         if (upperfirst) {
 387             if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
 388                 latinOneRegenTable_ = true;
 389             }
 390             m_caseFirst_ = AttributeValue.UPPER_FIRST_;
 391         } else {
 392             if (m_caseFirst_ != AttributeValue.OFF_) {
 393                 latinOneRegenTable_ = true;
 394             }
 395             m_caseFirst_ = AttributeValue.OFF_;
 396         }
 397         updateInternalState();
 398     }
 399
 400     /**
 401      * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The
 402      * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper
 403      * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences.
 404      *
 405      * @param lowerfirst
 406      *            true for sorting lower cased characters before upper cased characters, false to ignore case
 407      *            preferences.
 408      * @see #isLowerCaseFirst
 409      * @see #isUpperCaseFirst
 410      * @see #setUpperCaseFirst
 411      * @see #setCaseFirstDefault
 412      * @stable ICU 2.8
 413      */
 414     public void setLowerCaseFirst(boolean lowerfirst) {
 415         if (isFrozen()) {
 416             throw new UnsupportedOperationException("Attempt to modify frozen object");
 417         }
 418
 419         if (lowerfirst) {
 420             if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
 421                 latinOneRegenTable_ = true;
 422             }
 423             m_caseFirst_ = AttributeValue.LOWER_FIRST_;
 424         } else {
 425             if (m_caseFirst_ != AttributeValue.OFF_) {
 426                 latinOneRegenTable_ = true;
 427             }
 428             m_caseFirst_ = AttributeValue.OFF_;
 429         }
 430         updateInternalState();
 431     }
 432
 433     /**
 434      * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See
 435      * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details.
 436      *
 437      * @see #isLowerCaseFirst
 438      * @see #isUpperCaseFirst
 439      * @see #setLowerCaseFirst(boolean)
 440      * @see #setUpperCaseFirst(boolean)
 441      * @stable ICU 2.8
 442      */
 443     public final void setCaseFirstDefault() {
 444         if (isFrozen()) {
 445             throw new UnsupportedOperationException("Attempt to modify frozen object");
 446         }
 447
 448         if (m_caseFirst_ != m_defaultCaseFirst_) {
 449             latinOneRegenTable_ = true;
 450         }
 451         m_caseFirst_ = m_defaultCaseFirst_;
 452         updateInternalState();
 453     }
 454
 455     /**
 456      * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See
 457      * setAlternateHandling(boolean) for more details.
 458      *
 459      * @see #setAlternateHandlingShifted(boolean)
 460      * @see #isAlternateHandlingShifted()
 461      * @stable ICU 2.8
 462      */
 463     public void setAlternateHandlingDefault() {
 464         if (isFrozen()) {
 465             throw new UnsupportedOperationException("Attempt to modify frozen object");
 466         }
 467
 468         m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
 469         updateInternalState();
 470     }
 471
 472     /**
 473      * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See
 474      * setCaseLevel(boolean) for more details.
 475      *
 476      * @see #setCaseLevel(boolean)
 477      * @see #isCaseLevel
 478      * @stable ICU 2.8
 479      */
 480     public void setCaseLevelDefault() {
 481         if (isFrozen()) {
 482             throw new UnsupportedOperationException("Attempt to modify frozen object");
 483         }
 484
 485         m_isCaseLevel_ = m_defaultIsCaseLevel_;
 486         updateInternalState();
 487     }
 488
 489     /**
 490      * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See
 491      * setDecomposition(int) for more details.
 492      *
 493      * @see #getDecomposition
 494      * @see #setDecomposition(int)
 495      * @stable ICU 2.8
 496      */
 497     public void setDecompositionDefault() {
 498         if (isFrozen()) {
 499             throw new UnsupportedOperationException("Attempt to modify frozen object");
 500         }
 501
 502         setDecomposition(m_defaultDecomposition_);
 503         updateInternalState();
 504     }
 505
 506     /**
 507      * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See
 508      * setFrenchCollation(boolean) for more details.
 509      *
 510      * @see #isFrenchCollation
 511      * @see #setFrenchCollation(boolean)
 512      * @stable ICU 2.8
 513      */
 514     public void setFrenchCollationDefault() {
 515         if (isFrozen()) {
 516             throw new UnsupportedOperationException("Attempt to modify frozen object");
 517         }
 518
 519         if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
 520             latinOneRegenTable_ = true;
 521         }
 522         m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
 523         updateInternalState();
 524     }
 525
 526     /**
 527      * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See
 528      * setStrength(int) for more details.
 529      *
 530      * @see #setStrength(int)
 531      * @see #getStrength
 532      * @stable ICU 2.8
 533      */
 534     public void setStrengthDefault() {
 535         setStrength(m_defaultStrength_);
 536         updateInternalState();
 537     }
 538
 539     /**
 540      * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator
 541      * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER
 542      * '2'
 543      *
 544      * @see #getNumericCollation
 545      * @see #setNumericCollation
 546      * @stable ICU 2.8
 547      */
 548     public void setNumericCollationDefault() {
 549         if (isFrozen()) {
 550             throw new UnsupportedOperationException("Attempt to modify frozen object");
 551         }
 552
 553         setNumericCollation(m_defaultIsNumericCollation_);
 554         updateInternalState();
 555     }
 556
 557     /**
 558      * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false,
 559      * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted
 560      * backwards. See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
 561      * French collation</a> for more information.
 562      *
 563      * @param flag
 564      *            true to set the French collation on, false to set it off
 565      * @stable ICU 2.8
 566      * @see #isFrenchCollation
 567      * @see #setFrenchCollationDefault
 568      */
 569     public void setFrenchCollation(boolean flag) {
 570         if (isFrozen()) {
 571             throw new UnsupportedOperationException("Attempt to modify frozen object");
 572         }
 573
 574         if (m_isFrenchCollation_ != flag) {
 575             latinOneRegenTable_ = true;
 576         }
 577         m_isFrenchCollation_ = flag;
 578         updateInternalState();
 579     }
 580
 581     /**
 582      * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition
 583      * on <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting"> Alternate Weighting</a>. This
 584      * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false,
 585      * corresponding to the NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the RuleBasedCollator will treats all
 586      * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour
 587      * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the
 588      * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order.
 589      *
 590      * @param shifted
 591      *            true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour.
 592      * @see #isAlternateHandlingShifted
 593      * @see #setAlternateHandlingDefault
 594      * @stable ICU 2.8
 595      */
 596     public void setAlternateHandlingShifted(boolean shifted) {
 597         if (isFrozen()) {
 598             throw new UnsupportedOperationException("Attempt to modify frozen object");
 599         }
 600
 601         m_isAlternateHandlingShifted_ = shifted;
 602         updateInternalState();
 603     }
 604
 605     /**
 606      * <p>
 607      * When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known
 608      * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level
 609      * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value
 610      * is false, which means the case level is not generated. The contents of the case level are affected by the case
 611      * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable
 612      * case level.
 613      * </p>
 614      * <p>
 615      * See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html"> case
 616      * level</a> for more information.
 617      * </p>
 618      *
 619      * @param flag
 620      *            true if case level sorting is required, false otherwise
 621      * @stable ICU 2.8
 622      * @see #setCaseLevelDefault
 623      * @see #isCaseLevel
 624      */
 625     public void setCaseLevel(boolean flag) {
 626         if (isFrozen()) {
 627             throw new UnsupportedOperationException("Attempt to modify frozen object");
 628         }
 629
 630         m_isCaseLevel_ = flag;
 631         updateInternalState();
 632     }
 633
 634     /**
 635      * <p>
 636      * Sets this Collator's strength property. The strength property determines the minimum level of difference
 637      * considered significant during comparison.
 638      * </p>
 639      * <p>
 640      * See the Collator class description for an example of use.
 641      * </p>
 642      *
 643      * @param newStrength
 644      *            the new strength value.
 645      * @see #getStrength
 646      * @see #setStrengthDefault
 647      * @see #PRIMARY
 648      * @see #SECONDARY
 649      * @see #TERTIARY
 650      * @see #QUATERNARY
 651      * @see #IDENTICAL
 652      * @exception IllegalArgumentException
 653      *                If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
 654      * @stable ICU 2.8
 655      */
 656     public void setStrength(int newStrength) {
 657         super.setStrength(newStrength);
 658         updateInternalState();
 659     }
 660
 661     /**
 662      * <p>
 663      * Variable top is a two byte primary value which causes all the codepoints with primary values that are less or
 664      * equal than the variable top to be shifted when alternate handling is set to SHIFTED.
 665      * </p>
 666      * <p>
 667      * Sets the variable top to a collation element value of a string supplied.
 668      * </p>
 669      *
 670      * @param varTop
 671      *            one or more (if contraction) characters to which the variable top should be set
 672      * @return a int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined.
 673      * @exception IllegalArgumentException
 674      *                is thrown if varTop argument is not a valid variable top element. A variable top element is
 675      *                invalid when
 676      *                <ul>
 677      *                <li>it is a contraction that does not exist in the Collation order
 678      *                <li>when the PRIMARY strength collation element for the variable top has more than two bytes
 679      *                <li>when the varTop argument is null or zero in length.
 680      *                </ul>
 681      * @see #getVariableTop
 682      * @see RuleBasedCollator#setAlternateHandlingShifted
 683      * @stable ICU 2.6
 684      */
 685     public int setVariableTop(String varTop) {
 686         if (isFrozen()) {
 687             throw new UnsupportedOperationException("Attempt to modify frozen object");
 688         }
 689
 690         if (varTop == null || varTop.length() == 0) {
 691             throw new IllegalArgumentException("Variable top argument string can not be null or zero in length.");
 692         }
 693
 694         CollationBuffer buffer = null;
 695         try {
 696             buffer = getCollationBuffer();
 697             return setVariableTop(varTop, buffer);
 698         } finally {
 699             releaseCollationBuffer(buffer);
 700         }
 701
 702     }
 703
 704     private int setVariableTop(String varTop, CollationBuffer buffer) {
 705         buffer.m_srcUtilColEIter_.setText(varTop);
 706         int ce = buffer.m_srcUtilColEIter_.next();
 707
 708         // here we check if we have consumed all characters
 709         // you can put in either one character or a contraction
 710         // you shouldn't put more...
 711         if (buffer.m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) {
 712             throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist "
 713                     + "in the Collation order");
 714         }
 715
 716         int nextCE = buffer.m_srcUtilColEIter_.next();
 717
 718         if ((nextCE != CollationElementIterator.NULLORDER)
 719                 && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
 720             throw new IllegalArgumentException("Variable top argument string can only have a single collation "
 721                     + "element that has less than or equal to two PRIMARY strength " + "bytes");
 722         }
 723
 724         m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
 725
 726         return ce & CE_PRIMARY_MASK_;
 727     }
 728
 729     /**
 730      * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16
 731      * bits are ignored.
 732      *
 733      * @param varTop
 734      *            Collation element value, as returned by setVariableTop or getVariableTop
 735      * @see #getVariableTop
 736      * @see #setVariableTop(String)
 737      * @stable ICU 2.6
 738      */
 739     public void setVariableTop(int varTop) {
 740         if (isFrozen()) {
 741             throw new UnsupportedOperationException("Attempt to modify frozen object");
 742         }
 743
 744         m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
 745     }
 746
 747     /**
 748      * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings
 749      * of digits. This is a way to get '100' to sort AFTER '2'
 750      *
 751      * @param flag
 752      *            true to turn numeric collation on and false to turn it off
 753      * @see #getNumericCollation
 754      * @see #setNumericCollationDefault
 755      * @stable ICU 2.8
 756      */
 757     public void setNumericCollation(boolean flag) {
 758         if (isFrozen()) {
 759             throw new UnsupportedOperationException("Attempt to modify frozen object");
 760         }
 761
 762         // sort substrings of digits as numbers
 763         m_isNumericCollation_ = flag;
 764         updateInternalState();
 765     }
 766
 767     /**
 768      * Sets the reordering codes for this collator.
 769      * Collation reordering allows scripts and some other defined blocks of characters
 770      * to be moved relative to each other as a block. This reordering is done on top of
 771      * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
 772      * at the start and/or the end of the collation order.
 773      * <p>By default, reordering codes specified for the start of the order are placed in the
 774      * order given after a group of “special” non-script blocks. These special groups of characters
 775      * are space, punctuation, symbol, currency, and digit. These special groups are represented with
 776      * {@link Collator.ReorderCodes}. Script groups can be intermingled with
 777      * these special non-script blocks if those special blocks are explicitly specified in the reordering.
 778      * <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS} stands for any script that is not explicitly
 779      * mentioned in the list of reordering codes given. Anything that is after {@link Collator.ReorderCodes#OTHERS OTHERS}
 780      * will go at the very end of the reordering in the order given.
 781      * <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT} will reset the reordering for this collator
 782      * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
 783      * was specified when this collator was created from resource data or from rules. The
 784      * {@link Collator.ReorderCodes#DEFAULT DEFAULT} code <b>must</b> be the sole code supplied when it used. If not
 785      * that will result in an {@link IllegalArgumentException} being thrown.
 786      * <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE} will remove any reordering for this collator.
 787      * The result of setting no reordering will be to have the DUCET/CLDR reordering used. The
 788      * {@link Collator.ReorderCodes#NONE NONE} code <b>must</b> be the sole code supplied when it used.
 789      * @param order the reordering codes to apply to this collator; if this is null or an empty array
 790      * then this clears any existing reordering
 791      * @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts)
 792      * @see #getReorderCodes
 793      * @see #getEquivalentReorderCodes
 794      * @stable ICU 4.8
 795      */
 796     public void setReorderCodes(int... order) {
 797         if (isFrozen()) {
 798             throw new UnsupportedOperationException("Attempt to modify frozen object");
 799         }
 800
 801         if (order != null && order.length > 0) {
 802             m_reorderCodes_ = order.clone();
 803         } else {
 804             m_reorderCodes_ = null;
 805         }
 806         buildPermutationTable();
 807     }
 808
 809     // public getters --------------------------------------------------------
 810
 811     /**
 812      * Gets the collation tailoring rules for this RuleBasedCollator.
 813      * Equivalent to String getRules(false).
 814      *
 815      * @return the collation tailoring rules
 816      * @see #getRules(boolean)
 817      * @stable ICU 2.8
 818      */
 819     public String getRules() {
 820         return m_rules_;
 821     }
 822
 823     /**
 824      * Returns current rules. The argument defines whether full rules (UCA + tailored) rules are returned or just the
 825      * tailoring.
 826      *
 827      * <p>The "UCA rules" are an <i>approximation</i> of the root collator's sort order.
 828      * They are almost never used or useful at runtime and can be removed from the data.
 829      * See <a href="http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales">User Guide:
 830      * Collation Customization, Building on Existing Locales</a>
 831      *
 832      * <p>{@link #getRules()} should normally be used instead.
 833      * @param fullrules
 834      *            true if the rules that defines the full set of collation order is required, otherwise false for
 835      *            returning only the tailored rules
 836      * @return the current rules that defines this Collator.
 837      * @see #getRules()
 838      * @stable ICU 2.6
 839      */
 840     public String getRules(boolean fullrules) {
 841         if (!fullrules) {
 842             return m_rules_;
 843         }
 844         // take the UCA rules and append real rules at the end
 845         return UCA_.m_rules_.concat(m_rules_);
 846     }
 847
 848     /**
 849      * Get an UnicodeSet that contains all the characters and sequences tailored in this collator.
 850      *
 851      * @return a pointer to a UnicodeSet object containing all the code points and sequences that may sort differently
 852      *         than in the UCA.
 853      * @stable ICU 2.4
 854      */
 855     public UnicodeSet getTailoredSet() {
 856         try {
 857             CollationRuleParser src = new CollationRuleParser(getRules());
 858             return src.getTailoredSet();
 859         } catch (Exception e) {
 860             throw new IllegalStateException("A tailoring rule should not " + "have errors. Something is quite wrong!");
 861         }
 862     }
 863
 864     private static class contContext {
 865         RuleBasedCollator coll;
 866         UnicodeSet contractions;
 867         UnicodeSet expansions;
 868         UnicodeSet removedContractions;
 869         boolean addPrefixes;
 870
 871         contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
 872                 UnicodeSet removedContractions, boolean addPrefixes) {
 873             this.coll = coll;
 874             this.contractions = contractions;
 875             this.expansions = expansions;
 876             this.removedContractions = removedContractions;
 877             this.addPrefixes = addPrefixes;
 878         }
 879     }
 880
 881     private void addSpecial(contContext c, StringBuilder buffer, int CE) {
 882         StringBuilder b = new StringBuilder();
 883         int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
 884         int newCE = c.coll.m_contractionCE_[offset];
 885         // we might have a contraction that ends from previous level
 886         if (newCE != CollationElementIterator.CE_NOT_FOUND_) {
 887             if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE)
 888                     && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
 889                 addSpecial(c, buffer, newCE);
 890             }
 891             if (buffer.length() > 1) {
 892                 if (c.contractions != null) {
 893                     c.contractions.add(buffer.toString());
 894                 }
 895                 if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
 896                     c.expansions.add(buffer.toString());
 897                 }
 898             }
 899         }
 900
 901         offset++;
 902         // check whether we're doing contraction or prefix
 903         if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
 904             while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
 905                 b.delete(0, b.length());
 906                 b.append(buffer);
 907                 newCE = c.coll.m_contractionCE_[offset];
 908                 b.insert(0, c.coll.m_contractionIndex_[offset]);
 909                 if (isSpecial(newCE)
 910                         && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
 911                     addSpecial(c, b, newCE);
 912                 } else {
 913                     if (c.contractions != null) {
 914                         c.contractions.add(b.toString());
 915                     }
 916                     if (c.expansions != null && isSpecial(newCE)
 917                             && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
 918                         c.expansions.add(b.toString());
 919                     }
 920                 }
 921                 offset++;
 922             }
 923         } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
 924             while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
 925                 b.delete(0, b.length());
 926                 b.append(buffer);
 927                 newCE = c.coll.m_contractionCE_[offset];
 928                 b.append(c.coll.m_contractionIndex_[offset]);
 929                 if (isSpecial(newCE)
 930                         && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
 931                     addSpecial(c, b, newCE);
 932                 } else {
 933                     if (c.contractions != null) {
 934                         c.contractions.add(b.toString());
 935                     }
 936                     if (c.expansions != null && isSpecial(newCE)
 937                             && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
 938                         c.expansions.add(b.toString());
 939                     }
 940                 }
 941                 offset++;
 942             }
 943         }
 944     }
 945
 946     private void processSpecials(contContext c) {
 947         int internalBufferSize = 512;
 948         TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_);
 949         RangeValueIterator.Element element = new RangeValueIterator.Element();
 950         while (trieiterator.next(element)) {
 951             int start = element.start;
 952             int limit = element.limit;
 953             int CE = element.value;
 954             StringBuilder contraction = new StringBuilder(internalBufferSize);
 955
 956             if (isSpecial(CE)) {
 957                 if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
 958                     while (start < limit) {
 959                         // if there are suppressed contractions, we don't
 960                         // want to add them.
 961                         if (c.removedContractions != null && c.removedContractions.contains(start)) {
 962                             start++;
 963                             continue;
 964                         }
 965                         // we start our contraction from middle, since we don't know if it
 966                         // will grow toward right or left
 967                         contraction.append((char) start);
 968                         addSpecial(c, contraction, CE);
 969                         start++;
 970                     }
 971                 } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
 972                     while (start < limit) {
 973                         c.expansions.add(start++);
 974                     }
 975                 }
 976             }
 977         }
 978     }
 979
 980     /**
 981      * Gets unicode sets containing contractions and/or expansions of a collator
 982      *
 983      * @param contractions
 984      *            if not null, set to contain contractions
 985      * @param expansions
 986      *            if not null, set to contain expansions
 987      * @param addPrefixes
 988      *            add the prefix contextual elements to contractions
 989      * @throws Exception
 990      *             Throws an exception if any errors occurs.
 991      * @stable ICU 3.4
 992      */
 993     public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes)
 994     throws Exception {
 995         if (contractions != null) {
 996             contractions.clear();
 997         }
 998         if (expansions != null) {
 999             expansions.clear();
1000         }
1001         String rules = getRules();
1002         try {
1003             CollationRuleParser src = new CollationRuleParser(rules);
1004             contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_,
1005                     addPrefixes);
1006
1007             // Add the UCA contractions
1008             processSpecials(c);
1009             // This is collator specific. Add contractions from a collator
1010             c.coll = this;
1011             c.removedContractions = null;
1012             processSpecials(c);
1013         } catch (Exception e) {
1014             throw e;
1015         }
1016     }
1017
1018     /**
1019      * <p>
1020      * Get a Collation key for the argument String source from this RuleBasedCollator.
1021      * </p>
1022      * <p>
1023      * General recommendation: <br>
1024      * If comparison are to be done to the same String multiple times, it would be more efficient to generate
1025      * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If the each
1026      * Strings are compared to only once, using the method RuleBasedCollator.compare(String, String) will have a better
1027      * performance.
1028      * </p>
1029      * <p>
1030      * See the class documentation for an explanation about CollationKeys.
1031      * </p>
1032      *
1033      * @param source
1034      *            the text String to be transformed into a collation key.
1035      * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source
1036      *         String is null, a null CollationKey is returned.
1037      * @see CollationKey
1038      * @see #compare(String, String)
1039      * @see #getRawCollationKey
1040      * @stable ICU 2.8
1041      */
1042     public CollationKey getCollationKey(String source) {
1043         if (source == null) {
1044             return null;
1045         }
1046         CollationBuffer buffer = null;
1047         try {
1048             buffer = getCollationBuffer();
1049             return getCollationKey(source, buffer);
1050         } finally {
1051             releaseCollationBuffer(buffer);
1052         }
1053     }
1054
1055     private CollationKey getCollationKey(String source, CollationBuffer buffer) {
1056         buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_, buffer);
1057         return new CollationKey(source, buffer.m_utilRawCollationKey_);
1058     }
1059
1060     /**
1061      * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the
1062      * result into the user provided argument key. If key has a internal byte array of length that's too small for the
1063      * result, the internal byte array will be grown to the exact required size.
1064      *
1065      * @param source the text String to be transformed into a RawCollationKey
1066      * @param key output RawCollationKey to store results
1067      * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user
1068      *         provided key will be returned.
1069      * @see #getCollationKey
1070      * @see #compare(String, String)
1071      * @see RawCollationKey
1072      * @stable ICU 2.8
1073      */
1074     public RawCollationKey getRawCollationKey(String source, RawCollationKey key) {
1075         if (source == null) {
1076             return null;
1077         }
1078         CollationBuffer buffer = null;
1079         try {
1080             buffer = getCollationBuffer();
1081             return getRawCollationKey(source, key, buffer);
1082         } finally {
1083             releaseCollationBuffer(buffer);
1084         }
1085     }
1086
1087     private RawCollationKey getRawCollationKey(String source, RawCollationKey key, CollationBuffer buffer) {
1088         int strength = getStrength();
1089         buffer.m_utilCompare0_ = m_isCaseLevel_;
1090         // m_utilCompare1_ = true;
1091         buffer.m_utilCompare2_ = strength >= SECONDARY;
1092         buffer.m_utilCompare3_ = strength >= TERTIARY;
1093         buffer.m_utilCompare4_ = strength >= QUATERNARY;
1094         buffer.m_utilCompare5_ = strength == IDENTICAL;
1095
1096         boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
1097         // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
1098         // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
1099         // high.
1100         int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
1101         byte hiragana4 = 0;
1102         if (m_isHiragana4_ && buffer.m_utilCompare4_) {
1103             // allocate one more space for hiragana, value for hiragana
1104             hiragana4 = (byte) commonBottom4;
1105             commonBottom4++;
1106         }
1107
1108         int bottomCount4 = 0xFF - commonBottom4;
1109         // If we need to normalize, we'll do it all at once at the beginning!
1110         if (buffer.m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
1111             // if it is identical strength, we have to normalize the string to
1112             // NFD so that it will be appended correctly to the end of the sort
1113             // key
1114             source = Normalizer.decompose(source, false);
1115         } else if (getDecomposition() != NO_DECOMPOSITION
1116                 && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) {
1117             // for the rest of the strength, if decomposition is on, FCD is
1118             // enough for us to work on.
1119             source = Normalizer.normalize(source, Normalizer.FCD);
1120         }
1121         getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4, buffer);
1122         if (key == null) {
1123             key = new RawCollationKey();
1124         }
1125         getSortKey(source, doFrench, commonBottom4, bottomCount4, key, buffer);
1126         return key;
1127     }
1128
1129     /**
1130      * Return true if an uppercase character is sorted before the corresponding lowercase character. See
1131      * setCaseFirst(boolean) for details.
1132      *
1133      * @see #setUpperCaseFirst
1134      * @see #setLowerCaseFirst
1135      * @see #isLowerCaseFirst
1136      * @see #setCaseFirstDefault
1137      * @return true if upper cased characters are sorted before lower cased characters, false otherwise
1138      * @stable ICU 2.8
1139      */
1140     public boolean isUpperCaseFirst() {
1141         return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
1142     }
1143
1144     /**
1145      * Return true if a lowercase character is sorted before the corresponding uppercase character. See
1146      * setCaseFirst(boolean) for details.
1147      *
1148      * @see #setUpperCaseFirst
1149      * @see #setLowerCaseFirst
1150      * @see #isUpperCaseFirst
1151      * @see #setCaseFirstDefault
1152      * @return true lower cased characters are sorted before upper cased characters, false otherwise
1153      * @stable ICU 2.8
1154      */
1155     public boolean isLowerCaseFirst() {
1156         return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
1157     }
1158
1159     /**
1160      * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true,
1161      * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the
1162      * alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more
1163      * details.
1164      *
1165      * @return true or false
1166      * @see #setAlternateHandlingShifted(boolean)
1167      * @see #setAlternateHandlingDefault
1168      * @stable ICU 2.8
1169      */
1170     public boolean isAlternateHandlingShifted() {
1171         return m_isAlternateHandlingShifted_;
1172     }
1173
1174     /**
1175      * Checks if case level is set to true. See setCaseLevel(boolean) for details.
1176      *
1177      * @return the case level mode
1178      * @see #setCaseLevelDefault
1179      * @see #isCaseLevel
1180      * @see #setCaseLevel(boolean)
1181      * @stable ICU 2.8
1182      */
1183     public boolean isCaseLevel() {
1184         return m_isCaseLevel_;
1185     }
1186
1187     /**
1188      * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details.
1189      *
1190      * @return true if French Collation is set to true, false otherwise
1191      * @see #setFrenchCollation(boolean)
1192      * @see #setFrenchCollationDefault
1193      * @stable ICU 2.8
1194      */
1195     public boolean isFrenchCollation() {
1196         return m_isFrenchCollation_;
1197     }
1198
1199     /**
1200      * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details.
1201      *
1202      * This attribute is an implementation detail of the CLDR Japanese tailoring.
1203      * The implementation might change to use a different mechanism
1204      * to achieve the same Japanese sort order.
1205      * Since ICU 50, this attribute is not settable any more via API functions.
1206      *
1207      * @return flag true if Hiragana Quaternary mode is on, false otherwise
1208      * @see #setHiraganaQuaternaryDefault
1209      * @see #setHiraganaQuaternary(boolean)
1210      * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation.
1211      */
1212     public boolean isHiraganaQuaternary() {
1213         return m_isHiragana4_;
1214     }
1215
1216     /**
1217      * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored.
1218      *
1219      * @return the variable top value of a Collator.
1220      * @see #setVariableTop
1221      * @stable ICU 2.6
1222      */
1223     public int getVariableTop() {
1224         return m_variableTopValue_ << 16;
1225     }
1226
1227     /**
1228      * Method to retrieve the numeric collation value. When numeric collation is turned on, this Collator generates a
1229      * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2'
1230      *
1231      * @see #setNumericCollation
1232      * @see #setNumericCollationDefault
1233      * @return true if numeric collation is turned on, false otherwise
1234      * @stable ICU 2.8
1235      */
1236     public boolean getNumericCollation() {
1237         return m_isNumericCollation_;
1238     }
1239
1240     /**
1241      * Retrieves the reordering codes for this collator.
1242      * These reordering codes are a combination of UScript codes and ReorderCodes.
1243      * @return a copy of the reordering codes for this collator;
1244      * if none are set then returns an empty array
1245      * @see #setReorderCodes
1246      * @see #getEquivalentReorderCodes
1247      * @stable ICU 4.8
1248      */
1249     public int[] getReorderCodes() {
1250         if (m_reorderCodes_ != null) {
1251             return m_reorderCodes_.clone();
1252         } else {
1253             return LeadByteConstants.EMPTY_INT_ARRAY;
1254         }
1255     }
1256
1257     /**
1258      * Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
1259      * codes are grouped and must reorder together.
1260      *
1261      * @param reorderCode code for which equivalents to be retrieved
1262      * @return the set of all reorder codes in the same group as the given reorder code.
1263      * @see #setReorderCodes
1264      * @see #getReorderCodes
1265      * @stable ICU 4.8
1266      */
1267     public static int[] getEquivalentReorderCodes(int reorderCode) {
1268         Set<Integer> equivalentCodesSet = new HashSet<Integer>();
1269         int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode);
1270         for (int leadByte : leadBytes) {
1271             int[] codes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte);
1272             for (int code : codes) {
1273                 equivalentCodesSet.add(code);
1274             }
1275         }
1276         int[] equivalentCodes = new int[equivalentCodesSet.size()];
1277         int i = 0;
1278         for (int code : equivalentCodesSet) {
1279             equivalentCodes[i++] = code;
1280         }
1281         return equivalentCodes;
1282     }
1283
1284     // public other methods -------------------------------------------------
1285
1286     /**
1287      * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same
1288      * collation rules and the same attributes.
1289      *
1290      * @param obj
1291      *            the RuleBasedCollator to be compared to.
1292      * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise.
1293      * @stable ICU 2.8
1294      */
1295     public boolean equals(Object obj) {
1296         if (obj == null) {
1297             return false; // super does class check
1298         }
1299         if (this == obj) {
1300             return true;
1301         }
1302         if (getClass() != obj.getClass()) {
1303             return false;
1304         }
1305         RuleBasedCollator other = (RuleBasedCollator) obj;
1306         // all other non-transient information is also contained in rules.
1307         if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition()
1308                 || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_
1309                 || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_
1310                 || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_
1311                 || other.m_isHiragana4_ != m_isHiragana4_) {
1312             return false;
1313         }
1314         if (m_reorderCodes_ != null ^ other.m_reorderCodes_ != null) {
1315             return false;
1316         }
1317         if (m_reorderCodes_ != null) {
1318             if (m_reorderCodes_.length != other.m_reorderCodes_.length) {
1319                 return false;
1320             }
1321             for (int i = 0; i < m_reorderCodes_.length; i++) {
1322                 if (m_reorderCodes_[i] != other.m_reorderCodes_[i]) {
1323                     return false;
1324                 }
1325             }
1326         }
1327         boolean rules = m_rules_ == other.m_rules_;
1328         if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
1329             rules = m_rules_.equals(other.m_rules_);
1330         }
1331         if (!rules || !ICUDebug.enabled("collation")) {
1332             return rules;
1333         }
1334         if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_
1335                 || m_bottomCount3_ != other.m_bottomCount3_ || m_common3_ != other.m_common3_
1336                 || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_
1337                 || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_
1338                 || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_
1339                 || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
1340             return false;
1341         }
1342         if (!m_trie_.equals(other.m_trie_)) {
1343             // we should use the trie iterator here, but then this part is
1344             // only used in the test.
1345             for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) {
1346                 int v = m_trie_.getCodePointValue(i);
1347                 int otherv = other.m_trie_.getCodePointValue(i);
1348                 if (v != otherv) {
1349                     int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
1350                     if (mask == (otherv & 0xff000000)) {
1351                         v &= 0xffffff;
1352                         otherv &= 0xffffff;
1353                         if (mask == 0xf1000000) {
1354                             v -= (m_expansionOffset_ << 4);
1355                             otherv -= (other.m_expansionOffset_ << 4);
1356                         } else if (mask == 0xf2000000) {
1357                             v -= m_contractionOffset_;
1358                             otherv -= other.m_contractionOffset_;
1359                         }
1360                         if (v == otherv) {
1361                             continue;
1362                         }
1363                     }
1364                     return false;
1365                 }
1366             }
1367         }
1368         if (!Arrays.equals(m_contractionCE_, other.m_contractionCE_)
1369                 || !Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
1370                 || !Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
1371                 || !Arrays.equals(m_expansion_, other.m_expansion_)
1372                 || !Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
1373             return false;
1374         }
1375         // not comparing paddings
1376         for (int i = 0; i < m_expansionEndCE_.length; i++) {
1377             if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) {
1378                 return false;
1379             }
1380         }
1381         return true;
1382     }
1383
1384     /**
1385      * Generates a unique hash code for this RuleBasedCollator.
1386      *
1387      * @return the unique hash code for this Collator
1388      * @stable ICU 2.8
1389      */
1390     public int hashCode() {
1391         String rules = getRules();
1392         if (rules == null) {
1393             rules = "";
1394         }
1395         return rules.hashCode();
1396     }
1397
1398     /**
1399      * Compares the source text String to the target text String according to the collation rules, strength and
1400      * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero
1401      * depending on whether the source String is less than, equal to or greater than the target String. See the Collator
1402      * class description for an example of use. </p>
1403      * <p>
1404      * General recommendation: <br>
1405      * If comparison are to be done to the same String multiple times, it would be more efficient to generate
1406      * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed
1407      * performance is critical and object instantiation is to be reduced, further optimization may be achieved by
1408      * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method
1409      * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey
1410      * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key
1411      * comparisons. If the each Strings are compared to only once, using the method RuleBasedCollator.compare(String,
1412      * String) will have a better performance.
1413      * </p>
1414      *
1415      * @param source
1416      *            the source text String.
1417      * @param target
1418      *            the target text String.
1419      * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source
1420      *         and target are equal, value is greater than zero if source is greater than target.
1421      * @see CollationKey
1422      * @see #getCollationKey
1423      * @stable ICU 2.8
1424      */
1425     public int compare(String source, String target) {
1426         if (source.equals(target)) {
1427             return 0;
1428         }
1429         CollationBuffer buffer = null;
1430         try {
1431             buffer = getCollationBuffer();
1432             return compare(source, target, buffer);
1433         } finally {
1434             releaseCollationBuffer(buffer);
1435         }
1436     }
1437
1438     private int compare(String source, String target, CollationBuffer buffer) {
1439         // Find the length of any leading portion that is equal
1440         int offset = getFirstUnmatchedOffset(source, target);
1441         // return compareRegular(source, target, offset);
1442         if (latinOneUse_) {
1443             if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_)
1444                     || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) {
1445                 // source or target start with non-latin-1
1446                 return compareRegular(source, target, offset, buffer);
1447             } else {
1448                 return compareUseLatin1(source, target, offset, buffer);
1449             }
1450         } else {
1451             return compareRegular(source, target, offset, buffer);
1452         }
1453     }
1454
1455     // package private inner interfaces --------------------------------------
1456
1457     /**
1458      * Attribute values to be used when setting the Collator options
1459      */
1460     static interface AttributeValue {
1461         /**
1462          * Indicates that the default attribute value will be used. See individual attribute for details on its default
1463          * value.
1464          */
1465         static final int DEFAULT_ = -1;
1466         /**
1467          * Primary collation strength
1468          */
1469         static final int PRIMARY_ = Collator.PRIMARY;
1470         /**
1471          * Secondary collation strength
1472          */
1473         static final int SECONDARY_ = Collator.SECONDARY;
1474         /**
1475          * Tertiary collation strength
1476          */
1477         static final int TERTIARY_ = Collator.TERTIARY;
1478         /**
1479          * Default collation strength
1480          */
1481         static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
1482         /**
1483          * Internal use for strength checks in Collation elements
1484          */
1485         static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
1486         /**
1487          * Quaternary collation strength
1488          */
1489         static final int QUATERNARY_ = 3;
1490         /**
1491          * Identical collation strength
1492          */
1493         static final int IDENTICAL_ = Collator.IDENTICAL;
1494         /**
1495          * Internal use for strength checks
1496          */
1497         static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
1498         /**
1499          * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and
1500          * DECOMPOSITION_MODE
1501          */
1502         static final int OFF_ = 16;
1503         /**
1504          * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1505          */
1506         static final int ON_ = 17;
1507         /**
1508          * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
1509          */
1510         static final int SHIFTED_ = 20;
1511         /**
1512          * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable
1513          */
1514         static final int NON_IGNORABLE_ = 21;
1515         /**
1516          * Valid for CASE_FIRST - lower case sorts before upper case
1517          */
1518         static final int LOWER_FIRST_ = 24;
1519         /**
1520          * Upper case sorts before lower case
1521          */
1522         static final int UPPER_FIRST_ = 25;
1523         /**
1524          * Number of attribute values
1525          */
1526         static final int LIMIT_ = 29;
1527     }
1528
1529     /**
1530      * Attributes that collation service understands. All the attributes can take DEFAULT value, as well as the values
1531      * specific to each one.
1532      */
1533     static interface Attribute {
1534         /**
1535          * Attribute for direction of secondary weights - used in French. Acceptable values are ON, which results in
1536          * secondary weights being considered backwards and OFF which treats secondary weights in the order they appear.
1537          */
1538         static final int FRENCH_COLLATION_ = 0;
1539         /**
1540          * Attribute for handling variable elements. Acceptable values are NON_IGNORABLE (default) which treats all the
1541          * codepoints with non-ignorable primary weights in the same way, and SHIFTED which causes codepoints with
1542          * primary weights that are equal or below the variable top value to be ignored on primary level and moved to
1543          * the quaternary level.
1544          */
1545         static final int ALTERNATE_HANDLING_ = 1;
1546         /**
1547          * Controls the ordering of upper and lower case letters. Acceptable values are OFF (default), which orders
1548          * upper and lower case letters in accordance to their tertiary weights, UPPER_FIRST which forces upper case
1549          * letters to sort before lower case letters, and LOWER_FIRST which does the opposite.
1550          */
1551         static final int CASE_FIRST_ = 2;
1552         /**
1553          * Controls whether an extra case level (positioned before the third level) is generated or not. Acceptable
1554          * values are OFF (default), when case level is not generated, and ON which causes the case level to be
1555          * generated. Contents of the case level are affected by the value of CASE_FIRST attribute. A simple way to
1556          * ignore accent differences in a string is to set the strength to PRIMARY and enable case level.
1557          */
1558         static final int CASE_LEVEL_ = 3;
1559         /**
1560          * Controls whether the normalization check and necessary normalizations are performed. When set to OFF
1561          * (default) no normalization check is performed. The correctness of the result is guaranteed only if the input
1562          * data is in so-called FCD form (see users manual for more info). When set to ON, an incremental check is
1563          * performed to see whether the input data is in the FCD form. If the data is not in the FCD form, incremental
1564          * NFD normalization is performed.
1565          */
1566         static final int NORMALIZATION_MODE_ = 4;
1567         /**
1568          * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual
1569          * strength for most locales (except Japanese) is tertiary. Quaternary strength is useful when combined with
1570          * shifted setting for alternate handling attribute and for JIS x 4061 collation, when it is used to distinguish
1571          * between Katakana and Hiragana (this is achieved by setting the HIRAGANA_QUATERNARY mode to on. Otherwise,
1572          * quaternary level is affected only by the number of non ignorable code points in the string. Identical
1573          * strength is rarely useful, as it amounts to codepoints of the NFD form of the string.
1574          */
1575         static final int STRENGTH_ = 5;
1576         /**
1577          * When turned on, this attribute positions Hiragana before all non-ignorables on quaternary level. This is a
1578          * sneaky way to produce JIS sort order.
1579          */
1580         static final int HIRAGANA_QUATERNARY_MODE_ = 6;
1581         /**
1582          * Attribute count
1583          */
1584         static final int LIMIT_ = 7;
1585     }
1586
1587     /**
1588      * DataManipulate singleton
1589      */
1590     static class DataManipulate implements Trie.DataManipulate {
1591         // public methods ----------------------------------------------------
1592
1593         /**
1594          * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data.
1595          *
1596          * @param ce
1597          *            collation element of the lead surrogate
1598          * @return data offset or 0 for the next trail surrogate
1599          * @stable ICU 2.8
1600          */
1601         public final int getFoldingOffset(int ce) {
1602             if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
1603                 return (ce & 0xFFFFFF);
1604             }
1605             return 0;
1606         }
1607
1608         /**
1609          * Get singleton object
1610          */
1611         public static final DataManipulate getInstance() {
1612             if (m_instance_ == null) {
1613                 m_instance_ = new DataManipulate();
1614             }
1615             return m_instance_;
1616         }
1617
1618         // private data member ----------------------------------------------
1619
1620         /**
1621          * Singleton instance
1622          */
1623         private static DataManipulate m_instance_;
1624
1625         // private constructor ----------------------------------------------
1626
1627         /**
1628          * private to prevent initialization
1629          */
1630         private DataManipulate() {
1631         }
1632     }
1633
1634     /**
1635      * UCAConstants
1636      */
1637     static final class UCAConstants {
1638         int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1639         int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1640         int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
1641         int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
1642         int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
1643         int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
1644         int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
1645         int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
1646         int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
1647         int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
1648         int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
1649         int FIRST_IMPLICIT_[] = new int[2];
1650         int LAST_IMPLICIT_[] = new int[2];
1651         int FIRST_TRAILING_[] = new int[2];
1652         int LAST_TRAILING_[] = new int[2];
1653         int PRIMARY_TOP_MIN_;
1654         int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
1655         int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
1656         int PRIMARY_TRAILING_MIN_; // 0xE8000000
1657         int PRIMARY_TRAILING_MAX_; // 0xF0000000
1658         int PRIMARY_SPECIAL_MIN_; // 0xE8000000
1659         int PRIMARY_SPECIAL_MAX_; // 0xF0000000
1660     }
1661
1662     /**
1663      * Script to Lead Byte and Lead Byte to Script Data
1664      *
1665      */
1666     static final class LeadByteConstants {
1667         private static final int DATA_MASK_FOR_INDEX = 0x8000;
1668         private static final int[] EMPTY_INT_ARRAY = new int[0];
1669
1670         private int serializedSize = 0;
1671
1672         private Map<Integer, Integer> SCRIPT_TO_LEAD_BYTES_INDEX;
1673         private byte[] SCRIPT_TO_LEAD_BYTES_DATA;
1674
1675         private int[] LEAD_BYTE_TO_SCRIPTS_INDEX;
1676         private byte[] LEAD_BYTE_TO_SCRIPTS_DATA;
1677
1678         LeadByteConstants() {
1679         }
1680
1681         void read(DataInputStream dis) throws IOException {
1682             int readcount = 0;
1683             int indexCount;
1684             int dataSize;
1685
1686             // script to lead bytes
1687             indexCount = dis.readShort();
1688             readcount += 2;
1689             dataSize = dis.readShort();
1690             readcount += 2;
1691             this.SCRIPT_TO_LEAD_BYTES_INDEX = new HashMap<Integer, Integer>();
1692             //System.out.println("Script to Lead Bytes Index - Count = " + indexCount);
1693             for (int index = 0; index < indexCount; index++) {
1694                 int reorderCode = dis.readShort(); // reorder code
1695                 readcount += 2;
1696                 int dataOffset = 0xffff & dis.readShort(); // data offset
1697                 readcount += 2;
1698                 //                 System.out.println("\t-------------");
1699                 //                 System.out.println("\toffset = " + Integer.toHexString(readcount - 4));
1700                 //                 System.out.println("\treorderCode = " + Integer.toHexString(reorderCode));
1701                 //                 System.out.println("\tdataOffset = " + Integer.toHexString(dataOffset));
1702                 this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, dataOffset);
1703             }
1704
1705             this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2];
1706             dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA, 0, this.SCRIPT_TO_LEAD_BYTES_DATA.length);
1707             readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length;
1708
1709             // lead byte to scripts
1710             indexCount = dis.readShort();
1711             readcount += 2;
1712             dataSize = dis.readShort();
1713             readcount += 2;
1714             this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount];
1715             //System.out.println("Lead Byte to Scripts Index - Count = " + indexCount);
1716             for (int index = 0; index < indexCount; index++) {
1717                 this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = 0xffff & dis.readShort();
1718                 readcount += 2;
1719                 //                System.out.println("\t-------------");
1720                 //                System.out.println("\toffset = " + Integer.toHexString(readcount - 2));
1721                 //                System.out.println("\tindex = " + Integer.toHexString(index));
1722                 //                System.out.println("\tdataOffset = " + Integer.toHexString(this.LEAD_BYTE_TO_SCRIPTS_INDEX[index]));
1723             }
1724
1725             this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2];
1726             dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA, 0, this.LEAD_BYTE_TO_SCRIPTS_DATA.length);
1727             readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length;
1728
1729             this.serializedSize = readcount;
1730         }
1731
1732         int getSerializedDataSize() {
1733             return this.serializedSize;
1734         }
1735
1736         int[] getReorderCodesForLeadByte(int leadByte) {
1737             if (leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) {
1738                 return EMPTY_INT_ARRAY;
1739             }
1740             int offset = this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte];
1741             if (offset == 0) {
1742                 return EMPTY_INT_ARRAY;
1743             }
1744             int[] reorderCodes;
1745             if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
1746                 reorderCodes = new int[1];
1747                 reorderCodes[0] = offset & ~DATA_MASK_FOR_INDEX;
1748             } else {
1749                 int length = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset);
1750                 offset++;
1751
1752                 reorderCodes = new int[length];
1753                 for (int code = 0; code < length; code++, offset++) {
1754                     reorderCodes[code] = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset);
1755                 }
1756             }
1757             return reorderCodes;
1758         }
1759
1760         int[] getLeadBytesForReorderCode(int reorderCode) {
1761             if (!this.SCRIPT_TO_LEAD_BYTES_INDEX.containsKey(reorderCode)) {
1762                 return EMPTY_INT_ARRAY;
1763             }
1764             int offset = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode);
1765
1766             if (offset == 0) {
1767                 return EMPTY_INT_ARRAY;
1768             }
1769
1770             int[] leadBytes;
1771             if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
1772                 leadBytes = new int[1];
1773                 leadBytes[0] = offset & ~DATA_MASK_FOR_INDEX;
1774             } else {
1775                 int length = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset);
1776                 offset++;
1777
1778                 leadBytes = new int[length];
1779                 for (int leadByte = 0; leadByte < length; leadByte++, offset++) {
1780                     leadBytes[leadByte] = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset);
1781                 }
1782             }
1783             return leadBytes;
1784         }
1785
1786         private static int readShort(byte[] data, int offset) {
1787             return (0xff & data[offset * 2]) << 8 | (data[offset * 2 + 1] & 0xff);
1788         }
1789     }
1790
1791     // package private data member -------------------------------------------
1792
1793     static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04;
1794     static final byte BYTE_COMMON_ = (byte) 0x05;
1795     static final int COMMON_TOP_2_ = 0x86; // int for unsigness
1796     static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
1797     static final int COMMON_BOTTOM_3 = 0x05;
1798     /**
1799      * Case strength mask
1800      */
1801     static final int CE_CASE_BIT_MASK_ = 0xC0;
1802     static final int CE_TAG_SHIFT_ = 24;
1803     static final int CE_TAG_MASK_ = 0x0F000000;
1804
1805     static final int CE_SPECIAL_FLAG_ = 0xF0000000;
1806     /**
1807      * Lead surrogate that is tailored and doesn't start a contraction
1808      */
1809     static final int CE_SURROGATE_TAG_ = 5;
1810     /**
1811      * Mask to get the primary strength of the collation element
1812      */
1813     static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
1814     /**
1815      * Mask to get the secondary strength of the collation element
1816      */
1817     static final int CE_SECONDARY_MASK_ = 0xFF00;
1818     /**
1819      * Mask to get the tertiary strength of the collation element
1820      */
1821     static final int CE_TERTIARY_MASK_ = 0xFF;
1822     /**
1823      * Primary strength shift
1824      */
1825     static final int CE_PRIMARY_SHIFT_ = 16;
1826     /**
1827      * Secondary strength shift
1828      */
1829     static final int CE_SECONDARY_SHIFT_ = 8;
1830     /**
1831      * Continuation marker
1832      */
1833     static final int CE_CONTINUATION_MARKER_ = 0xC0;
1834
1835     /**
1836      * Size of collator raw data headers and options before the expansion data. This is used when expansion ces are to
1837      * be retrieved. ICU4C uses the expansion offset starting from UCollator.UColHeader, hence ICU4J will have to minus
1838      * that off to get the right expansion ce offset. In number of ints.
1839      */
1840     int m_expansionOffset_;
1841     /**
1842      * Size of collator raw data headers, options and expansions before contraction data. This is used when contraction
1843      * ces are to be retrieved. ICU4C uses contraction offset starting from UCollator.UColHeader, hence ICU4J will have
1844      * to minus that off to get the right contraction ce offset. In number of chars.
1845      */
1846     int m_contractionOffset_;
1847     /**
1848      * Flag indicator if Jamo is special
1849      */
1850     boolean m_isJamoSpecial_;
1851
1852     // Collator options ------------------------------------------------------
1853
1854     int m_defaultVariableTopValue_;
1855     boolean m_defaultIsFrenchCollation_;
1856     boolean m_defaultIsAlternateHandlingShifted_;
1857     int m_defaultCaseFirst_;
1858     boolean m_defaultIsCaseLevel_;
1859     int m_defaultDecomposition_;
1860     int m_defaultStrength_;
1861     boolean m_defaultIsHiragana4_;
1862     boolean m_defaultIsNumericCollation_;
1863     /**
1864      * Default script order - the one created at initial rule parse time
1865      */
1866     int[] m_defaultReorderCodes_;
1867
1868     /**
1869      * Value of the variable top
1870      */
1871     int m_variableTopValue_;
1872     /**
1873      * Attribute for special Hiragana
1874      */
1875     boolean m_isHiragana4_;
1876     /**
1877      * Case sorting customization
1878      */
1879     int m_caseFirst_;
1880     /**
1881      * Numeric collation option
1882      */
1883     boolean m_isNumericCollation_;
1884     /**
1885      * Script order
1886      */
1887     int[] m_reorderCodes_;
1888
1889     // end Collator options --------------------------------------------------
1890
1891     /**
1892      * Expansion table
1893      */
1894     int m_expansion_[];
1895     /**
1896      * Contraction index table
1897      */
1898     char m_contractionIndex_[];
1899     /**
1900      * Contraction CE table
1901      */
1902     int m_contractionCE_[];
1903     /**
1904      * Data trie
1905      */
1906     IntTrie m_trie_;
1907     /**
1908      * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch.
1909      */
1910     int m_expansionEndCE_[];
1911     /**
1912      * Table to store the maximum size of any expansions that end with the corresponding collation element in
1913      * m_expansionEndCE_. For use in StringSearch too
1914      */
1915     byte m_expansionEndCEMaxSize_[];
1916     /**
1917      * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are
1918      * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the
1919      * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above,
1920      * then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1921      */
1922     byte m_unsafe_[];
1923     /**
1924      * Table to store information on whether a codepoint can occur as the last character in a contraction
1925      */
1926     byte m_contractionEnd_[];
1927     /**
1928      * Original collation rules
1929      */
1930     String m_rules_;
1931     /**
1932      * The smallest "unsafe" codepoint
1933      */
1934     char m_minUnsafe_;
1935     /**
1936      * The smallest codepoint that could be the end of a contraction
1937      */
1938     char m_minContractionEnd_;
1939     /**
1940      * General version of the collator
1941      */
1942     VersionInfo m_version_;
1943     /**
1944      * UCA version
1945      */
1946     VersionInfo m_UCA_version_;
1947     /**
1948      * UCD version
1949      */
1950     VersionInfo m_UCD_version_;
1951     /**
1952      * Lead byte and script data
1953      */
1954     int m_leadByteToScripts;
1955     int m_scriptToLeadBytes;
1956     /**
1957      * UnicodeData.txt property object
1958      */
1959     static final RuleBasedCollator UCA_;
1960     /**
1961      * UCA Constants
1962      */
1963     static final UCAConstants UCA_CONSTANTS_;
1964     /**
1965      * Lead Byte Constants
1966      */
1967     static LeadByteConstants LEADBYTE_CONSTANTS_;
1968     /**
1969      * Table for UCA and builder use
1970      */
1971     static final char UCA_CONTRACTIONS_[];
1972     static final int MAX_UCA_CONTRACTION_LENGTH;
1973
1974     private static boolean UCA_INIT_COMPLETE;
1975
1976     /**
1977      * Implicit generator
1978      */
1979     static final ImplicitCEGenerator impCEGen_;
1980
1981     static final byte SORT_LEVEL_TERMINATOR_ = 1;
1982
1983     // These are values from UCA required for
1984     // implicit generation and supressing sort key compression
1985     // they should regularly be in the UCA, but if one
1986     // is running without UCA, it could be a problem
1987     static final int maxRegularPrimary = 0x7A;
1988     static final int minImplicitPrimary = 0xE0;
1989     static final int maxImplicitPrimary = 0xE4;
1990
1991     // block to initialise character property database
1992     static {
1993         // take pains to let static class init succeed, otherwise the class itself won't exist and
1994         // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
1995         // we can't load the UCA data.
1996
1997         RuleBasedCollator iUCA_ = null;
1998         UCAConstants iUCA_CONSTANTS_ = null;
1999         LeadByteConstants iLEADBYTE_CONSTANTS = null;
2000         char iUCA_CONTRACTIONS_[] = null;
2001         Output<Integer> maxUCAContractionLength = new Output<Integer>();
2002         ImplicitCEGenerator iimpCEGen_ = null;
2003         try {
2004             // !!! note what's going on here...
2005             // even though the static init of the class is not yet complete, we
2006             // instantiate an instance of the class. So we'd better be sure that
2007             // instantiation doesn't rely on the static initialization that's
2008             // not complete yet!
2009             iUCA_ = new RuleBasedCollator();
2010             iUCA_CONSTANTS_ = new UCAConstants();
2011             iLEADBYTE_CONSTANTS = new LeadByteConstants();
2012             iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS, maxUCAContractionLength);
2013
2014             // called before doing canonical closure for the UCA.
2015             iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
2016             // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_,
2017             // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
2018             iUCA_.init();
2019             ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
2020                     ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
2021             iUCA_.m_rules_ = (String) rb.getObject("UCARules");
2022         } catch (MissingResourceException ex) {
2023             // throw ex;
2024         } catch (IOException e) {
2025             // e.printStackTrace();
2026             // throw new MissingResourceException(e.getMessage(),"","");
2027         }
2028
2029         UCA_ = iUCA_;
2030         UCA_CONSTANTS_ = iUCA_CONSTANTS_;
2031         LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS;
2032         UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
2033         MAX_UCA_CONTRACTION_LENGTH = maxUCAContractionLength.value;
2034         impCEGen_ = iimpCEGen_;
2035
2036         UCA_INIT_COMPLETE = true;
2037     }
2038
2039     private static void checkUCA() throws MissingResourceException {
2040         if (UCA_INIT_COMPLETE && UCA_ == null) {
2041             throw new MissingResourceException("Collator UCA data unavailable", "", "");
2042         }
2043     }
2044
2045     // package private constructors ------------------------------------------
2046
2047     /**
2048      * <p>
2049      * Private contructor for use by subclasses. Public access to creating Collators is handled by the API
2050      * Collator.getInstance() or RuleBasedCollator(String rules).
2051      * </p>
2052      * <p>
2053      * This constructor constructs the UCA collator internally
2054      * </p>
2055      */
2056     RuleBasedCollator() {
2057         checkUCA();
2058     }
2059
2060     /**
2061      * Constructs a RuleBasedCollator from the argument locale.
2062      * If no resource bundle is associated with the locale, UCA is used instead.
2063      *
2064      * @param locale
2065      */
2066     RuleBasedCollator(ULocale locale) {
2067         checkUCA();
2068         try {
2069             ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
2070                     ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
2071             if (rb != null) {
2072                 ICUResourceBundle elements = null;
2073
2074                 // Use keywords, if supplied for lookup
2075                 String collkey = locale.getKeywordValue("collation");
2076                 if (collkey != null) {
2077                     try {
2078                         elements = rb.getWithFallback("collations/" + collkey);
2079                     } catch (MissingResourceException e) {
2080                         // fall through
2081                     }
2082                 }
2083                 if (elements == null) {
2084                     // either collation keyword was not supplied or
2085                     // the keyword was invalid - use default collation for the locale
2086
2087                     // collations/default should always give a string back
2088                     // keyword for the real collation data
2089                     collkey = rb.getStringWithFallback("collations/default");
2090                     elements = rb.getWithFallback("collations/" + collkey);
2091                 }
2092
2093                 // TODO: Determine actual & valid locale correctly
2094                 ULocale uloc = rb.getULocale();
2095                 setLocale(uloc, uloc);
2096
2097                 m_rules_ = elements.getString("Sequence");
2098                 ByteBuffer buf = elements.get("%%CollationBin").getBinary();
2099                 // %%CollationBin
2100                 if (buf != null) {
2101                     // m_rules_ = (String)rules[1][1];
2102                     CollatorReader.initRBC(this, buf);
2103                     /*
2104                      * BufferedInputStream input = new BufferedInputStream( new ByteArrayInputStream(map)); /*
2105                      * CollatorReader reader = new CollatorReader(input, false); if (map.length >
2106                      * MIN_BINARY_DATA_SIZE_) { reader.read(this, null); } else { reader.readHeader(this);
2107                      * reader.readOptions(this); // duplicating UCA_'s data setWithUCATables(); }
2108                      */
2109                     // at this point, we have read in the collator
2110                     // now we need to check whether the binary image has
2111                     // the right UCA and other versions
2112                     if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
2113                         init(m_rules_);
2114                         return;
2115                     }
2116                     init();
2117                     try {
2118                         UResourceBundle reorderRes = elements.get("%%ReorderCodes");
2119                         if (reorderRes != null) {
2120                             int[] reorderCodes = reorderRes.getIntVector();
2121                             setReorderCodes(reorderCodes);
2122                             m_defaultReorderCodes_ = reorderCodes.clone();
2123                         }
2124                     } catch (MissingResourceException e) {
2125                         // ignore
2126                     }
2127                     return;
2128                 } else {
2129                     init(m_rules_);
2130                     return;
2131                 }
2132             }
2133         } catch (Exception e) {
2134             // fallthrough
2135         }
2136         setWithUCAData();
2137     }
2138
2139     // package private methods -----------------------------------------------
2140
2141     /**
2142      * Sets this collator to use the tables in UCA. Note options not taken care of here.
2143      */
2144     final void setWithUCATables() {
2145         m_contractionOffset_ = UCA_.m_contractionOffset_;
2146         m_expansionOffset_ = UCA_.m_expansionOffset_;
2147         m_expansion_ = UCA_.m_expansion_;
2148         m_contractionIndex_ = UCA_.m_contractionIndex_;
2149         m_contractionCE_ = UCA_.m_contractionCE_;
2150         m_trie_ = UCA_.m_trie_;
2151         m_expansionEndCE_ = UCA_.m_expansionEndCE_;
2152         m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
2153         m_unsafe_ = UCA_.m_unsafe_;
2154         m_contractionEnd_ = UCA_.m_contractionEnd_;
2155         m_minUnsafe_ = UCA_.m_minUnsafe_;
2156         m_minContractionEnd_ = UCA_.m_minContractionEnd_;
2157     }
2158
2159     /**
2160      * Sets this collator to use the all options and tables in UCA.
2161      */
2162     final void setWithUCAData() {
2163         latinOneFailed_ = true;
2164
2165         m_addition3_ = UCA_.m_addition3_;
2166         m_bottom3_ = UCA_.m_bottom3_;
2167         m_bottomCount3_ = UCA_.m_bottomCount3_;
2168         m_caseFirst_ = UCA_.m_caseFirst_;
2169         m_caseSwitch_ = UCA_.m_caseSwitch_;
2170         m_common3_ = UCA_.m_common3_;
2171         m_contractionOffset_ = UCA_.m_contractionOffset_;
2172         setDecomposition(UCA_.getDecomposition());
2173         m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
2174         m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
2175         m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_;
2176         m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
2177         m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
2178         m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
2179         m_defaultStrength_ = UCA_.m_defaultStrength_;
2180         m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
2181         m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
2182         m_expansionOffset_ = UCA_.m_expansionOffset_;
2183         m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
2184         m_isCaseLevel_ = UCA_.m_isCaseLevel_;
2185         m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
2186         m_isHiragana4_ = UCA_.m_isHiragana4_;
2187         m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
2188         m_isSimple3_ = UCA_.m_isSimple3_;
2189         m_mask3_ = UCA_.m_mask3_;
2190         m_minContractionEnd_ = UCA_.m_minContractionEnd_;
2191         m_minUnsafe_ = UCA_.m_minUnsafe_;
2192         m_rules_ = UCA_.m_rules_;
2193         setStrength(UCA_.getStrength());
2194         m_top3_ = UCA_.m_top3_;
2195         m_topCount3_ = UCA_.m_topCount3_;
2196         m_variableTopValue_ = UCA_.m_variableTopValue_;
2197         m_isNumericCollation_ = UCA_.m_isNumericCollation_;
2198         setWithUCATables();
2199         latinOneFailed_ = false;
2200     }
2201
2202     /**
2203      * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters
2204      * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is
2205      * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one
2206      * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
2207      *
2208      * @param ch
2209      *            character to determin
2210      * @return true if ch is unsafe, false otherwise
2211      */
2212     final boolean isUnsafe(char ch) {
2213         if (ch < m_minUnsafe_) {
2214             return false;
2215         }
2216
2217         if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
2218             if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) {
2219                 // Trail surrogate are always considered unsafe.
2220                 return true;
2221             }
2222             ch &= HEURISTIC_OVERFLOW_MASK_;
2223             ch += HEURISTIC_OVERFLOW_OFFSET_;
2224         }
2225         int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
2226         return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
2227     }
2228
2229     /**
2230      * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at
2231      * the end of a contraction, otherwise it is not deterministic.
2232      *
2233      * @param ch
2234      *            character to be determined
2235      */
2236     final boolean isContractionEnd(char ch) {
2237         if (UTF16.isTrailSurrogate(ch)) {
2238             return true;
2239         }
2240
2241         if (ch < m_minContractionEnd_) {
2242             return false;
2243         }
2244
2245         if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
2246             ch &= HEURISTIC_OVERFLOW_MASK_;
2247             ch += HEURISTIC_OVERFLOW_OFFSET_;
2248         }
2249         int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
2250         return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
2251     }
2252
2253     /**
2254      * Retrieve the tag of a special ce
2255      *
2256      * @param ce
2257      *            ce to test
2258      * @return tag of ce
2259      */
2260     static int getTag(int ce) {
2261         return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
2262     }
2263
2264     /**
2265      * Checking if ce is special
2266      *
2267      * @param ce
2268      *            to check
2269      * @return true if ce is special
2270      */
2271     static boolean isSpecial(int ce) {
2272         return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
2273     }
2274
2275     /**
2276      * Checks if the argument ce is a continuation
2277      *
2278      * @param ce
2279      *            collation element to test
2280      * @return true if ce is a continuation
2281      */
2282     static final boolean isContinuation(int ce) {
2283         return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
2284     }
2285
2286     // private inner classes ------------------------------------------------
2287
2288     // private variables -----------------------------------------------------
2289
2290     /**
2291      * The smallest natural unsafe or contraction end char character before tailoring. This is a combining mark.
2292      */
2293     private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
2294     /**
2295      * Heuristic table table size. Size is 32 bytes, 1 bit for each latin 1 char, and some power of two for hashing the
2296      * rest of the chars. Size in bytes.
2297      */
2298     private static final char HEURISTIC_SIZE_ = 1056;
2299     /**
2300      * Mask value down to "some power of two" - 1, number of bits, not num of bytes.
2301      */
2302     private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
2303     /**
2304      * Unsafe character shift
2305      */
2306     private static final int HEURISTIC_SHIFT_ = 3;
2307     /**
2308      * Unsafe character addition for character too large, it has to be folded then incremented.
2309      */
2310     private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
2311     /**
2312      * Mask value to get offset in heuristic table.
2313      */
2314     private static final char HEURISTIC_MASK_ = 7;
2315
2316     private int m_caseSwitch_;
2317     private int m_common3_;
2318     private int m_mask3_;
2319     /**
2320      * When switching case, we need to add or subtract different values.
2321      */
2322     private int m_addition3_;
2323     /**
2324      * Upper range when compressing
2325      */
2326     private int m_top3_;
2327     /**
2328      * Upper range when compressing
2329      */
2330     private int m_bottom3_;
2331     private int m_topCount3_;
2332     private int m_bottomCount3_;
2333     /**
2334      * Script reordering table
2335      */
2336     private byte[] m_leadBytePermutationTable_;
2337     /**
2338      * Case first constants
2339      */
2340     private static final int CASE_SWITCH_ = 0xC0;
2341     private static final int NO_CASE_SWITCH_ = 0;
2342     /**
2343      * Case level constants
2344      */
2345     private static final int CE_REMOVE_CASE_ = 0x3F;
2346     private static final int CE_KEEP_CASE_ = 0xFF;
2347     /**
2348      * Case strength mask
2349      */
2350     private static final int CE_CASE_MASK_3_ = 0xFF;
2351     /**
2352      * Sortkey size factor. Values can be changed.
2353      */
2354     private static final double PROPORTION_2_ = 0.5;
2355     private static final double PROPORTION_3_ = 0.667;
2356
2357     // These values come from the UCA ----------------------------------------
2358
2359     /**
2360      * This is an enum that lists magic special byte values from the fractional UCA
2361      */
2362     // private static final byte BYTE_ZERO_ = 0x0;
2363     // private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
2364     // private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
2365     private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
2366     /* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
2367     // private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
2368     // TODO: Make the following values dynamic since they change with almost every UCA version.
2369     static final byte CODAN_PLACEHOLDER = 0x12;
2370     private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B;
2371
2372     private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;
2373     private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
2374     private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
2375     private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
2376     private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
2377     private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
2378     private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
2379     private static final int COMMON_BOTTOM_3_ = 0x05;
2380     private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
2381     private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
2382     private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
2383     private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
2384     private static final int COMMON_2_ = COMMON_BOTTOM_2_;
2385     private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
2386     private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
2387     // private static final int COMMON_4_ = (byte)0xFF;
2388
2389     /*
2390      * Minimum size required for the binary collation data in bytes. Size of UCA header + size of options to 4 bytes
2391      */
2392     // private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
2393
2394     /**
2395      * If this collator is to generate only simple tertiaries for fast path
2396      */
2397     private boolean m_isSimple3_;
2398
2399     /**
2400      * French collation sorting flag
2401      */
2402     private boolean m_isFrenchCollation_;
2403     /**
2404      * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for
2405      * alternate handling will be non-ignorable.
2406      */
2407     private boolean m_isAlternateHandlingShifted_;
2408     /**
2409      * Extra case level for sorting
2410      */
2411     private boolean m_isCaseLevel_;
2412     /**
2413      * Frozen state of the collator.
2414      */
2415     private Lock frozenLock;
2416
2417
2418     private static final int SORT_BUFFER_INIT_SIZE_ = 128;
2419     private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
2420     private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
2421     private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
2422     private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
2423     private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
2424
2425     private static final int CE_CONTINUATION_TAG_ = 0xC0;
2426     private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
2427
2428     private static final int LAST_BYTE_MASK_ = 0xFF;
2429
2430     // private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
2431     // private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
2432
2433     private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
2434     private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;
2435
2436     /**
2437      * CE buffer size
2438      */
2439     private static final int CE_BUFFER_SIZE_ = 512;
2440
2441     // variables for Latin-1 processing
2442     boolean latinOneUse_ = false;
2443     boolean latinOneRegenTable_ = false;
2444     boolean latinOneFailed_ = false;
2445
2446     int latinOneTableLen_ = 0;
2447     int latinOneCEs_[] = null;
2448
2449     private final class CollationBuffer {
2450         /**
2451          * Bunch of utility iterators
2452          */
2453         protected StringUCharacterIterator m_srcUtilIter_;
2454         protected CollationElementIterator m_srcUtilColEIter_;
2455         protected StringUCharacterIterator m_tgtUtilIter_;
2456         protected CollationElementIterator m_tgtUtilColEIter_;
2457
2458         /**
2459          * Utility comparison flags
2460          */
2461         protected boolean m_utilCompare0_;
2462         // private boolean m_utilCompare1_;
2463         protected boolean m_utilCompare2_;
2464         protected boolean m_utilCompare3_;
2465         protected boolean m_utilCompare4_;
2466         protected boolean m_utilCompare5_;
2467
2468         /**
2469          * Utility byte buffer
2470          */
2471         protected byte m_utilBytes0_[];
2472         protected byte m_utilBytes1_[];
2473         protected byte m_utilBytes2_[];
2474         protected byte m_utilBytes3_[];
2475         protected byte m_utilBytes4_[];
2476         // private byte m_utilBytes5_[];
2477
2478         protected RawCollationKey m_utilRawCollationKey_;
2479
2480         protected int m_utilBytesCount0_;
2481         protected int m_utilBytesCount1_;
2482         protected int m_utilBytesCount2_;
2483         protected int m_utilBytesCount3_;
2484         protected int m_utilBytesCount4_;
2485         // private int m_utilBytesCount5_;
2486
2487         // private int m_utilCount0_;
2488         // private int m_utilCount1_;
2489         protected int m_utilCount2_;
2490         protected int m_utilCount3_;
2491         protected int m_utilCount4_;
2492         // private int m_utilCount5_;
2493
2494         protected int m_utilFrenchStart_;
2495         protected int m_utilFrenchEnd_;
2496
2497         /**
2498          * Preparing the CE buffers. will be filled during the primary phase
2499          */
2500         protected int m_srcUtilCEBuffer_[];
2501         protected int m_tgtUtilCEBuffer_[];
2502         protected int m_srcUtilCEBufferSize_;
2503         protected int m_tgtUtilCEBufferSize_;
2504
2505         protected int m_srcUtilContOffset_;
2506         protected int m_tgtUtilContOffset_;
2507
2508         protected int m_srcUtilOffset_;
2509         protected int m_tgtUtilOffset_;
2510
2511         private CollationBuffer() {
2512             initBuffers();
2513         }
2514
2515         /**
2516          * Initializes utility iterators and byte buffer used by compare
2517          */
2518         protected final void initBuffers() {
2519             resetBuffers();
2520             m_srcUtilIter_ = new StringUCharacterIterator();
2521             m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, RuleBasedCollator.this);
2522             m_tgtUtilIter_ = new StringUCharacterIterator();
2523             m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, RuleBasedCollator.this);
2524             m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
2525             m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
2526             m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
2527             m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
2528             m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
2529             m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
2530             m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
2531         }
2532
2533         protected final void resetBuffers() {
2534             m_utilCompare0_ = false;
2535             // private boolean m_utilCompare1_;
2536             m_utilCompare2_ = false;
2537             m_utilCompare3_ = false;
2538             m_utilCompare4_ = false;
2539             m_utilCompare5_ = false;
2540
2541             m_utilBytesCount0_ = 0;
2542             m_utilBytesCount1_ = 0;
2543             m_utilBytesCount2_ = 0;
2544             m_utilBytesCount3_ = 0;
2545             m_utilBytesCount4_ = 0;
2546             // private int m_utilBytesCount5_;
2547
2548             m_utilCount2_ = 0;
2549             m_utilCount3_ = 0;
2550             m_utilCount4_ = 0;
2551
2552             m_utilFrenchStart_ = 0;
2553             m_utilFrenchEnd_ = 0;
2554
2555             m_srcUtilContOffset_ = 0;
2556             m_tgtUtilContOffset_ = 0;
2557
2558             m_srcUtilOffset_ = 0;
2559             m_tgtUtilOffset_ = 0;
2560         }
2561     }
2562
2563     // private methods -------------------------------------------------------
2564
2565     private void init(String rules) throws Exception {
2566         setWithUCAData();
2567         CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(rules);
2568         builder.setRules(this);
2569         m_rules_ = rules;
2570         init();
2571         buildPermutationTable();
2572     }
2573
2574     private final int compareRegular(String source, String target, int offset, CollationBuffer buffer) {
2575         buffer.resetBuffers();
2576
2577         int strength = getStrength();
2578         // setting up the collator parameters
2579         buffer.m_utilCompare0_ = m_isCaseLevel_;
2580         // m_utilCompare1_ = true;
2581         buffer.m_utilCompare2_ = strength >= SECONDARY;
2582         buffer.m_utilCompare3_ = strength >= TERTIARY;
2583         buffer.m_utilCompare4_ = strength >= QUATERNARY;
2584         buffer.m_utilCompare5_ = strength == IDENTICAL;
2585         boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
2586         boolean doShift4 = m_isAlternateHandlingShifted_ && buffer.m_utilCompare4_;
2587         boolean doHiragana4 = m_isHiragana4_ && buffer.m_utilCompare4_;
2588
2589         if (doHiragana4 && doShift4) {
2590             String sourcesub = source.substring(offset);
2591             String targetsub = target.substring(offset);
2592             return compareBySortKeys(sourcesub, targetsub, buffer);
2593         }
2594
2595         // This is the lowest primary value that will not be ignored if shifted
2596         int lowestpvalue = m_isAlternateHandlingShifted_ ? m_variableTopValue_ << 16 : 0;
2597         buffer.m_srcUtilCEBufferSize_ = 0;
2598         buffer.m_tgtUtilCEBufferSize_ = 0;
2599         int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, target, offset, buffer);
2600         if (buffer.m_srcUtilCEBufferSize_ == -1 && buffer.m_tgtUtilCEBufferSize_ == -1) {
2601             // since the cebuffer is cleared when we have determined that
2602             // either source is greater than target or vice versa, the return
2603             // result is the comparison result and not the hiragana result
2604             return result;
2605         }
2606
2607         int hiraganaresult = result;
2608
2609         if (buffer.m_utilCompare2_) {
2610             result = doSecondaryCompare(doFrench, buffer);
2611             if (result != 0) {
2612                 return result;
2613             }
2614         }
2615         // doing the case bit
2616         if (buffer.m_utilCompare0_) {
2617             result = doCaseCompare(buffer);
2618             if (result != 0) {
2619                 return result;
2620             }
2621         }
2622         // Tertiary level
2623         if (buffer.m_utilCompare3_) {
2624             result = doTertiaryCompare(buffer);
2625             if (result != 0) {
2626                 return result;
2627             }
2628         }
2629
2630         if (doShift4) { // checkQuad
2631             result = doQuaternaryCompare(lowestpvalue, buffer);
2632             if (result != 0) {
2633                 return result;
2634             }
2635         } else if (doHiragana4 && hiraganaresult != 0) {
2636             // If we're fine on quaternaries, we might be different
2637             // on Hiragana. This, however, might fail us in shifted.
2638             return hiraganaresult;
2639         }
2640
2641         // For IDENTICAL comparisons, we use a bitwise character comparison
2642         // as a tiebreaker if all else is equal.
2643         // Getting here should be quite rare - strings are not identical -
2644         // that is checked first, but compared == through all other checks.
2645         if (buffer.m_utilCompare5_) {
2646             return doIdenticalCompare(source, target, offset, true);
2647         }
2648         return 0;
2649     }
2650
2651     // Is this primary weight compressible?
2652     // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
2653     // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
2654     static boolean isCompressible(int primary1) {
2655         return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary;
2656     }
2657
2658     /**
2659      * Gets the 2 bytes of primary order and adds it to the primary byte array
2660      *
2661      * @param ce
2662      *            current ce
2663      * @param notIsContinuation
2664      *            flag indicating if the current bytes belong to a continuation ce
2665      * @param doShift
2666      *            flag indicating if ce is to be shifted
2667      * @param leadPrimary
2668      *            lead primary used for compression
2669      * @param commonBottom4
2670      *            common byte value for Quaternary
2671      * @param bottomCount4
2672      *            smallest byte value for Quaternary
2673      * @return the new lead primary for compression
2674      */
2675     private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary,
2676             int commonBottom4, int bottomCount4, CollationBuffer buffer) {
2677
2678         int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
2679         int p1 = ce >>> 8; // comparison
2680         int originalP1 = p1;
2681         if (notIsContinuation) {
2682             if (m_leadBytePermutationTable_ != null) {
2683                 p1 = 0xff & m_leadBytePermutationTable_[p1];
2684             }
2685         }
2686
2687         if (doShift) {
2688             if (buffer.m_utilCount4_ > 0) {
2689                 while (buffer.m_utilCount4_ > bottomCount4) {
2690                     buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
2691                     buffer.m_utilBytesCount4_++;
2692                     buffer.m_utilCount4_ -= bottomCount4;
2693                 }
2694                 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
2695                 buffer.m_utilBytesCount4_++;
2696                 buffer.m_utilCount4_ = 0;
2697             }
2698             // dealing with a variable and we're treating them as shifted
2699             // This is a shifted ignorable
2700             if (p1 != 0) {
2701                 // we need to check this since we could be in continuation
2702                 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p1);
2703                 buffer.m_utilBytesCount4_++;
2704             }
2705             if (p2 != 0) {
2706                 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p2);
2707                 buffer.m_utilBytesCount4_++;
2708             }
2709         } else {
2710             // Note: This code assumes that the table is well built
2711             // i.e. not having 0 bytes where they are not supposed to be.
2712             // Usually, we'll have non-zero primary1 & primary2, except
2713             // in cases of LatinOne and friends, when primary2 will be
2714             // regular and simple sortkey calc
2715             if (p1 != CollationElementIterator.IGNORABLE) {
2716                 if (notIsContinuation) {
2717                     if (leadPrimary == p1) {
2718                         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2719                         buffer.m_utilBytesCount1_++;
2720                     } else {
2721                         if (leadPrimary != 0) {
2722                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
2723                                     ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_));
2724                             buffer.m_utilBytesCount1_++;
2725                         }
2726                         if (p2 == CollationElementIterator.IGNORABLE) {
2727                             // one byter, not compressed
2728                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2729                             buffer.m_utilBytesCount1_++;
2730                             leadPrimary = 0;
2731                         } else if (isCompressible(originalP1)) {
2732                             // compress
2733                             leadPrimary = p1;
2734                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2735                             buffer.m_utilBytesCount1_++;
2736                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2737                             buffer.m_utilBytesCount1_++;
2738                         } else {
2739                             leadPrimary = 0;
2740                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2741                             buffer.m_utilBytesCount1_++;
2742                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2743                             buffer.m_utilBytesCount1_++;
2744                         }
2745                     }
2746                 } else {
2747                     // continuation, add primary to the key, no compression
2748                     buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2749                     buffer.m_utilBytesCount1_++;
2750                     if (p2 != CollationElementIterator.IGNORABLE) {
2751                         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2752                         // second part
2753                         buffer.m_utilBytesCount1_++;
2754                     }
2755                 }
2756             }
2757         }
2758         return leadPrimary;
2759     }
2760
2761     /**
2762      * Gets the secondary byte and adds it to the secondary byte array
2763      *
2764      * @param ce current ce
2765      * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
2766      * @param doFrench flag indicator if french sort is to be performed
2767      * @param buffer collation buffer temporary state
2768      */
2769     private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench, CollationBuffer buffer) {
2770         int s = (ce >> 8) & LAST_BYTE_MASK_; // int for comparison
2771         if (s != 0) {
2772             if (!doFrench) {
2773                 // This is compression code.
2774                 if (s == COMMON_2_ && notIsContinuation) {
2775                     buffer.m_utilCount2_++;
2776                 } else {
2777                     if (buffer.m_utilCount2_ > 0) {
2778                         if (s > COMMON_2_) { // not necessary for 4th level.
2779                             while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
2780                                 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2781                                         (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
2782                                 buffer.m_utilBytesCount2_++;
2783                                 buffer.m_utilCount2_ -= TOP_COUNT_2_;
2784                             }
2785                             buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2786                                     (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
2787                             buffer.m_utilBytesCount2_++;
2788                         } else {
2789                             while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
2790                                 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2791                                         (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2792                                 buffer.m_utilBytesCount2_++;
2793                                 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
2794                             }
2795                             buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2796                                     (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
2797                             buffer.m_utilBytesCount2_++;
2798                         }
2799                         buffer.m_utilCount2_ = 0;
2800                     }
2801                     buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
2802                     buffer.m_utilBytesCount2_++;
2803                 }
2804             } else {
2805                 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
2806                 buffer.m_utilBytesCount2_++;
2807                 // Do the special handling for French secondaries
2808                 // We need to get continuation elements and do intermediate
2809                 // restore
2810                 // abc1c2c3de with french secondaries need to be edc1c2c3ba
2811                 // NOT edc3c2c1ba
2812                 if (notIsContinuation) {
2813                     if (buffer.m_utilFrenchStart_ != -1) {
2814                         // reverse secondaries from frenchStartPtr up to
2815                         // frenchEndPtr
2816                         reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
2817                         buffer.m_utilFrenchStart_ = -1;
2818                     }
2819                 } else {
2820                     if (buffer.m_utilFrenchStart_ == -1) {
2821                         buffer.m_utilFrenchStart_ = buffer.m_utilBytesCount2_ - 2;
2822                     }
2823                     buffer.m_utilFrenchEnd_ = buffer.m_utilBytesCount2_ - 1;
2824                 }
2825             }
2826         }
2827     }
2828
2829     /**
2830      * Reverse the argument buffer
2831      *
2832      * @param buffer to reverse
2833      * @param start index in buffer to start from
2834      * @param end index in buffer to end at
2835      */
2836     private static void reverseBuffer(byte buffer[], int start, int end) {
2837         while (start < end) {
2838             byte b = buffer[start];
2839             buffer[start++] = buffer[end];
2840             buffer[end--] = b;
2841         }
2842     }
2843
2844     /**
2845      * Insert the case shifting byte if required
2846      *
2847      * @param caseshift value
2848      * @return new caseshift value
2849      */
2850     private final int doCaseShift(int caseshift, CollationBuffer buffer) {
2851         if (caseshift == 0) {
2852             buffer.m_utilBytes0_ = append(buffer.m_utilBytes0_, buffer.m_utilBytesCount0_, SORT_CASE_BYTE_START_);
2853             buffer.m_utilBytesCount0_++;
2854             caseshift = SORT_CASE_SHIFT_START_;
2855         }
2856         return caseshift;
2857     }
2858
2859     /**
2860      * Performs the casing sort
2861      *
2862      * @param tertiary byte in ints for easy comparison
2863      * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
2864      * @param caseshift
2865      * @param buffer collation buffer temporary state
2866      * @return the new value of case shift
2867      */
2868     private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift, CollationBuffer buffer) {
2869         caseshift = doCaseShift(caseshift, buffer);
2870
2871         if (notIsContinuation && tertiary != 0) {
2872             byte casebits = (byte) (tertiary & 0xC0);
2873             if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
2874                 if (casebits == 0) {
2875                     buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= (1 << (--caseshift));
2876                 } else {
2877                     // second bit
2878                     caseshift = doCaseShift(caseshift - 1, buffer);
2879                     buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift);
2880                 }
2881             } else {
2882                 if (casebits != 0) {
2883                     buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= 1 << (--caseshift);
2884                     // second bit
2885                     caseshift = doCaseShift(caseshift, buffer);
2886                     buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift);
2887                 } else {
2888                     caseshift--;
2889                 }
2890             }
2891         }
2892
2893         return caseshift;
2894     }
2895
2896     /**
2897      * Gets the tertiary byte and adds it to the tertiary byte array
2898      *
2899      * @param tertiary byte in int for easy comparison
2900      * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
2901      * @param buffer collation buffer temporary state
2902      */
2903     private final void doTertiaryBytes(int tertiary, boolean notIsContinuation, CollationBuffer buffer) {
2904         if (tertiary != 0) {
2905             // This is compression code.
2906             // sequence size check is included in the if clause
2907             if (tertiary == m_common3_ && notIsContinuation) {
2908                 buffer.m_utilCount3_++;
2909             } else {
2910                 int common3 = m_common3_ & LAST_BYTE_MASK_;
2911                 if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
2912                     tertiary += m_addition3_;
2913                 } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) {
2914                     tertiary -= m_addition3_;
2915                 }
2916                 if (buffer.m_utilCount3_ > 0) {
2917                     if (tertiary > common3) {
2918                         while (buffer.m_utilCount3_ > m_topCount3_) {
2919                             buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
2920                             buffer.m_utilBytesCount3_++;
2921                             buffer.m_utilCount3_ -= m_topCount3_;
2922                         }
2923                         buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
2924                                 (byte) (m_top3_ - (buffer.m_utilCount3_ - 1)));
2925                         buffer.m_utilBytesCount3_++;
2926                     } else {
2927                         while (buffer.m_utilCount3_ > m_bottomCount3_) {
2928                             buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
2929                                     (byte) (m_bottom3_ + m_bottomCount3_));
2930                             buffer.m_utilBytesCount3_++;
2931                             buffer.m_utilCount3_ -= m_bottomCount3_;
2932                         }
2933                         buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
2934                                 (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
2935                         buffer.m_utilBytesCount3_++;
2936                     }
2937                     buffer.m_utilCount3_ = 0;
2938                 }
2939                 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) tertiary);
2940                 buffer.m_utilBytesCount3_++;
2941             }
2942         }
2943     }
2944
2945     /**
2946      * Gets the Quaternary byte and adds it to the Quaternary byte array
2947      *
2948      * @param isCodePointHiragana flag indicator if the previous codepoint we dealt with was Hiragana
2949      * @param commonBottom4 smallest common Quaternary byte
2950      * @param bottomCount4 smallest Quaternary byte
2951      * @param hiragana4 hiragana Quaternary byte
2952      * @param buffer collation buffer temporary state
2953      */
2954     private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4,
2955             byte hiragana4, CollationBuffer buffer) {
2956         if (isCodePointHiragana) { // This was Hiragana, need to note it
2957             if (buffer.m_utilCount4_ > 0) { // Close this part
2958                 while (buffer.m_utilCount4_ > bottomCount4) {
2959                     buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
2960                     buffer.m_utilBytesCount4_++;
2961                     buffer.m_utilCount4_ -= bottomCount4;
2962                 }
2963                 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
2964                 buffer.m_utilBytesCount4_++;
2965                 buffer.m_utilCount4_ = 0;
2966             }
2967             buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, hiragana4); // Add the Hiragana
2968             buffer.m_utilBytesCount4_++;
2969         } else { // This wasn't Hiragana, so we can continue adding stuff
2970             buffer.m_utilCount4_++;
2971         }
2972     }
2973
2974     /**
2975      * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc.
2976      *
2977      * @param source normalized string
2978      * @param doFrench flag indicator if special handling of French has to be done
2979      * @param hiragana4 offset for Hiragana quaternary
2980      * @param commonBottom4 smallest common quaternary byte
2981      * @param bottomCount4 smallest quaternary byte
2982      * @param buffer collation buffer temporary state
2983      */
2984     private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4,
2985             int bottomCount4, CollationBuffer buffer)
2986
2987     {
2988         int backupDecomposition = getDecomposition();
2989         // TODO- hack fix around frozen state - stop self-modification
2990         internalSetDecomposition(NO_DECOMPOSITION); // have to revert to backup later
2991         buffer.m_srcUtilIter_.setText(source);
2992         buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_);
2993         buffer.m_utilFrenchStart_ = -1;
2994         buffer.m_utilFrenchEnd_ = -1;
2995
2996         boolean doShift = false;
2997         boolean notIsContinuation = false;
2998
2999         int leadPrimary = 0; // int for easier comparison
3000         int caseShift = 0;
3001
3002         while (true) {
3003             int ce = buffer.m_srcUtilColEIter_.next();
3004             if (ce == CollationElementIterator.NULLORDER) {
3005                 break;
3006             }
3007
3008             if (ce == CollationElementIterator.IGNORABLE) {
3009                 continue;
3010             }
3011
3012             notIsContinuation = !isContinuation(ce);
3013
3014             boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
3015             // actually we can just check that the first byte is 0
3016             // generation stuffs the order left first
3017             boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_;
3018             doShift = (m_isAlternateHandlingShifted_
3019                     && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0
3020                             || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable));
3021             if (doShift && isPrimaryByteIgnorable) {
3022                 // amendment to the UCA says that primary ignorables and other
3023                 // ignorables should be removed if following a shifted code
3024                 // point
3025                 // if we were shifted and we got an ignorable code point
3026                 // we should just completely ignore it
3027                 continue;
3028             }
3029             leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4, buffer);
3030
3031             if (doShift) {
3032                 continue;
3033             }
3034             if (buffer.m_utilCompare2_) {
3035                 doSecondaryBytes(ce, notIsContinuation, doFrench, buffer);
3036             }
3037
3038             int t = ce & LAST_BYTE_MASK_;
3039             if (!notIsContinuation) {
3040                 t = ce & CE_REMOVE_CONTINUATION_MASK_;
3041             }
3042
3043             if (buffer.m_utilCompare0_ && (!isPrimaryByteIgnorable || buffer.m_utilCompare2_)) {
3044                 // do the case level if we need to do it. We don't want to calculate
3045                 // case level for primary ignorables if we have only primary strength and case level
3046                 // otherwise we would break well formedness of CEs
3047                 caseShift = doCaseBytes(t, notIsContinuation, caseShift, buffer);
3048             } else if (notIsContinuation) {
3049                 t ^= m_caseSwitch_;
3050             }
3051
3052             t &= m_mask3_;
3053
3054             if (buffer.m_utilCompare3_) {
3055                 doTertiaryBytes(t, notIsContinuation, buffer);
3056             }
3057
3058             if (buffer.m_utilCompare4_ && notIsContinuation) { // compare quad
3059                 doQuaternaryBytes(buffer.m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4, buffer);
3060             }
3061         }
3062         // TODO - hack fix around frozen state - stop self-modification
3063         internalSetDecomposition(backupDecomposition); // reverts to original
3064         if (buffer.m_utilFrenchStart_ != -1) {
3065             // one last round of checks
3066             reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
3067         }
3068     }
3069
3070     /**
3071      * From the individual strength byte results the final compact sortkey will be calculated.
3072      *
3073      * @param source text string
3074      * @param doFrench flag indicating that special handling of French has to be done
3075      * @param commonBottom4 smallest common quaternary byte
3076      * @param bottomCount4 smallest quaternary byte
3077      * @param key output RawCollationKey to store results, key cannot be null
3078      * @param buffer collation buffer temporary state
3079      */
3080     private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4,
3081             RawCollationKey key, CollationBuffer buffer) {
3082         // we have done all the CE's, now let's put them together to form
3083         // a key
3084         if (buffer.m_utilCompare2_) {
3085             doSecondary(doFrench, buffer);
3086         }
3087         // adding case level should be independent of secondary level
3088         if (buffer.m_utilCompare0_) {
3089             doCase(buffer);
3090         }
3091         if (buffer.m_utilCompare3_) {
3092             doTertiary(buffer);
3093             if (buffer.m_utilCompare4_) {
3094                 doQuaternary(commonBottom4, bottomCount4, buffer);
3095                 if (buffer.m_utilCompare5_) {
3096                     doIdentical(source, buffer);
3097                 }
3098
3099             }
3100         }
3101         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) 0);
3102         buffer.m_utilBytesCount1_++;
3103
3104         key.set(buffer.m_utilBytes1_, 0, buffer.m_utilBytesCount1_);
3105     }
3106
3107     /**
3108      * Packs the French bytes
3109      * @param buffer collation buffer temporary state
3110      */
3111     private static final void doFrench(CollationBuffer buffer) {
3112         for (int i = 0; i < buffer.m_utilBytesCount2_; i++) {
3113             byte s = buffer.m_utilBytes2_[buffer.m_utilBytesCount2_ - i - 1];
3114             // This is compression code.
3115             if (s == COMMON_2_) {
3116                 ++buffer.m_utilCount2_;
3117             } else {
3118                 if (buffer.m_utilCount2_ > 0) {
3119                     // getting the unsigned value
3120                     if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
3121                         // not necessary for 4th level.
3122                         while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
3123                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3124                                     (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
3125                             buffer.m_utilBytesCount1_++;
3126                             buffer.m_utilCount2_ -= TOP_COUNT_2_;
3127                         }
3128                         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3129                                 (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
3130                         buffer.m_utilBytesCount1_++;
3131                     } else {
3132                         while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
3133                             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3134                                     (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
3135                             buffer.m_utilBytesCount1_++;
3136                             buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
3137                         }
3138                         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3139                                 (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
3140                         buffer.m_utilBytesCount1_++;
3141                     }
3142                     buffer.m_utilCount2_ = 0;
3143                 }
3144                 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, s);
3145                 buffer.m_utilBytesCount1_++;
3146             }
3147         }
3148         if (buffer.m_utilCount2_ > 0) {
3149             while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
3150                 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
3151                 buffer.m_utilBytesCount1_++;
3152                 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
3153             }
3154             buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
3155             buffer.m_utilBytesCount1_++;
3156         }
3157     }
3158
3159     /**
3160      * Compacts the secondary bytes and stores them into the primary array
3161      *
3162      * @param doFrench flag indicator that French has to be handled specially
3163      * @param buffer collation buffer temporary state
3164      */
3165     private static final void doSecondary(boolean doFrench, CollationBuffer buffer) {
3166         if (buffer.m_utilCount2_ > 0) {
3167             while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
3168                 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
3169                 buffer.m_utilBytesCount2_++;
3170                 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
3171             }
3172             buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
3173             buffer.m_utilBytesCount2_++;
3174         }
3175
3176         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3177         buffer.m_utilBytesCount1_++;
3178
3179         if (doFrench) { // do the reverse copy
3180             doFrench(buffer);
3181         } else {
3182             if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount2_) {
3183                 buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_);
3184             }
3185             System.arraycopy(buffer.m_utilBytes2_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_);
3186             buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount2_;
3187         }
3188     }
3189
3190     /**
3191      * Increase buffer size
3192      *
3193      * @param buffer array of bytes
3194      * @param size of the byte array
3195      * @param incrementsize size to increase
3196      * @return the new buffer
3197      */
3198     private static final byte[] increase(byte buffer[], int size, int incrementsize) {
3199         byte result[] = new byte[buffer.length + incrementsize];
3200         System.arraycopy(buffer, 0, result, 0, size);
3201         return result;
3202     }
3203
3204     /**
3205      * Increase buffer size
3206      *
3207      * @param buffer array of ints
3208      * @param size of the byte array
3209      * @param incrementsize size to increase
3210      * @return the new buffer
3211      */
3212     private static final int[] increase(int buffer[], int size, int incrementsize) {
3213         int result[] = new int[buffer.length + incrementsize];
3214         System.arraycopy(buffer, 0, result, 0, size);
3215         return result;
3216     }
3217
3218     /**
3219      * Compacts the case bytes and stores them into the primary array
3220      *
3221      * @param buffer collation buffer temporary state
3222      */
3223     private static final void doCase(CollationBuffer buffer) {
3224         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3225         buffer.m_utilBytesCount1_++;
3226         if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount0_) {
3227             buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_);
3228         }
3229         System.arraycopy(buffer.m_utilBytes0_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_);
3230         buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount0_;
3231     }
3232
3233     /**
3234      * Compacts the tertiary bytes and stores them into the primary array
3235      *
3236      * @param buffer collation buffer temporary state
3237      */
3238     private final void doTertiary(CollationBuffer buffer) {
3239         if (buffer.m_utilCount3_ > 0) {
3240             if (m_common3_ != COMMON_BOTTOM_3_) {
3241                 while (buffer.m_utilCount3_ >= m_topCount3_) {
3242                     buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
3243                     buffer.m_utilBytesCount3_++;
3244                     buffer.m_utilCount3_ -= m_topCount3_;
3245                 }
3246                 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - buffer.m_utilCount3_));
3247                 buffer.m_utilBytesCount3_++;
3248             } else {
3249                 while (buffer.m_utilCount3_ > m_bottomCount3_) {
3250                     buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + m_bottomCount3_));
3251                     buffer.m_utilBytesCount3_++;
3252                     buffer.m_utilCount3_ -= m_bottomCount3_;
3253                 }
3254                 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
3255                 buffer.m_utilBytesCount3_++;
3256             }
3257         }
3258         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3259         buffer.m_utilBytesCount1_++;
3260         if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount3_) {
3261             buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_);
3262         }
3263         System.arraycopy(buffer.m_utilBytes3_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_);
3264         buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount3_;
3265     }
3266
3267     /**
3268      * Compacts the quaternary bytes and stores them into the primary array
3269      *
3270      * @param buffer collation buffer temporary state
3271      */
3272     private final void doQuaternary(int commonbottom4, int bottomcount4, CollationBuffer buffer) {
3273         if (buffer.m_utilCount4_ > 0) {
3274             while (buffer.m_utilCount4_ > bottomcount4) {
3275                 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + bottomcount4));
3276                 buffer.m_utilBytesCount4_++;
3277                 buffer.m_utilCount4_ -= bottomcount4;
3278             }
3279             buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + (buffer.m_utilCount4_ - 1)));
3280             buffer.m_utilBytesCount4_++;
3281         }
3282         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3283         buffer.m_utilBytesCount1_++;
3284         if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount4_) {
3285             buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_);
3286         }
3287         System.arraycopy(buffer.m_utilBytes4_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_);
3288         buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount4_;
3289     }
3290
3291     /**
3292      * Deals with the identical sort. Appends the BOCSU version of the source string to the ends of the byte buffer.
3293      *
3294      * @param source text string
3295      * @param buffer collation buffer temporary state
3296      */
3297     private static final void doIdentical(String source, CollationBuffer buffer) {
3298         int isize = BOCU.getCompressionLength(source);
3299         buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3300         buffer.m_utilBytesCount1_++;
3301         if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + isize) {
3302             buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, 1 + isize);
3303         }
3304         buffer.m_utilBytesCount1_ = BOCU.compress(source, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_);
3305     }
3306
3307     /**
3308      * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the
3309      * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence.
3310      *
3311      * @param source
3312      *            string
3313      * @param target
3314      *            string
3315      * @return offset of the first unmatched characters in source and target.
3316      */
3317     private final int getFirstUnmatchedOffset(String source, String target) {
3318         int result = 0;
3319         int slength = source.length();
3320         int tlength = target.length();
3321         int minlength = slength;
3322         if (minlength > tlength) {
3323             minlength = tlength;
3324         }
3325         while (result < minlength && source.charAt(result) == target.charAt(result)) {
3326             result++;
3327         }
3328         if (result > 0) {
3329             // There is an identical portion at the beginning of the two
3330             // strings. If the identical portion ends within a contraction or a
3331             // combining character sequence, back up to the start of that
3332             // sequence.
3333             char schar = 0;
3334             char tchar = 0;
3335             if (result < minlength) {
3336                 schar = source.charAt(result); // first differing chars
3337                 tchar = target.charAt(result);
3338             } else {
3339                 schar = source.charAt(minlength - 1);
3340                 if (isUnsafe(schar)) {
3341                     tchar = schar;
3342                 } else if (slength == tlength) {
3343                     return result;
3344                 } else if (slength < tlength) {
3345                     tchar = target.charAt(result);
3346                 } else {
3347                     schar = source.charAt(result);
3348                 }
3349             }
3350             if (isUnsafe(schar) || isUnsafe(tchar)) {
3351                 // We are stopped in the middle of a contraction or combining
3352                 // sequence.
3353                 // Look backwards for the part of the string for the start of
3354                 // the sequence
3355                 // It doesn't matter which string we scan, since they are the
3356                 // same in this region.
3357                 do {
3358                     result--;
3359                 } while (result > 0 && isUnsafe(source.charAt(result)));
3360             }
3361         }
3362         return result;
3363     }
3364
3365     /**
3366      * Appending an byte to an array of bytes and increases it if we run out of space
3367      *
3368      * @param array
3369      *            of byte arrays
3370      * @param appendindex
3371      *            index in the byte array to append
3372      * @param value
3373      *            to append
3374      * @return array if array size can accomodate the new value, otherwise a bigger array will be created and returned
3375      */
3376     private static final byte[] append(byte array[], int appendindex, byte value) {
3377         try {
3378             array[appendindex] = value;
3379         } catch (ArrayIndexOutOfBoundsException e) {
3380             array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
3381             array[appendindex] = value;
3382         }
3383         return array;
3384     }
3385
3386     /**
3387      * This is a trick string compare function that goes in and uses sortkeys to compare. It is used when compare gets
3388      * in trouble and needs to bail out.
3389      *
3390      * @param source text string
3391      * @param target text string
3392      * @param buffer collation buffer temporary state
3393      */
3394     private final int compareBySortKeys(String source, String target, CollationBuffer buffer)
3395     {
3396         buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_);
3397         // this method is very seldom called
3398         RawCollationKey targetkey = getRawCollationKey(target, null);
3399         return buffer.m_utilRawCollationKey_.compareTo(targetkey);
3400     }
3401
3402     /**
3403      * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between
3404      * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the
3405      * return result is the comparison result, ie 1 or -1, furthermore the cebuffers will be cleared when that happens.
3406      * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer
3407      * will not be cleared and the return result will be the hiragana result.
3408      *
3409      * @param doHiragana4 flag indicator that Hiragana Quaternary has to be observed
3410      * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
3411      * @param source text string
3412      * @param target text string
3413      * @param textoffset offset in text to start the comparison
3414      * @param buffer collation buffer temporary state
3415      * @return comparion result if a primary difference is found, otherwise hiragana result
3416      */
3417     private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target,
3418             int textoffset, CollationBuffer buffer)
3419
3420     {
3421         // Preparing the context objects for iterating over strings
3422         buffer.m_srcUtilIter_.setText(source);
3423         buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_, textoffset);
3424         buffer.m_tgtUtilIter_.setText(target);
3425         buffer.m_tgtUtilColEIter_.setText(buffer.m_tgtUtilIter_, textoffset);
3426
3427         // Non shifted primary processing is quite simple
3428         if (!m_isAlternateHandlingShifted_) {
3429             int hiraganaresult = 0;
3430             while (true) {
3431                 int sorder = 0;
3432                 int sPrimary;
3433                 // We fetch CEs until we hit a non ignorable primary or end.
3434                 do {
3435                     sorder = buffer.m_srcUtilColEIter_.next();
3436                     buffer.m_srcUtilCEBuffer_ = append(buffer.m_srcUtilCEBuffer_, buffer.m_srcUtilCEBufferSize_, sorder);
3437                     buffer.m_srcUtilCEBufferSize_++;
3438                     sPrimary = sorder & CE_PRIMARY_MASK_;
3439                 } while (sPrimary == CollationElementIterator.IGNORABLE);
3440
3441                 int torder = 0;
3442                 int tPrimary;
3443                 do {
3444                     torder = buffer.m_tgtUtilColEIter_.next();
3445                     buffer.m_tgtUtilCEBuffer_ = append(buffer.m_tgtUtilCEBuffer_, buffer.m_tgtUtilCEBufferSize_, torder);
3446                     buffer.m_tgtUtilCEBufferSize_++;
3447                     tPrimary = torder & CE_PRIMARY_MASK_;
3448                 } while (tPrimary == CollationElementIterator.IGNORABLE);
3449
3450                 // if both primaries are the same
3451                 if (sPrimary == tPrimary) {
3452                     // and there are no more CEs, we advance to the next level
3453                     // see if we are at the end of either string
3454                     if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3455                         if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) {
3456                             return -1;
3457                         }
3458                         break;
3459                     } else if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3460                         return 1;
3461                     }
3462                     if (doHiragana4 && hiraganaresult == 0
3463                             && buffer.m_srcUtilColEIter_.m_isCodePointHiragana_ != buffer.m_tgtUtilColEIter_.m_isCodePointHiragana_) {
3464                         if (buffer.m_srcUtilColEIter_.m_isCodePointHiragana_) {
3465                             hiraganaresult = -1;
3466                         } else {
3467                             hiraganaresult = 1;
3468                         }
3469                     }
3470                 } else {
3471                     if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) {
3472                         sPrimary = (m_leadBytePermutationTable_[sPrimary >>> 24] << 24) | (sPrimary & 0x00FFFFFF);
3473                         tPrimary = (m_leadBytePermutationTable_[tPrimary >>> 24] << 24) | (tPrimary & 0x00FFFFFF);
3474                     }
3475                     // if two primaries are different, we are done
3476                     return endPrimaryCompare(sPrimary, tPrimary, buffer);
3477                 }
3478             }
3479             // no primary difference... do the rest from the buffers
3480             return hiraganaresult;
3481         } else { // shifted - do a slightly more complicated processing :)
3482             while (true) {
3483                 int sorder = getPrimaryShiftedCompareCE(buffer.m_srcUtilColEIter_, lowestpvalue, true, buffer);
3484                 int torder = getPrimaryShiftedCompareCE(buffer.m_tgtUtilColEIter_, lowestpvalue, false, buffer);
3485                 if (sorder == torder) {
3486                     if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3487                         break;
3488                     } else {
3489                         continue;
3490                     }
3491                 } else {
3492                     return endPrimaryCompare(sorder, torder, buffer);
3493                 }
3494             } // no primary difference... do the rest from the buffers
3495         }
3496         return 0;
3497     }
3498
3499     /**
3500      * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder
3501      * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time.
3502      *
3503      * @param sorder source strength order
3504      * @param torder target strength order
3505      * @param buffer collation buffer temporary state
3506      * @return the comparison result of sorder and torder
3507      */
3508     private static final int endPrimaryCompare(int sorder, int torder, CollationBuffer buffer) {
3509         // if we reach here, the ce offset accessed is the last ce
3510         // appended to the buffer
3511         boolean isSourceNullOrder = (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
3512         boolean isTargetNullOrder = (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
3513         buffer.m_srcUtilCEBufferSize_ = -1;
3514         buffer.m_tgtUtilCEBufferSize_ = -1;
3515         if (isSourceNullOrder) {
3516             return -1;
3517         }
3518         if (isTargetNullOrder) {
3519             return 1;
3520         }
3521         // getting rid of the sign
3522         sorder >>>= CE_PRIMARY_SHIFT_;
3523                     torder >>>= CE_PRIMARY_SHIFT_;
3524                     if (sorder < torder) {
3525                         return -1;
3526                     }
3527                     return 1;
3528     }
3529
3530     /**
3531      * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce.
3532      *
3533      * @param coleiter collation element iterator
3534      * @param doHiragana4 flag indicator if hiragana quaternary is to be handled
3535      * @param lowestpvalue lowest primary shifted value that will not be ignored
3536      * @param buffer collation buffer temporary state
3537      * @return result next modified ce
3538      */
3539     private static final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue, boolean isSrc, CollationBuffer buffer)
3540     {
3541         boolean shifted = false;
3542         int result = CollationElementIterator.IGNORABLE;
3543         int cebuffer[] = buffer.m_srcUtilCEBuffer_;
3544         int cebuffersize = buffer.m_srcUtilCEBufferSize_;
3545         if (!isSrc) {
3546             cebuffer = buffer.m_tgtUtilCEBuffer_;
3547             cebuffersize = buffer.m_tgtUtilCEBufferSize_;
3548         }
3549         while (true) {
3550             result = coleiter.next();
3551             if (result == CollationElementIterator.NULLORDER) {
3552                 cebuffer = append(cebuffer, cebuffersize, result);
3553                 cebuffersize++;
3554                 break;
3555             } else if (result == CollationElementIterator.IGNORABLE
3556                     || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) {
3557                 // UCA amendment - ignore ignorables that follow shifted code
3558                 // points
3559                 continue;
3560             } else if (isContinuation(result)) {
3561                 if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) {
3562                     // There is primary value
3563                     if (shifted) {
3564                         result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_;
3565                         // preserve interesting continuation
3566                         cebuffer = append(cebuffer, cebuffersize, result);
3567                         cebuffersize++;
3568                         continue;
3569                     } else {
3570                         cebuffer = append(cebuffer, cebuffersize, result);
3571                         cebuffersize++;
3572                         break;
3573                     }
3574                 } else { // Just lower level values
3575                     if (!shifted) {
3576                         cebuffer = append(cebuffer, cebuffersize, result);
3577                         cebuffersize++;
3578                     }
3579                 }
3580             } else { // regular
3581                 if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) {
3582                     cebuffer = append(cebuffer, cebuffersize, result);
3583                     cebuffersize++;
3584                     break;
3585                 } else {
3586                     if ((result & CE_PRIMARY_MASK_) != 0) {
3587                         shifted = true;
3588                         result &= CE_PRIMARY_MASK_;
3589                         cebuffer = append(cebuffer, cebuffersize, result);
3590                         cebuffersize++;
3591                         continue;
3592                     } else {
3593                         cebuffer = append(cebuffer, cebuffersize, result);
3594                         cebuffersize++;
3595                         shifted = false;
3596                         continue;
3597                     }
3598                 }
3599             }
3600         }
3601         if (isSrc) {
3602             buffer.m_srcUtilCEBuffer_ = cebuffer;
3603             buffer.m_srcUtilCEBufferSize_ = cebuffersize;
3604         } else {
3605             buffer.m_tgtUtilCEBuffer_ = cebuffer;
3606             buffer.m_tgtUtilCEBufferSize_ = cebuffersize;
3607         }
3608         result &= CE_PRIMARY_MASK_;
3609         return result;
3610     }
3611
3612     /**
3613      * Appending an int to an array of ints and increases it if we run out of space
3614      *
3615      * @param array
3616      *            of int arrays
3617      * @param appendindex
3618      *            index at which value will be appended
3619      * @param value
3620      *            to append
3621      * @return array if size is not increased, otherwise a new array will be returned
3622      */
3623     private static final int[] append(int array[], int appendindex, int value) {
3624         if (appendindex + 1 >= array.length) {
3625             array = increase(array, appendindex, CE_BUFFER_SIZE_);
3626         }
3627         array[appendindex] = value;
3628         return array;
3629     }
3630
3631     /**
3632      * Does secondary strength comparison based on the collected ces.
3633      *
3634      * @param doFrench flag indicates if French ordering is to be done
3635      * @param buffer collation buffer temporary state
3636      * @return the secondary strength comparison result
3637      */
3638     private static final int doSecondaryCompare(boolean doFrench, CollationBuffer buffer) {
3639         // now, we're gonna reexamine collected CEs
3640         if (!doFrench) { // normal
3641             int soffset = 0;
3642             int toffset = 0;
3643             while (true) {
3644                 int sorder = CollationElementIterator.IGNORABLE;
3645                 while (sorder == CollationElementIterator.IGNORABLE) {
3646                     sorder = buffer.m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_;
3647                 }
3648                 int torder = CollationElementIterator.IGNORABLE;
3649                 while (torder == CollationElementIterator.IGNORABLE) {
3650                     torder = buffer.m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_;
3651                 }
3652
3653                 if (sorder == torder) {
3654                     if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3655                         if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3656                             return -1;
3657                         }
3658                         break;
3659                     } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3660                         return 1;
3661                     }
3662                 } else {
3663                     if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3664                         return -1;
3665                     }
3666                     if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3667                         return 1;
3668                     }
3669                     return (sorder < torder) ? -1 : 1;
3670                 }
3671             }
3672         } else { // do the French
3673             buffer.m_srcUtilContOffset_ = 0;
3674             buffer.m_tgtUtilContOffset_ = 0;
3675             buffer.m_srcUtilOffset_ = buffer.m_srcUtilCEBufferSize_ - 2;
3676             buffer.m_tgtUtilOffset_ = buffer.m_tgtUtilCEBufferSize_ - 2;
3677             while (true) {
3678                 int sorder = getSecondaryFrenchCE(true, buffer);
3679                 int torder = getSecondaryFrenchCE(false, buffer);
3680                 if (sorder == torder) {
3681                     if ((buffer.m_srcUtilOffset_ < 0 && buffer.m_tgtUtilOffset_ < 0)
3682                             || (buffer.m_srcUtilOffset_ >= 0 && buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) {
3683                         break;
3684                     }
3685                 } else {
3686                     return (sorder < torder) ? -1 : 1;
3687                 }
3688             }
3689         }
3690         return 0;
3691     }
3692
3693     /**
3694      * Calculates the next secondary french CE.
3695      *
3696      * @param isSrc flag indicator if we are calculating the src ces
3697      * @param buffer collation buffer temporary state
3698      * @return result next modified ce
3699      */
3700     private static final int getSecondaryFrenchCE(boolean isSrc, CollationBuffer buffer) {
3701         int result = CollationElementIterator.IGNORABLE;
3702         int offset = buffer.m_srcUtilOffset_;
3703         int continuationoffset = buffer.m_srcUtilContOffset_;
3704         int cebuffer[] = buffer.m_srcUtilCEBuffer_;
3705         if (!isSrc) {
3706             offset = buffer.m_tgtUtilOffset_;
3707             continuationoffset = buffer.m_tgtUtilContOffset_;
3708             cebuffer = buffer.m_tgtUtilCEBuffer_;
3709         }
3710
3711         while (result == CollationElementIterator.IGNORABLE && offset >= 0) {
3712             if (continuationoffset == 0) {
3713                 result = cebuffer[offset];
3714                 while (isContinuation(cebuffer[offset--])) {
3715                 }
3716                 // after this, sorder is at the start of continuation,
3717                 // and offset points before that
3718                 if (isContinuation(cebuffer[offset + 1])) {
3719                     // save offset for later
3720                     continuationoffset = offset;
3721                     offset += 2;
3722                 }
3723             } else {
3724                 result = cebuffer[offset++];
3725                 if (!isContinuation(result)) {
3726                     // we have finished with this continuation
3727                     offset = continuationoffset;
3728                     // reset the pointer to before continuation
3729                     continuationoffset = 0;
3730                     continue;
3731                 }
3732             }
3733             result &= CE_SECONDARY_MASK_; // remove continuation bit
3734         }
3735         if (isSrc) {
3736             buffer.m_srcUtilOffset_ = offset;
3737             buffer.m_srcUtilContOffset_ = continuationoffset;
3738         } else {
3739             buffer.m_tgtUtilOffset_ = offset;
3740             buffer.m_tgtUtilContOffset_ = continuationoffset;
3741         }
3742         return result;
3743     }
3744
3745     /**
3746      * Does case strength comparison based on the collected ces.
3747      *
3748      * @param buffer collation buffer temporary state
3749      * @return the case strength comparison result
3750      */
3751     private final int doCaseCompare(CollationBuffer buffer) {
3752         int soffset = 0;
3753         int toffset = 0;
3754         while (true) {
3755             int sorder = CollationElementIterator.IGNORABLE;
3756             int torder = CollationElementIterator.IGNORABLE;
3757             while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3758                 sorder = buffer.m_srcUtilCEBuffer_[soffset++];
3759                 if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) {
3760                     // primary ignorables should not be considered on the case level when the strength is primary
3761                     // otherwise, the CEs stop being well-formed
3762                     sorder &= CE_CASE_MASK_3_;
3763                     sorder ^= m_caseSwitch_;
3764                 } else {
3765                     sorder = CollationElementIterator.IGNORABLE;
3766                 }
3767             }
3768
3769             while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3770                 torder = buffer.m_tgtUtilCEBuffer_[toffset++];
3771                 if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) {
3772                     // primary ignorables should not be considered on the case level when the strength is primary
3773                     // otherwise, the CEs stop being well-formed
3774                     torder &= CE_CASE_MASK_3_;
3775                     torder ^= m_caseSwitch_;
3776                 } else {
3777                     torder = CollationElementIterator.IGNORABLE;
3778                 }
3779             }
3780
3781             sorder &= CE_CASE_BIT_MASK_;
3782             torder &= CE_CASE_BIT_MASK_;
3783             if (sorder == torder) {
3784                 // checking end of strings
3785                 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3786                     if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3787                         return -1;
3788                     }
3789                     break;
3790                 } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3791                     return 1;
3792                 }
3793             } else {
3794                 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3795                     return -1;
3796                 }
3797                 if (buffer.m_tgtUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3798                     return 1;
3799                 }
3800                 return (sorder < torder) ? -1 : 1;
3801             }
3802         }
3803         return 0;
3804     }
3805
3806     /**
3807      * Does tertiary strength comparison based on the collected ces.
3808      *
3809      * @param buffer collation buffer temporary state
3810      * @return the tertiary strength comparison result
3811      */
3812     private final int doTertiaryCompare(CollationBuffer buffer) {
3813         int soffset = 0;
3814         int toffset = 0;
3815         while (true) {
3816             int sorder = CollationElementIterator.IGNORABLE;
3817             int torder = CollationElementIterator.IGNORABLE;
3818             while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3819                 sorder = buffer.m_srcUtilCEBuffer_[soffset++];
3820                 if (!isContinuation(sorder)) {
3821                     sorder = (sorder & m_mask3_) ^ m_caseSwitch_;
3822                 } else {
3823                     sorder = (sorder & m_mask3_) & CE_REMOVE_CASE_;
3824                 }
3825             }
3826
3827             while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3828                 torder = buffer.m_tgtUtilCEBuffer_[toffset++];
3829                 if (!isContinuation(torder)) {
3830                     torder = (torder & m_mask3_) ^ m_caseSwitch_;
3831                 } else {
3832                     torder = (torder & m_mask3_) & CE_REMOVE_CASE_;
3833                 }
3834             }
3835
3836             if (sorder == torder) {
3837                 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3838                     if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3839                         return -1;
3840                     }
3841                     break;
3842                 } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3843                     return 1;
3844                 }
3845             } else {
3846                 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3847                     return -1;
3848                 }
3849                 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3850                     return 1;
3851                 }
3852                 return (sorder < torder) ? -1 : 1;
3853             }
3854         }
3855         return 0;
3856     }
3857
3858     /**
3859      * Does quaternary strength comparison based on the collected ces.
3860      *
3861      * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
3862      * @param buffer collation buffer temporary state
3863      * @return the quaternary strength comparison result
3864      */
3865     private final int doQuaternaryCompare(int lowestpvalue, CollationBuffer buffer) {
3866         boolean sShifted = true;
3867         boolean tShifted = true;
3868         int soffset = 0;
3869         int toffset = 0;
3870         while (true) {
3871             int sorder = CollationElementIterator.IGNORABLE;
3872             int torder = CollationElementIterator.IGNORABLE;
3873             while (sorder == CollationElementIterator.IGNORABLE || (isContinuation(sorder) && !sShifted)) {
3874                 sorder = buffer.m_srcUtilCEBuffer_[soffset++];
3875                 if (isContinuation(sorder)) {
3876                     if (!sShifted) {
3877                         continue;
3878                     }
3879                 } else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
3880                         || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
3881                     // non continuation
3882                     sorder = CE_PRIMARY_MASK_;
3883                     sShifted = false;
3884                 } else {
3885                     sShifted = true;
3886                 }
3887             }
3888             sorder >>>= CE_PRIMARY_SHIFT_;
3889                     while (torder == CollationElementIterator.IGNORABLE || (isContinuation(torder) && !tShifted)) {
3890                         torder = buffer.m_tgtUtilCEBuffer_[toffset++];
3891                         if (isContinuation(torder)) {
3892                             if (!tShifted) {
3893                                 continue;
3894                             }
3895                         } else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
3896                                 || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
3897                             // non continuation
3898                             torder = CE_PRIMARY_MASK_;
3899                             tShifted = false;
3900                         } else {
3901                             tShifted = true;
3902                         }
3903                     }
3904                     torder >>>= CE_PRIMARY_SHIFT_;
3905
3906                     if (sorder == torder) {
3907                         if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3908                             if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3909                                 return -1;
3910                             }
3911                             break;
3912                         } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3913                             return 1;
3914                         }
3915                     } else {
3916                         if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3917                             return -1;
3918                         }
3919                         if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3920                             return 1;
3921                         }
3922                         return (sorder < torder) ? -1 : 1;
3923                     }
3924         }
3925         return 0;
3926     }
3927
3928     /**
3929      * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are
3930      * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough.
3931      *
3932      * @param source
3933      *            text
3934      * @param target
3935      *            text
3936      * @param offset
3937      *            of the first difference in the text strings
3938      * @param normalize
3939      *            flag indicating if we are to normalize the text before comparison
3940      * @return 1 if source is greater than target, -1 less than and 0 if equals
3941      */
3942     private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize)
3943
3944     {
3945         if (normalize) {
3946             if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
3947                 source = Normalizer.decompose(source, false);
3948             }
3949
3950             if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) {
3951                 target = Normalizer.decompose(target, false);
3952             }
3953             offset = 0;
3954         }
3955
3956         return doStringCompare(source, target, offset);
3957     }
3958
3959     /**
3960      * Compares string for their codepoint order. This comparison handles surrogate characters and place them after the
3961      * all non surrogate characters.
3962      *
3963      * @param source
3964      *            text
3965      * @param target
3966      *            text
3967      * @param offset
3968      *            start offset for comparison
3969      * @return 1 if source is greater than target, -1 less than and 0 if equals
3970      */
3971     private static final int doStringCompare(String source, String target, int offset) {
3972         // compare identical prefixes - they do not need to be fixed up
3973         char schar = 0;
3974         char tchar = 0;
3975         int slength = source.length();
3976         int tlength = target.length();
3977         int minlength = Math.min(slength, tlength);
3978         while (offset < minlength) {
3979             schar = source.charAt(offset);
3980             tchar = target.charAt(offset++);
3981             if (schar != tchar) {
3982                 break;
3983             }
3984         }
3985
3986         if (schar == tchar && offset == minlength) {
3987             if (slength > minlength) {
3988                 return 1;
3989             }
3990             if (tlength > minlength) {
3991                 return -1;
3992             }
3993             return 0;
3994         }
3995
3996         // if both values are in or above the surrogate range, Fix them up.
3997         if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
3998             schar = fixupUTF16(schar);
3999             tchar = fixupUTF16(tchar);
4000         }
4001
4002         // now c1 and c2 are in UTF-32-compatible order
4003         return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
4004     }
4005
4006     /**
4007      * Rotate surrogates to the top to get code point order
4008      */
4009     private static final char fixupUTF16(char ch) {
4010         if (ch >= 0xe000) {
4011             ch -= 0x800;
4012         } else {
4013             ch += 0x2000;
4014         }
4015         return ch;
4016     }
4017
4018     private static final int UCOL_REORDER_CODE_IGNORE = ReorderCodes.LIMIT + 1;
4019     /**
4020      * Builds the lead byte permuatation table
4021      */
4022     private void buildPermutationTable() {
4023         if (m_reorderCodes_ == null || m_reorderCodes_.length == 0 || (m_reorderCodes_.length == 1 && m_reorderCodes_[0] == ReorderCodes.NONE)) {
4024             m_leadBytePermutationTable_ = null;
4025             return;
4026         }
4027
4028         if (m_reorderCodes_[0] == ReorderCodes.DEFAULT) {
4029             if (m_reorderCodes_.length != 1) {
4030                 throw new IllegalArgumentException("Illegal collation reorder codes - default reorder code must be the only code in the list.");
4031             }
4032             // swap the reorder codes for those at build of the rules
4033             if (m_defaultReorderCodes_ == null || m_defaultReorderCodes_.length == 0) {
4034                 m_leadBytePermutationTable_ = null;
4035                 return;
4036             }
4037             m_reorderCodes_ = m_defaultReorderCodes_.clone();
4038         }
4039
4040         // TODO - these need to be read in from the UCA data file
4041         // The lowest byte that hasn't been assigned a mapping
4042         int toBottom = 0x03;
4043         // The highest byte that hasn't been assigned a mapping
4044         int toTop = 0xe4;
4045
4046         // filled slots in the output m_scriptOrder_
4047         boolean[] permutationSlotFilled = new boolean[256];
4048
4049         // used lead bytes
4050         boolean[] newLeadByteUsed = new boolean[256];
4051
4052         if (m_leadBytePermutationTable_ == null) {
4053             m_leadBytePermutationTable_ = new byte[256];
4054         }
4055
4056         // prefill the reordering codes with the leading entries
4057         int[] internalReorderCodes = new int[m_reorderCodes_.length + (ReorderCodes.LIMIT - ReorderCodes.FIRST)];
4058         for (int codeIndex = 0; codeIndex < ReorderCodes.LIMIT - ReorderCodes.FIRST; codeIndex++) {
4059             internalReorderCodes[codeIndex] = ReorderCodes.FIRST + codeIndex;
4060         }
4061         for (int codeIndex = 0; codeIndex < m_reorderCodes_.length; codeIndex++) {
4062             internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_reorderCodes_[codeIndex];
4063             if (m_reorderCodes_[codeIndex] >= ReorderCodes.FIRST && m_reorderCodes_[codeIndex] < ReorderCodes.LIMIT) {
4064                 internalReorderCodes[m_reorderCodes_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE;
4065             }
4066         }
4067
4068         /*
4069          * Start from the front of the list and place each script we encounter at the earliest possible locatation
4070          * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script
4071          * in the last possible location. At each step, we also need to make sure that any scripts that need to not
4072          * be moved are copied to their same location in the final table.
4073          */
4074         boolean fromTheBottom = true;
4075         int reorderCodesIndex = -1;
4076         for (int reorderCodesCount = 0; reorderCodesCount < internalReorderCodes.length; reorderCodesCount++) {
4077             reorderCodesIndex += fromTheBottom ? 1 : -1;
4078             int next = internalReorderCodes[reorderCodesIndex];
4079             if (next == UCOL_REORDER_CODE_IGNORE) {
4080                 continue;
4081             }
4082             if (next == UScript.UNKNOWN) {
4083                 if (fromTheBottom == false) {
4084                     // double turnaround
4085                     m_leadBytePermutationTable_ = null;
4086                     throw new IllegalArgumentException("Illegal collation reorder codes - two \"from the end\" markers.");
4087                 }
4088                 fromTheBottom = false;
4089                 reorderCodesIndex = internalReorderCodes.length;
4090                 continue;
4091             }
4092
4093             int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next);
4094             if (fromTheBottom) {
4095                 for (int leadByte : leadBytes) {
4096                     // don't place a lead byte twice in the permutation table
4097                     if (permutationSlotFilled[leadByte]) {
4098                         // lead byte already used
4099                         m_leadBytePermutationTable_ = null;
4100                         throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte.");
4101                     }
4102                     m_leadBytePermutationTable_[leadByte] = (byte) toBottom;
4103                     newLeadByteUsed[toBottom] = true;
4104                     permutationSlotFilled[leadByte] = true;
4105                     toBottom++;
4106                 }
4107             } else {
4108                 for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) {
4109                     int leadByte = leadBytes[leadByteIndex];
4110                     // don't place a lead byte twice in the permutation table
4111                     if (permutationSlotFilled[leadByte]) {
4112                         // lead byte already used
4113                         m_leadBytePermutationTable_ = null;
4114                         throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte.");
4115                     }
4116
4117                     m_leadBytePermutationTable_[leadByte] = (byte) toTop;
4118                     newLeadByteUsed[toTop] = true;
4119                     permutationSlotFilled[leadByte] = true;
4120                     toTop--;
4121                 }
4122             }
4123         }
4124
4125         /* Copy everything that's left over */
4126         int reorderCode = 0;
4127         for (int i = 0; i < 256; i++) {
4128             if (!permutationSlotFilled[i]) {
4129                 while (newLeadByteUsed[reorderCode]) {
4130                     if (reorderCode > 255) {
4131                         throw new IllegalArgumentException("Unable to fill collation reordering table slots - no available reordering code.");
4132                     }
4133                     reorderCode++;
4134                 }
4135                 m_leadBytePermutationTable_[i] = (byte) reorderCode;
4136                 permutationSlotFilled[i] = true;
4137                 newLeadByteUsed[reorderCode] = true;
4138             }
4139         }
4140
4141         // for (int i = 0; i < 256; i++){
4142         // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16));
4143         // }
4144         latinOneRegenTable_ = true;
4145         updateInternalState();
4146     }
4147
4148     /**
4149      * Resets the internal case data members and compression values.
4150      */
4151     private void updateInternalState() {
4152         if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
4153             m_caseSwitch_ = CASE_SWITCH_;
4154         } else {
4155             m_caseSwitch_ = NO_CASE_SWITCH_;
4156         }
4157
4158         if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
4159             m_mask3_ = CE_REMOVE_CASE_;
4160             m_common3_ = COMMON_NORMAL_3_;
4161             m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
4162             m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
4163             m_bottom3_ = COMMON_BOTTOM_3_;
4164         } else {
4165             m_mask3_ = CE_KEEP_CASE_;
4166             m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
4167             if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
4168                 m_common3_ = COMMON_UPPER_FIRST_3_;
4169                 m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
4170                 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
4171             } else {
4172                 m_common3_ = COMMON_NORMAL_3_;
4173                 m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
4174                 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
4175             }
4176         }
4177
4178         // Set the compression values
4179         int total3 = m_top3_ - m_bottom3_ - 1;
4180         // we multilply double with int, but need only int
4181         m_topCount3_ = (int) (PROPORTION_3_ * total3);
4182         m_bottomCount3_ = total3 - m_topCount3_;
4183
4184         if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_
4185                 && !m_isAlternateHandlingShifted_) {
4186             m_isSimple3_ = true;
4187         } else {
4188             m_isSimple3_ = false;
4189         }
4190         if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
4191                 && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
4192             if (latinOneCEs_ == null || latinOneRegenTable_) {
4193                 if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
4194                     latinOneUse_ = true;
4195                 } else {
4196                     latinOneUse_ = false;
4197                     latinOneFailed_ = true;
4198                 }
4199                 latinOneRegenTable_ = false;
4200             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
4201                 latinOneUse_ = true;
4202             }
4203         } else {
4204             latinOneUse_ = false;
4205         }
4206
4207     }
4208
4209     /**
4210      * Initializes the RuleBasedCollator
4211      */
4212     private final void init() {
4213         for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) {
4214             // Find the smallest unsafe char.
4215             if (isUnsafe(m_minUnsafe_)) {
4216                 break;
4217             }
4218         }
4219
4220         for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) {
4221             // Find the smallest contraction-ending char.
4222             if (isContractionEnd(m_minContractionEnd_)) {
4223                 break;
4224             }
4225         }
4226         latinOneFailed_ = true;
4227         setStrength(m_defaultStrength_);
4228         setDecomposition(m_defaultDecomposition_);
4229         m_variableTopValue_ = m_defaultVariableTopValue_;
4230         m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
4231         m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
4232         m_isCaseLevel_ = m_defaultIsCaseLevel_;
4233         m_caseFirst_ = m_defaultCaseFirst_;
4234         m_isHiragana4_ = m_defaultIsHiragana4_;
4235         m_isNumericCollation_ = m_defaultIsNumericCollation_;
4236         latinOneFailed_ = false;
4237         if (m_defaultReorderCodes_ != null) {
4238             m_reorderCodes_ = m_defaultReorderCodes_.clone();
4239         } else {
4240             m_reorderCodes_ = null;
4241         }
4242         updateInternalState();
4243     }
4244
4245     // Consts for Latin-1 special processing
4246     private static final int ENDOFLATINONERANGE_ = 0xFF;
4247     private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50);
4248     private static final int BAIL_OUT_CE_ = 0xFF000000;
4249
4250     /**
4251      * Generate latin-1 tables
4252      */
4253
4254     private static class shiftValues {
4255         int primShift = 24;
4256         int secShift = 24;
4257         int terShift = 24;
4258     }
4259
4260     private final void addLatinOneEntry(char ch, int CE, shiftValues sh) {
4261         int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
4262         boolean continuation = isContinuation(CE);
4263         boolean reverseSecondary = false;
4264         if (!continuation) {
4265             tertiary = ((CE & m_mask3_));
4266             tertiary ^= m_caseSwitch_;
4267             reverseSecondary = true;
4268         } else {
4269             tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_));
4270             tertiary &= CE_REMOVE_CASE_;
4271             reverseSecondary = false;
4272         }
4273
4274         secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
4275         primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
4276         primary1 = (CE >>> 8);
4277
4278         if (primary1 != 0) {
4279             if (m_leadBytePermutationTable_ != null && !continuation) {
4280                 primary1 = m_leadBytePermutationTable_[primary1];
4281             }
4282             latinOneCEs_[ch] |= (primary1 << sh.primShift);
4283             sh.primShift -= 8;
4284         }
4285         if (primary2 != 0) {
4286             if (sh.primShift < 0) {
4287                 latinOneCEs_[ch] = BAIL_OUT_CE_;
4288                 latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4289                 latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4290                 return;
4291             }
4292             latinOneCEs_[ch] |= (primary2 << sh.primShift);
4293             sh.primShift -= 8;
4294         }
4295         if (secondary != 0) {
4296             if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary
4297                 latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary
4298             latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24);
4299             } else { // normal case
4300                 latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift);
4301             }
4302             sh.secShift -= 8;
4303         }
4304         if (tertiary != 0) {
4305             latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift);
4306             sh.terShift -= 8;
4307         }
4308     }
4309
4310     private final void resizeLatinOneTable(int newSize) {
4311         int newTable[] = new int[3 * newSize];
4312         int sizeToCopy = ((newSize < latinOneTableLen_) ? newSize : latinOneTableLen_);
4313         // uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
4314         System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
4315         System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
4316         System.arraycopy(latinOneCEs_, 2 * latinOneTableLen_, newTable, 2 * newSize, sizeToCopy);
4317         latinOneTableLen_ = newSize;
4318         latinOneCEs_ = newTable;
4319     }
4320
4321     private final boolean setUpLatinOne() {
4322         if (latinOneCEs_ == null || m_reallocLatinOneCEs_) {
4323             latinOneCEs_ = new int[3 * LATINONETABLELEN_];
4324             latinOneTableLen_ = LATINONETABLELEN_;
4325             m_reallocLatinOneCEs_ = false;
4326         } else {
4327             Arrays.fill(latinOneCEs_, 0);
4328         }
4329         if (m_ContInfo_ == null) {
4330             m_ContInfo_ = new ContractionInfo();
4331         }
4332         char ch = 0;
4333         // StringBuffer sCh = new StringBuffer();
4334         // CollationElementIterator it = getCollationElementIterator(sCh.toString());
4335         CollationElementIterator it = getCollationElementIterator("");
4336
4337         shiftValues s = new shiftValues();
4338         int CE = 0;
4339         char contractionOffset = ENDOFLATINONERANGE_ + 1;
4340
4341         for (ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
4342             s.primShift = 24;
4343             s.secShift = 24;
4344             s.terShift = 24;
4345             if (ch < 0x100) {
4346                 CE = m_trie_.getLatin1LinearValue(ch);
4347             } else {
4348                 CE = m_trie_.getLeadValue(ch);
4349                 if (CE == CollationElementIterator.CE_NOT_FOUND_) {
4350                     CE = UCA_.m_trie_.getLeadValue(ch);
4351                 }
4352             }
4353             if (!isSpecial(CE)) {
4354                 addLatinOneEntry(ch, CE, s);
4355             } else {
4356                 switch (RuleBasedCollator.getTag(CE)) {
4357                 case CollationElementIterator.CE_EXPANSION_TAG_:
4358                 case CollationElementIterator.CE_DIGIT_TAG_:
4359                     // sCh.delete(0, sCh.length());
4360                     // sCh.append(ch);
4361                     // it.setText(sCh.toString());
4362                     it.setText(UCharacter.toString(ch));
4363                     while ((CE = it.next()) != CollationElementIterator.NULLORDER) {
4364                         if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4365                             latinOneCEs_[ch] = BAIL_OUT_CE_;
4366                             latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4367                             latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4368                             break;
4369                         }
4370                         addLatinOneEntry(ch, CE, s);
4371                     }
4372                     break;
4373                 case CollationElementIterator.CE_CONTRACTION_TAG_:
4374                     // here is the trick
4375                     // F2 is contraction. We do something very similar to contractions
4376                     // but have two indices, one in the real contraction table and the
4377                     // other to where we stuffed things. This hopes that we don't have
4378                     // many contractions (this should work for latin-1 tables).
4379                 {
4380                     if ((CE & 0x00FFF000) != 0) {
4381                         latinOneFailed_ = true;
4382                         return false;
4383                     }
4384
4385                     int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; // getContractionOffset(CE)]
4386
4387                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
4388
4389                     latinOneCEs_[ch] = CE;
4390                     latinOneCEs_[latinOneTableLen_ + ch] = CE;
4391                     latinOneCEs_[2 * latinOneTableLen_ + ch] = CE;
4392
4393                     // We're going to jump into contraction table, pick the elements
4394                     // and use them
4395                     do {
4396                         // CE = *(contractionCEs + (UCharOffset - contractionIndex));
4397                         CE = m_contractionCE_[UCharOffset];
4398                         if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
4399                             int i; /* general counter */
4400                             // uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to
4401                             // expansion table */
4402                             int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; // it.getExpansionOffset(this,
4403                             // CE);
4404                             int size = CE & 0xF; // getExpansionCount(CE);
4405                             // CE = *CEOffset++;
4406                             if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
4407                                 for (i = 0; i < size; i++) {
4408                                     if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4409                                         latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4410                                         latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4411                                         latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4412                                         break;
4413                                     }
4414                                     addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s);
4415                                 }
4416                             } else { /* else, we do */
4417                                 while (m_expansion_[offset] != 0) {
4418                                     if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4419                                         latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4420                                         latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4421                                         latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4422                                         break;
4423                                     }
4424                                     addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
4425                                 }
4426                             }
4427                             contractionOffset++;
4428                         } else if (!isSpecial(CE)) {
4429                             addLatinOneEntry(contractionOffset++, CE, s);
4430                         } else {
4431                             latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4432                             latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4433                             latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4434                             contractionOffset++;
4435                         }
4436                         UCharOffset++;
4437                         s.primShift = 24;
4438                         s.secShift = 24;
4439                         s.terShift = 24;
4440                         if (contractionOffset == latinOneTableLen_) { // we need to reallocate
4441                             resizeLatinOneTable(2 * latinOneTableLen_);
4442                         }
4443                     } while (m_contractionIndex_[UCharOffset] != 0xFFFF);
4444                 }
4445                 break;
4446                 case CollationElementIterator.CE_SPEC_PROC_TAG_: {
4447                     // 0xB7 is a precontext character defined in UCA5.1, a special
4448                     // handle is implemeted in order to save LatinOne table for
4449                     // most locales.
4450                     if (ch == 0xb7) {
4451                         addLatinOneEntry(ch, CE, s);
4452                     } else {
4453                         latinOneFailed_ = true;
4454                         return false;
4455                     }
4456                 }
4457                 break;
4458                 default:
4459                     latinOneFailed_ = true;
4460                     return false;
4461                 }
4462             }
4463         }
4464         // compact table
4465         if (contractionOffset < latinOneTableLen_) {
4466             resizeLatinOneTable(contractionOffset);
4467         }
4468         return true;
4469     }
4470
4471     private static class ContractionInfo {
4472         int index;
4473     }
4474
4475     ContractionInfo m_ContInfo_;
4476
4477     private int getLatinOneContraction(int strength, int CE, String s) {
4478         // int strength, int CE, String s, Integer ind) {
4479         int len = s.length();
4480         // const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
4481         int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
4482         int offset = 1;
4483         int latinOneOffset = (CE & 0x00FFF000) >>> 12;
4484                                     char schar = 0, tchar = 0;
4485
4486                                     for (;;) {
4487                                         /*
4488                                          * if(len == -1) { if(s[*index] == 0) { // end of string
4489                                          * return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } else { schar = s[*index]; }
4490                                          * } else {
4491                                          */
4492                                         if (m_ContInfo_.index == len) {
4493                                             return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
4494                                         } else {
4495                                             schar = s.charAt(m_ContInfo_.index);
4496                                         }
4497                                         // }
4498
4499                                         while (schar > (tchar = m_contractionIndex_[UCharOffset + offset]/** (UCharOffset+offset) */
4500                                         )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
4501                                             offset++;
4502                                         }
4503
4504                                         if (schar == tchar) {
4505                                             m_ContInfo_.index++;
4506                                             return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset + offset]);
4507                                         } else {
4508                                             if (schar > ENDOFLATINONERANGE_ /* & 0xFF00 */) {
4509                                                 return BAIL_OUT_CE_;
4510                                             }
4511                                             // skip completely ignorables
4512                                             int isZeroCE = m_trie_.getLeadValue(schar); // UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
4513                                             if (isZeroCE == 0) { // we have to ignore completely ignorables
4514                                                 m_ContInfo_.index++;
4515                                                 continue;
4516                                             }
4517
4518                                             return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
4519                                         }
4520                                     }
4521     }
4522
4523     /**
4524      * This is a fast strcoll, geared towards text in Latin-1. It supports contractions of size two, French secondaries
4525      * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case
4526      * level. It relies on the table build by setupLatin1Table. If it doesn't understand something, it will go to the
4527      * regular strcoll.
4528      * @param buffer collation buffer temporary state
4529      */
4530     private final int compareUseLatin1(String source, String target, int startOffset, CollationBuffer buffer) {
4531         int sLen = source.length();
4532         int tLen = target.length();
4533
4534         int strength = getStrength();
4535
4536         int sIndex = startOffset, tIndex = startOffset;
4537         char sChar = 0, tChar = 0;
4538         int sOrder = 0, tOrder = 0;
4539
4540         boolean endOfSource = false;
4541
4542         // uint32_t *elements = coll->latinOneCEs;
4543
4544         boolean haveContractions = false; // if we have contractions in our string
4545         // we cannot do French secondary
4546
4547         int offset = latinOneTableLen_;
4548
4549         // Do the primary level
4550         primLoop:
4551             for (;;) {
4552                 while (sOrder == 0) { // this loop skips primary ignorables
4553                     // sOrder=getNextlatinOneCE(source);
4554                     if (sIndex == sLen) {
4555                         endOfSource = true;
4556                         break;
4557                     }
4558                     sChar = source.charAt(sIndex++); // [sIndex++];
4559                     // }
4560                     if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4561                         // fprintf(stderr, "R");
4562                         return compareRegular(source, target, startOffset, buffer);
4563                     }
4564                     sOrder = latinOneCEs_[sChar];
4565                     if (isSpecial(sOrder)) { // if we got a special
4566                         // specials can basically be either contractions or bail-out signs. If we get anything
4567                         // else, we'll bail out anywasy
4568                         if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4569                             m_ContInfo_.index = sIndex;
4570                             sOrder = getLatinOneContraction(0, sOrder, source);
4571                             sIndex = m_ContInfo_.index;
4572                             haveContractions = true; // if there are contractions, we cannot do French secondary
4573                             // However, if there are contractions in the table, but we always use just one char,
4574                             // we might be able to do French. This should be checked out.
4575                         }
4576                         if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) {
4577                             // fprintf(stderr, "S");
4578                             return compareRegular(source, target, startOffset, buffer);
4579                         }
4580                     }
4581                 }
4582
4583                 while (tOrder == 0) { // this loop skips primary ignorables
4584                     // tOrder=getNextlatinOneCE(target);
4585                     if (tIndex == tLen) {
4586                         if (endOfSource) {
4587                             break primLoop;
4588                         } else {
4589                             return 1;
4590                         }
4591                     }
4592                     tChar = target.charAt(tIndex++); // [tIndex++];
4593                     if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4594                         // fprintf(stderr, "R");
4595                         return compareRegular(source, target, startOffset, buffer);
4596                     }
4597                     tOrder = latinOneCEs_[tChar];
4598                     if (isSpecial(tOrder)) {
4599                         // Handling specials, see the comments for source
4600                         if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4601                             m_ContInfo_.index = tIndex;
4602                             tOrder = getLatinOneContraction(0, tOrder, target);
4603                             tIndex = m_ContInfo_.index;
4604                             haveContractions = true;
4605                         }
4606                         if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) {
4607                             // fprintf(stderr, "S");
4608                             return compareRegular(source, target, startOffset, buffer);
4609                         }
4610                     }
4611                 }
4612                 if (endOfSource) { // source is finished, but target is not, say the result.
4613                     return -1;
4614                 }
4615
4616                 if (sOrder == tOrder) { // if we have same CEs, we continue the loop
4617                     sOrder = 0;
4618                     tOrder = 0;
4619                     continue;
4620                 } else {
4621                     // compare current top bytes
4622                     if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4623                         // top bytes differ, return difference
4624                         if (sOrder >>> 8 < tOrder >>> 8) {
4625                             return -1;
4626                         } else {
4627                             return 1;
4628                         }
4629                         // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
4630                         // since we must return enum value
4631                     }
4632
4633                     // top bytes match, continue with following bytes
4634                     sOrder <<= 8;
4635                     tOrder <<= 8;
4636                 }
4637             }
4638
4639         // after primary loop, we definitely know the sizes of strings,
4640         // so we set it and use simpler loop for secondaries and tertiaries
4641         // sLen = sIndex; tLen = tIndex;
4642         if (strength >= SECONDARY) {
4643             // adjust the table beggining
4644             // latinOneCEs_ += coll->latinOneTableLen;
4645             endOfSource = false;
4646
4647             if (!m_isFrenchCollation_) { // non French
4648                 // This loop is a simplified copy of primary loop
4649                 // at this point we know that whole strings are latin-1, so we don't
4650                 // check for that. We also know that we only have contractions as
4651                 // specials.
4652                 // sIndex = 0; tIndex = 0;
4653                 sIndex = startOffset;
4654                 tIndex = startOffset;
4655                 secLoop: for (;;) {
4656                     while (sOrder == 0) {
4657                         if (sIndex == sLen) {
4658                             endOfSource = true;
4659                             break;
4660                         }
4661                         sChar = source.charAt(sIndex++); // [sIndex++];
4662                         sOrder = latinOneCEs_[offset + sChar];
4663                         if (isSpecial(sOrder)) {
4664                             m_ContInfo_.index = sIndex;
4665                             sOrder = getLatinOneContraction(1, sOrder, source);
4666                             sIndex = m_ContInfo_.index;
4667                         }
4668                     }
4669
4670                     while (tOrder == 0) {
4671                         if (tIndex == tLen) {
4672                             if (endOfSource) {
4673                                 break secLoop;
4674                             } else {
4675                                 return 1;
4676                             }
4677                         }
4678                         tChar = target.charAt(tIndex++); // [tIndex++];
4679                         tOrder = latinOneCEs_[offset + tChar];
4680                         if (isSpecial(tOrder)) {
4681                             m_ContInfo_.index = tIndex;
4682                             tOrder = getLatinOneContraction(1, tOrder, target);
4683                             tIndex = m_ContInfo_.index;
4684                         }
4685                     }
4686                     if (endOfSource) {
4687                         return -1;
4688                     }
4689
4690                     if (sOrder == tOrder) {
4691                         sOrder = 0;
4692                         tOrder = 0;
4693                         continue;
4694                     } else {
4695                         // see primary loop for comments on this
4696                         if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4697                             if (sOrder >>> 8 < tOrder >>> 8) {
4698                                 return -1;
4699                             } else {
4700                                 return 1;
4701                             }
4702                         }
4703                         sOrder <<= 8;
4704                         tOrder <<= 8;
4705                     }
4706                 }
4707             } else { // French
4708                 if (haveContractions) { // if we have contractions, we have to bail out
4709                     // since we don't really know how to handle them here
4710                     return compareRegular(source, target, startOffset, buffer);
4711                 }
4712                 // For French, we go backwards
4713                 sIndex = sLen;
4714                 tIndex = tLen;
4715                 secFLoop: for (;;) {
4716                     while (sOrder == 0) {
4717                         if (sIndex == startOffset) {
4718                             endOfSource = true;
4719                             break;
4720                         }
4721                         sChar = source.charAt(--sIndex); // [--sIndex];
4722                         sOrder = latinOneCEs_[offset + sChar];
4723                         // don't even look for contractions
4724                     }
4725
4726                     while (tOrder == 0) {
4727                         if (tIndex == startOffset) {
4728                             if (endOfSource) {
4729                                 break secFLoop;
4730                             } else {
4731                                 return 1;
4732                             }
4733                         }
4734                         tChar = target.charAt(--tIndex); // [--tIndex];
4735                         tOrder = latinOneCEs_[offset + tChar];
4736                         // don't even look for contractions
4737                     }
4738                     if (endOfSource) {
4739                         return -1;
4740                     }
4741
4742                     if (sOrder == tOrder) {
4743                         sOrder = 0;
4744                         tOrder = 0;
4745                         continue;
4746                     } else {
4747                         // see the primary loop for comments
4748                         if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4749                             if (sOrder >>> 8 < tOrder >>> 8) {
4750                                 return -1;
4751                             } else {
4752                                 return 1;
4753                             }
4754                         }
4755                         sOrder <<= 8;
4756                         tOrder <<= 8;
4757                     }
4758                 }
4759             }
4760         }
4761
4762         if (strength >= TERTIARY) {
4763             // tertiary loop is the same as secondary (except no French)
4764             offset += latinOneTableLen_;
4765             // sIndex = 0; tIndex = 0;
4766             sIndex = startOffset;
4767             tIndex = startOffset;
4768             endOfSource = false;
4769             for (;;) {
4770                 while (sOrder == 0) {
4771                     if (sIndex == sLen) {
4772                         endOfSource = true;
4773                         break;
4774                     }
4775                     sChar = source.charAt(sIndex++); // [sIndex++];
4776                     sOrder = latinOneCEs_[offset + sChar];
4777                     if (isSpecial(sOrder)) {
4778                         m_ContInfo_.index = sIndex;
4779                         sOrder = getLatinOneContraction(2, sOrder, source);
4780                         sIndex = m_ContInfo_.index;
4781                     }
4782                 }
4783                 while (tOrder == 0) {
4784                     if (tIndex == tLen) {
4785                         if (endOfSource) {
4786                             return 0; // if both strings are at the end, they are equal
4787                         } else {
4788                             return 1;
4789                         }
4790                     }
4791                     tChar = target.charAt(tIndex++); // [tIndex++];
4792                     tOrder = latinOneCEs_[offset + tChar];
4793                     if (isSpecial(tOrder)) {
4794                         m_ContInfo_.index = tIndex;
4795                         tOrder = getLatinOneContraction(2, tOrder, target);
4796                         tIndex = m_ContInfo_.index;
4797                     }
4798                 }
4799                 if (endOfSource) {
4800                     return -1;
4801                 }
4802                 if (sOrder == tOrder) {
4803                     sOrder = 0;
4804                     tOrder = 0;
4805                     continue;
4806                 } else {
4807                     if (((sOrder ^ tOrder) & 0xff000000) != 0) {
4808                         if (sOrder >>> 8 < tOrder >>> 8) {
4809                             return -1;
4810                         } else {
4811                             return 1;
4812                         }
4813                     }
4814                     sOrder <<= 8;
4815                     tOrder <<= 8;
4816                 }
4817             }
4818         }
4819         return 0;
4820     }
4821
4822     /**
4823      * Get the version of this collator object.
4824      *
4825      * @return the version object associated with this collator
4826      * @stable ICU 2.8
4827      */
4828     public VersionInfo getVersion() {
4829         /* RunTime version */
4830         int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
4831         /* Builder version */
4832         int bdVersion = m_version_.getMajor();
4833
4834         /*
4835          * Charset Version. Need to get the version from cnv files makeconv should populate cnv files with version and
4836          * an api has to be provided in ucnv.h to obtain this version
4837          */
4838         int csVersion = 0;
4839
4840         /* combine the version info */
4841         int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF;
4842
4843         /* Tailoring rules */
4844         return VersionInfo.getInstance(cmbVersion >> 8, cmbVersion & 0xFF, m_version_.getMinor(),
4845         UCA_.m_UCA_version_.getMajor());
4846
4847         // versionInfo[0] = (uint8_t)(cmbVersion>>8);
4848         // versionInfo[1] = (uint8_t)cmbVersion;
4849         // versionInfo[2] = coll->image->version[1];
4850         // versionInfo[3] = coll->UCA->image->UCAVersion[0];
4851     }
4852
4853     /**
4854      * Get the UCA version of this collator object.
4855      *
4856      * @return the version object associated with this collator
4857      * @stable ICU 2.8
4858      */
4859     public VersionInfo getUCAVersion() {
4860         return UCA_.m_UCA_version_;
4861     }
4862
4863     private transient boolean m_reallocLatinOneCEs_;
4864
4865     private CollationBuffer collationBuffer;
4866
4867     private final CollationBuffer getCollationBuffer() {
4868         if (isFrozen()) {
4869             frozenLock.lock();
4870         }
4871         if (collationBuffer == null) {
4872             collationBuffer = new CollationBuffer();
4873         } else {
4874             collationBuffer.resetBuffers();
4875         }
4876         return collationBuffer;
4877     }
4878
4879     private final void releaseCollationBuffer(CollationBuffer buffer) {
4880         if (isFrozen()) {
4881             frozenLock.unlock();
4882         }
4883     }
4884 }