-//##header\r
-/**\r
-*******************************************************************************\r
-* Copyright (C) 1996-2009, International Business Machines Corporation and *\r
-* others. All Rights Reserved. *\r
-*******************************************************************************\r
-*/\r
-package com.ibm.icu.text;\r
-\r
-import java.io.IOException;\r
-import java.text.CharacterIterator;\r
-import java.text.ParseException;\r
-import java.util.Arrays;\r
-import java.util.MissingResourceException;\r
-\r
-//#if defined(FOUNDATION10) || defined(J2SE13) || defined(ECLIPSE_FRAGMENT)\r
-//##import com.ibm.icu.impl.ByteBuffer;\r
-//#else\r
-import java.nio.ByteBuffer;\r
-//#endif\r
-\r
-import com.ibm.icu.impl.BOCU;\r
-import com.ibm.icu.impl.ICUDebug;\r
-import com.ibm.icu.impl.ICUResourceBundle;\r
-import com.ibm.icu.impl.ImplicitCEGenerator;\r
-import com.ibm.icu.impl.IntTrie;\r
-import com.ibm.icu.impl.StringUCharacterIterator;\r
-import com.ibm.icu.impl.Trie;\r
-import com.ibm.icu.impl.TrieIterator;\r
-import com.ibm.icu.impl.Utility;\r
-import com.ibm.icu.lang.UCharacter;\r
-import com.ibm.icu.util.RangeValueIterator;\r
-import com.ibm.icu.util.ULocale;\r
-import com.ibm.icu.util.UResourceBundle;\r
-import com.ibm.icu.util.VersionInfo;\r
-\r
-/**\r
- * <p>RuleBasedCollator is a concrete subclass of Collator. It allows\r
- * customization of the Collator via user-specified rule sets.\r
- * RuleBasedCollator is designed to be fully compliant to the <a\r
- * href="http://www.unicode.org/unicode/reports/tr10/">Unicode\r
- * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>\r
- *\r
- * <p>Users are strongly encouraged to read <a\r
- * href="http://www.icu-project.org/userguide/Collate_Intro.html">\r
- * the users guide</a> for more information about the collation\r
- * service before using this class.</p>\r
- *\r
- * <p>Create a RuleBasedCollator from a locale by calling the\r
- * getInstance(Locale) factory method in the base class Collator.\r
- * Collator.getInstance(Locale) creates a RuleBasedCollator object\r
- * based on the collation rules defined by the argument locale. If a\r
- * customized collation ordering or attributes is required, use the\r
- * RuleBasedCollator(String) constructor with the appropriate\r
- * rules. The customized RuleBasedCollator will base its ordering on\r
- * UCA, while re-adjusting the attributes and orders of the characters\r
- * in the specified rule accordingly.</p>\r
- *\r
- * <p>RuleBasedCollator provides correct collation orders for most\r
- * locales supported in ICU. If specific data for a locale is not\r
- * available, the orders eventually falls back to the <a\r
- * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation\r
- * order </a>.</p>\r
- *\r
- * <p>For information about the collation rule syntax and details\r
- * about customization, please refer to the\r
- * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">\r
- * Collation customization</a> section of the user's guide.</p>\r
- *\r
- * <p><strong>Note</strong> that there are some differences between\r
- * the Collation rule syntax used in Java and ICU4J:\r
- *\r
- * <ul>\r
- * <li>According to the JDK documentation:\r
- * <i>\r
- * <p>\r
- * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule\r
- * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a\r
- * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the\r
- * range \U0EC0-\U0EC4 precedes a Lao consonant of the range\r
- * \U0E81-\U0EAE then the\r
- * vowel is placed after the consonant for collation purposes.\r
- * </p>\r
- * <p>\r
- * If a rule is without the modifier '!', the Thai/Lao vowel-consonant\r
- * swapping is not turned on.\r
- * </p>\r
- * </i>\r
- * <p>\r
- * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao\r
- * vowel-consonant swapping, since the UCA clearly states that it has to be\r
- * supported to ensure a correct sorting order. If a '!' is encountered, it is\r
- * ignored.\r
- * </p>\r
- * <li>As mentioned in the documentation of the base class Collator,\r
- * compatibility decomposition mode is not supported.\r
- * </ul>\r
- * <p>\r
- * <strong>Examples</strong>\r
- * </p>\r
- * <p>\r
- * Creating Customized RuleBasedCollators:\r
- * <blockquote>\r
- * <pre>\r
- * String simple = "& a < b < c < d";\r
- * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);\r
- *\r
- * String norwegian = "& a , A < b , B < c , C < d , D < e , E "\r
- * + "< f , F < g , G < h , H < i , I < j , "\r
- * + "J < k , K < l , L < m , M < n , N < "\r
- * + "o , O < p , P < q , Q < r , R < s , S < "\r
- * + "t , T < u , U < v , V < w , W < x , X "\r
- * + "< y , Y < z , Z < \u00E5 = a\u030A "\r
- * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "\r
- * + ", \u00C6 < \u00F8 , \u00D8";\r
- * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);\r
- * </pre>\r
- * </blockquote>\r
- *\r
- * Concatenating rules to combine <code>Collator</code>s:\r
- * <blockquote>\r
- * <pre>\r
- * // Create an en_US Collator object\r
- * RuleBasedCollator en_USCollator = (RuleBasedCollator)\r
- * Collator.getInstance(new Locale("en", "US", ""));\r
- * // Create a da_DK Collator object\r
- * RuleBasedCollator da_DKCollator = (RuleBasedCollator)\r
- * Collator.getInstance(new Locale("da", "DK", ""));\r
- * // Combine the two\r
- * // First, get the collation rules from en_USCollator\r
- * String en_USRules = en_USCollator.getRules();\r
- * // Second, get the collation rules from da_DKCollator\r
- * String da_DKRules = da_DKCollator.getRules();\r
- * RuleBasedCollator newCollator =\r
- * new RuleBasedCollator(en_USRules + da_DKRules);\r
- * // newCollator has the combined rules\r
- * </pre>\r
- * </blockquote>\r
- *\r
- * Making changes to an existing RuleBasedCollator to create a new\r
- * <code>Collator</code> object, by appending changes to the existing rule:\r
- * <blockquote>\r
- * <pre>\r
- * // Create a new Collator object with additional rules\r
- * String addRules = "& C < ch, cH, Ch, CH";\r
- * RuleBasedCollator myCollator =\r
- * new RuleBasedCollator(en_USCollator.getRules() + addRules);\r
- * // myCollator contains the new rules\r
- * </pre>\r
- * </blockquote>\r
- *\r
- * How to change the order of non-spacing accents:\r
- * <blockquote>\r
- * <pre>\r
- * // old rule with main accents\r
- * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "\r
- * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "\r
- * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "\r
- * + "; \u030B ; \u030C ; \u030D ; \u030E "\r
- * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "\r
- * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "\r
- * + "< b , B < c, C < e, E & C < d , D";\r
- * // change the order of accent characters\r
- * String addOn = "& \u0300 ; \u0308 ; \u0302";\r
- * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);\r
- * </pre>\r
- * </blockquote>\r
- *\r
- * Putting in a new primary ordering before the default setting,\r
- * e.g. sort English characters before or after Japanese characters in the Japanese\r
- * <code>Collator</code>:\r
- * <blockquote>\r
- * <pre>\r
- * // get en_US Collator rules\r
- * RuleBasedCollator en_USCollator\r
- * = (RuleBasedCollator)Collator.getInstance(Locale.US);\r
- * // add a few Japanese characters to sort before English characters\r
- * // suppose the last character before the first base letter 'a' in\r
- * // the English collation rule is \u2212\r
- * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "\r
- * + "\u3044";\r
- * RuleBasedCollator myJapaneseCollator\r
- * = new RuleBasedCollator(en_USCollator.getRules() + jaString);\r
- * </pre>\r
- * </blockquote>\r
- * </p>\r
- * <p>\r
- * This class is not subclassable\r
- * </p>\r
- * @author Syn Wee Quek\r
- * @stable ICU 2.8\r
- */\r
-public final class RuleBasedCollator extends Collator\r
-{ \r
- // public constructors ---------------------------------------------------\r
-\r
- /**\r
- * <p>\r
- * Constructor that takes the argument rules for\r
- * customization. The collator will be based on UCA,\r
- * with the attributes and re-ordering of the characters specified in the\r
- * argument rules.\r
- * </p>\r
- * <p>See the user guide's section on\r
- * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">\r
- * Collation Customization</a> for details on the rule syntax.\r
- * </p>\r
- * @param rules the collation rules to build the collation table from.\r
- * @exception ParseException and IOException thrown. ParseException thrown\r
- * when argument rules have an invalid syntax. IOException\r
- * thrown when an error occured while reading internal data.\r
- * @stable ICU 2.8\r
- */\r
- public RuleBasedCollator(String rules) throws Exception\r
- {\r
- checkUCA();\r
- if (rules == null) {\r
- throw new IllegalArgumentException(\r
- "Collation rules can not be null");\r
- }\r
- init(rules);\r
- }\r
-\r
- // public methods --------------------------------------------------------\r
-\r
- /**\r
- * Clones the RuleBasedCollator\r
- * @return a new instance of this RuleBasedCollator object\r
- * @stable ICU 2.8\r
- */\r
- public Object clone() throws CloneNotSupportedException\r
- {\r
- RuleBasedCollator result = (RuleBasedCollator)super.clone();\r
- if (latinOneCEs_ != null) {\r
- result.m_reallocLatinOneCEs_ = true;\r
- result.m_ContInfo_ = new ContractionInfo();\r
- }\r
-\r
- // since all collation data in the RuleBasedCollator do not change\r
- // we can safely assign the result.fields to this collator\r
- result.initUtility(false); // let the new clone have their own util\r
- // iterators\r
- return result;\r
- }\r
-\r
- /**\r
- * Return a CollationElementIterator for the given String.\r
- * @see CollationElementIterator\r
- * @stable ICU 2.8\r
- */\r
- public CollationElementIterator getCollationElementIterator(String source)\r
- {\r
- return new CollationElementIterator(source, this);\r
- }\r
-\r
- /**\r
- * Return a CollationElementIterator for the given CharacterIterator.\r
- * The source iterator's integrity will be preserved since a new copy\r
- * will be created for use.\r
- * @see CollationElementIterator\r
- * @stable ICU 2.8\r
- */\r
- public CollationElementIterator getCollationElementIterator(\r
- CharacterIterator source)\r
- {\r
- CharacterIterator newsource = (CharacterIterator)source.clone();\r
- return new CollationElementIterator(newsource, this);\r
- }\r
- \r
- /**\r
- * Return a CollationElementIterator for the given UCharacterIterator.\r
- * The source iterator's integrity will be preserved since a new copy\r
- * will be created for use.\r
- * @see CollationElementIterator\r
- * @stable ICU 2.8\r
- */\r
- public CollationElementIterator getCollationElementIterator(\r
- UCharacterIterator source)\r
- {\r
- return new CollationElementIterator(source, this);\r
- }\r
-\r
- // public setters --------------------------------------------------------\r
-\r
- /**\r
- * Sets the Hiragana Quaternary mode to be on or off.\r
- * When the Hiragana Quaternary mode is turned on, the collator\r
- * positions Hiragana characters before all non-ignorable characters in\r
- * QUATERNARY strength. This is to produce a correct JIS collation order,\r
- * distinguishing between Katakana and Hiragana characters.\r
- * @param flag true if Hiragana Quaternary mode is to be on, false\r
- * otherwise\r
- * @see #setHiraganaQuaternaryDefault\r
- * @see #isHiraganaQuaternary\r
- * @stable ICU 2.8\r
- */\r
- public void setHiraganaQuaternary(boolean flag)\r
- {\r
- m_isHiragana4_ = flag;\r
- updateInternalState(); \r
- }\r
-\r
- /**\r
- * Sets the Hiragana Quaternary mode to the initial mode set during\r
- * construction of the RuleBasedCollator.\r
- * See setHiraganaQuaternary(boolean) for more details.\r
- * @see #setHiraganaQuaternary(boolean)\r
- * @see #isHiraganaQuaternary\r
- * @stable ICU 2.8\r
- */\r
- public void setHiraganaQuaternaryDefault()\r
- {\r
- m_isHiragana4_ = m_defaultIsHiragana4_;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets whether uppercase characters sort before lowercase\r
- * characters or vice versa, in strength TERTIARY. The default\r
- * mode is false, and so lowercase characters sort before uppercase\r
- * characters.\r
- * If true, sort upper case characters first.\r
- * @param upperfirst true to sort uppercase characters before\r
- * lowercase characters, false to sort lowercase\r
- * characters before uppercase characters\r
- * @see #isLowerCaseFirst\r
- * @see #isUpperCaseFirst\r
- * @see #setLowerCaseFirst\r
- * @see #setCaseFirstDefault\r
- * @stable ICU 2.8\r
- */\r
- public void setUpperCaseFirst(boolean upperfirst)\r
- {\r
- if (upperfirst) {\r
- if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_caseFirst_ = AttributeValue.UPPER_FIRST_;\r
- }\r
- else {\r
- if(m_caseFirst_ != AttributeValue.OFF_) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_caseFirst_ = AttributeValue.OFF_;\r
- }\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the orders of lower cased characters to sort before upper cased\r
- * characters, in strength TERTIARY. The default\r
- * mode is false.\r
- * If true is set, the RuleBasedCollator will sort lower cased characters\r
- * before the upper cased ones.\r
- * Otherwise, if false is set, the RuleBasedCollator will ignore case\r
- * preferences.\r
- * @param lowerfirst true for sorting lower cased characters before\r
- * upper cased characters, false to ignore case\r
- * preferences.\r
- * @see #isLowerCaseFirst\r
- * @see #isUpperCaseFirst\r
- * @see #setUpperCaseFirst\r
- * @see #setCaseFirstDefault\r
- * @stable ICU 2.8\r
- */\r
- public void setLowerCaseFirst(boolean lowerfirst)\r
- {\r
- if (lowerfirst) {\r
- if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_caseFirst_ = AttributeValue.LOWER_FIRST_;\r
- }\r
- else {\r
- if(m_caseFirst_ != AttributeValue.OFF_) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_caseFirst_ = AttributeValue.OFF_;\r
- }\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the case first mode to the initial mode set during\r
- * construction of the RuleBasedCollator.\r
- * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more\r
- * details.\r
- * @see #isLowerCaseFirst\r
- * @see #isUpperCaseFirst\r
- * @see #setLowerCaseFirst(boolean)\r
- * @see #setUpperCaseFirst(boolean)\r
- * @stable ICU 2.8\r
- */\r
- public final void setCaseFirstDefault()\r
- {\r
- if(m_caseFirst_ != m_defaultCaseFirst_) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_caseFirst_ = m_defaultCaseFirst_;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the alternate handling mode to the initial mode set during\r
- * construction of the RuleBasedCollator.\r
- * See setAlternateHandling(boolean) for more details.\r
- * @see #setAlternateHandlingShifted(boolean)\r
- * @see #isAlternateHandlingShifted()\r
- * @stable ICU 2.8\r
- */\r
- public void setAlternateHandlingDefault()\r
- {\r
- m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the case level mode to the initial mode set during\r
- * construction of the RuleBasedCollator.\r
- * See setCaseLevel(boolean) for more details.\r
- * @see #setCaseLevel(boolean)\r
- * @see #isCaseLevel\r
- * @stable ICU 2.8\r
- */\r
- public void setCaseLevelDefault()\r
- {\r
- m_isCaseLevel_ = m_defaultIsCaseLevel_;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the decomposition mode to the initial mode set during construction\r
- * of the RuleBasedCollator.\r
- * See setDecomposition(int) for more details.\r
- * @see #getDecomposition\r
- * @see #setDecomposition(int)\r
- * @stable ICU 2.8\r
- */\r
- public void setDecompositionDefault()\r
- {\r
- setDecomposition(m_defaultDecomposition_);\r
- updateInternalState(); \r
- }\r
-\r
- /**\r
- * Sets the French collation mode to the initial mode set during\r
- * construction of the RuleBasedCollator.\r
- * See setFrenchCollation(boolean) for more details.\r
- * @see #isFrenchCollation\r
- * @see #setFrenchCollation(boolean)\r
- * @stable ICU 2.8\r
- */\r
- public void setFrenchCollationDefault()\r
- {\r
- if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_isFrenchCollation_ = m_defaultIsFrenchCollation_;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the collation strength to the initial mode set during the\r
- * construction of the RuleBasedCollator.\r
- * See setStrength(int) for more details.\r
- * @see #setStrength(int)\r
- * @see #getStrength\r
- * @stable ICU 2.8\r
- */\r
- public void setStrengthDefault()\r
- {\r
- setStrength(m_defaultStrength_);\r
- updateInternalState(); \r
- }\r
- \r
- /**\r
- * Method to set numeric collation to its default value.\r
- * When numeric collation is turned on, this Collator generates a collation \r
- * key for the numeric value of substrings of digits. This is a way to get \r
- * '100' to sort AFTER '2'\r
- * @see #getNumericCollation\r
- * @see #setNumericCollation\r
- * @stable ICU 2.8\r
- */\r
- public void setNumericCollationDefault()\r
- {\r
- setNumericCollation(m_defaultIsNumericCollation_);\r
- updateInternalState(); \r
- }\r
-\r
- /**\r
- * Sets the mode for the direction of SECONDARY weights to be used in\r
- * French collation.\r
- * The default value is false, which treats SECONDARY weights in the order\r
- * they appear.\r
- * If set to true, the SECONDARY weights will be sorted backwards.\r
- * See the section on\r
- * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">\r
- * French collation</a> for more information.\r
- * @param flag true to set the French collation on, false to set it off\r
- * @stable ICU 2.8\r
- * @see #isFrenchCollation\r
- * @see #setFrenchCollationDefault\r
- */\r
- public void setFrenchCollation(boolean flag)\r
- {\r
- if(m_isFrenchCollation_ != flag) {\r
- latinOneRegenTable_ = true;\r
- }\r
- m_isFrenchCollation_ = flag;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Sets the alternate handling for QUATERNARY strength to be either\r
- * shifted or non-ignorable.\r
- * See the UCA definition on\r
- * <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">\r
- * Alternate Weighting</a>.\r
- * This attribute will only be effective when QUATERNARY strength is set.\r
- * The default value for this mode is false, corresponding to the\r
- * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the\r
- * RuleBasedCollator will treats all the codepoints with non-ignorable\r
- * primary weights in the same way.\r
- * If the mode is set to true, the behaviour corresponds to SHIFTED defined\r
- * in UCA, this causes codepoints with PRIMARY orders that are equal or\r
- * below the variable top value to be ignored in PRIMARY order and\r
- * moved to the QUATERNARY order.\r
- * @param shifted true if SHIFTED behaviour for alternate handling is\r
- * desired, false for the NON_IGNORABLE behaviour.\r
- * @see #isAlternateHandlingShifted\r
- * @see #setAlternateHandlingDefault\r
- * @stable ICU 2.8\r
- */\r
- public void setAlternateHandlingShifted(boolean shifted)\r
- {\r
- m_isAlternateHandlingShifted_ = shifted;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * <p>\r
- * When case level is set to true, an additional weight is formed\r
- * between the SECONDARY and TERTIARY weight, known as the case level.\r
- * The case level is used to distinguish large and small Japanese Kana\r
- * characters. Case level could also be used in other situations.\r
- * For example to distinguish certain Pinyin characters.\r
- * The default value is false, which means the case level is not generated.\r
- * The contents of the case level are affected by the case first\r
- * mode. A simple way to ignore accent differences in a string is to set\r
- * the strength to PRIMARY and enable case level.\r
- * </p>\r
- * <p>\r
- * See the section on\r
- * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">\r
- * case level</a> for more information.\r
- * </p>\r
- * @param flag true if case level sorting is required, false otherwise\r
- * @stable ICU 2.8\r
- * @see #setCaseLevelDefault\r
- * @see #isCaseLevel\r
- */\r
- public void setCaseLevel(boolean flag)\r
- {\r
- m_isCaseLevel_ = flag;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * <p>\r
- * Sets this Collator's strength property. The strength property\r
- * determines the minimum level of difference considered significant\r
- * during comparison.\r
- * </p>\r
- * <p>See the Collator class description for an example of use.</p>\r
- * @param newStrength the new strength value.\r
- * @see #getStrength\r
- * @see #setStrengthDefault\r
- * @see #PRIMARY\r
- * @see #SECONDARY\r
- * @see #TERTIARY\r
- * @see #QUATERNARY\r
- * @see #IDENTICAL\r
- * @exception IllegalArgumentException If the new strength value is not one\r
- * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.\r
- * @stable ICU 2.8\r
- */\r
- public void setStrength(int newStrength)\r
- {\r
- super.setStrength(newStrength);\r
- updateInternalState();\r
- }\r
- \r
- /** \r
- * <p>\r
- * Variable top is a two byte primary value which causes all the codepoints \r
- * with primary values that are less or equal than the variable top to be \r
- * shifted when alternate handling is set to SHIFTED.\r
- * </p>\r
- * <p>\r
- * Sets the variable top to a collation element value of a string supplied.\r
- * </p> \r
- * @param varTop one or more (if contraction) characters to which the \r
- * variable top should be set\r
- * @return a int value containing the value of the variable top in upper 16\r
- * bits. Lower 16 bits are undefined.\r
- * @exception IllegalArgumentException is thrown if varTop argument is not \r
- * a valid variable top element. A variable top element is \r
- * invalid when \r
- * <ul>\r
- * <li>it is a contraction that does not exist in the\r
- * Collation order\r
- * <li>when the PRIMARY strength collation element for the \r
- * variable top has more than two bytes\r
- * <li>when the varTop argument is null or zero in length.\r
- * </ul>\r
- * @see #getVariableTop\r
- * @see RuleBasedCollator#setAlternateHandlingShifted\r
- * @stable ICU 2.6\r
- */\r
- public int setVariableTop(String varTop)\r
- {\r
- if (varTop == null || varTop.length() == 0) {\r
- throw new IllegalArgumentException(\r
- "Variable top argument string can not be null or zero in length.");\r
- }\r
- if (m_srcUtilIter_ == null) {\r
- initUtility(true);\r
- }\r
-\r
- m_srcUtilColEIter_.setText(varTop);\r
- int ce = m_srcUtilColEIter_.next();\r
- \r
- // here we check if we have consumed all characters \r
- // you can put in either one character or a contraction\r
- // you shouldn't put more... \r
- if (m_srcUtilColEIter_.getOffset() != varTop.length() \r
- || ce == CollationElementIterator.NULLORDER) {\r
- throw new IllegalArgumentException(\r
- "Variable top argument string is a contraction that does not exist "\r
- + "in the Collation order");\r
- }\r
- \r
- int nextCE = m_srcUtilColEIter_.next();\r
- \r
- if ((nextCE != CollationElementIterator.NULLORDER) \r
- && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {\r
- throw new IllegalArgumentException(\r
- "Variable top argument string can only have a single collation "\r
- + "element that has less than or equal to two PRIMARY strength "\r
- + "bytes");\r
- }\r
- \r
- m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;\r
- \r
- return ce & CE_PRIMARY_MASK_;\r
- }\r
- \r
- /** \r
- * Sets the variable top to a collation element value supplied.\r
- * Variable top is set to the upper 16 bits. \r
- * Lower 16 bits are ignored.\r
- * @param varTop Collation element value, as returned by setVariableTop or \r
- * getVariableTop\r
- * @see #getVariableTop\r
- * @see #setVariableTop(String)\r
- * @stable ICU 2.6\r
- */\r
- public void setVariableTop(int varTop)\r
- {\r
- m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;\r
- }\r
- \r
- /**\r
- * When numeric collation is turned on, this Collator generates a collation \r
- * key for the numeric value of substrings of digits. This is a way to get \r
- * '100' to sort AFTER '2'\r
- * @param flag true to turn numeric collation on and false to turn it off\r
- * @see #getNumericCollation\r
- * @see #setNumericCollationDefault\r
- * @stable ICU 2.8\r
- */\r
- public void setNumericCollation(boolean flag)\r
- {\r
- // sort substrings of digits as numbers\r
- m_isNumericCollation_ = flag;\r
- updateInternalState();\r
- }\r
-\r
- // public getters --------------------------------------------------------\r
-\r
- /**\r
- * Gets the collation rules for this RuleBasedCollator.\r
- * Equivalent to String getRules(RuleOption.FULL_RULES).\r
- * @return returns the collation rules\r
- * @see #getRules(boolean)\r
- * @stable ICU 2.8\r
- */\r
- public String getRules()\r
- {\r
- return m_rules_;\r
- }\r
- \r
- /**\r
- * Returns current rules. The argument defines whether full rules \r
- * (UCA + tailored) rules are returned or just the tailoring. \r
- * @param fullrules true if the rules that defines the full set of \r
- * collation order is required, otherwise false for returning only \r
- * the tailored rules\r
- * @return the current rules that defines this Collator.\r
- * @see #getRules()\r
- * @stable ICU 2.6\r
- */\r
- public String getRules(boolean fullrules)\r
- {\r
- if (!fullrules) {\r
- return m_rules_;\r
- }\r
- // take the UCA rules and append real rules at the end \r
- return UCA_.m_rules_.concat(m_rules_);\r
- }\r
-\r
- /**\r
- * Get an UnicodeSet that contains all the characters and sequences\r
- * tailored in this collator.\r
- * @return a pointer to a UnicodeSet object containing all the\r
- * code points and sequences that may sort differently than\r
- * in the UCA.\r
- * @exception ParseException thrown when argument rules have an\r
- * invalid syntax. IOException\r
- * @stable ICU 2.4\r
- */\r
- public UnicodeSet getTailoredSet()\r
- {\r
- try {\r
- CollationRuleParser src = new CollationRuleParser(getRules());\r
- return src.getTailoredSet();\r
- } catch(Exception e) {\r
- throw new IllegalStateException("A tailoring rule should not " +\r
- "have errors. Something is quite wrong!");\r
- }\r
- }\r
-\r
- private class contContext {\r
- RuleBasedCollator coll;\r
- UnicodeSet contractions;\r
- UnicodeSet expansions;\r
- UnicodeSet removedContractions;\r
- boolean addPrefixes; \r
- contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions, \r
- UnicodeSet removedContractions, boolean addPrefixes) {\r
- this.coll = coll;\r
- this.contractions = contractions;\r
- this.expansions = expansions;\r
- this.removedContractions = removedContractions;\r
- this.addPrefixes = addPrefixes;\r
- }\r
- }\r
- \r
    // Recursively walks the contraction/prefix table entry addressed by the
    // special CE and records every complete contraction (and, when the
    // resulting CE is an expansion, every expansion source string) into the
    // sets carried by the context c.
    //   c      - collector context (collator, target sets, options)
    //   buffer - the string assembled so far for the current table entry
    //   CE     - the special collation element whose table row is walked
    private void
    addSpecial(contContext c, StringBuffer buffer, int CE)
    {
        StringBuffer b = new StringBuffer();
        // low 24 bits of a special CE index into the contraction table
        int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
        int newCE = c.coll.m_contractionCE_[offset];
        // we might have a contraction that ends from previous level
        if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
            // a contraction row can chain into a prefix (pre-context) row;
            // follow it first so the prefix characters end up in front
            if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ 
               && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ 
               && c.addPrefixes) {
                addSpecial(c, buffer, newCE);
            }
            // length > 1 means at least one character was appended to the
            // seed character, i.e. a real contraction was formed
            if(buffer.length() > 1) {
                if(c.contractions != null) {
                    c.contractions.add(buffer.toString());
                }
                if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
                    c.expansions.add(buffer.toString());
                }
            }
        } 
        
        offset++;
        // check whether we're doing contraction or prefix
        if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
            // prefix rows: each table character is PREPENDED to the buffer;
            // 0xFFFF terminates the row
            while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
                b.delete(0, b.length());
                b.append(buffer);
                newCE = c.coll.m_contractionCE_[offset];
                b.insert(0, c.coll.m_contractionIndex_[offset]);
                // nested specials recurse; plain CEs are recorded directly
                if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
                    addSpecial(c, b, newCE);
                } else {
                    if(c.contractions != null) {
                        c.contractions.add(b.toString());
                    }
                    if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
                        c.expansions.add(b.toString());
                    }
                }
                offset++;
            }
        } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
            // contraction rows: each table character is APPENDED to the
            // buffer; 0xFFFF terminates the row
            while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
                b.delete(0, b.length());
                b.append(buffer);
                newCE = c.coll.m_contractionCE_[offset];
                b.append(c.coll.m_contractionIndex_[offset]);
                // nested specials recurse; plain CEs are recorded directly
                if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
                    addSpecial(c, b, newCE);
                } else {
                    if(c.contractions != null) {
                        c.contractions.add(b.toString());
                    }
                    if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
                        c.expansions.add(b.toString());
                    }
                }
                offset++;
            }
        }
    }
- \r
- private\r
- void processSpecials(contContext c) \r
- {\r
- int internalBufferSize = 512;\r
- TrieIterator trieiterator \r
- = new TrieIterator(c.coll.m_trie_);\r
- RangeValueIterator.Element element = new RangeValueIterator.Element();\r
- while (trieiterator.next(element)) {\r
- int start = element.start;\r
- int limit = element.limit;\r
- int CE = element.value;\r
- StringBuffer contraction = new StringBuffer(internalBufferSize);\r
- \r
- if(isSpecial(CE)) {\r
- if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {\r
- while(start < limit) {\r
- // if there are suppressed contractions, we don't \r
- // want to add them.\r
- if(c.removedContractions != null && c.removedContractions.contains(start)) {\r
- start++;\r
- continue;\r
- }\r
- // we start our contraction from middle, since we don't know if it\r
- // will grow toward right or left\r
- contraction.append((char) start);\r
- addSpecial(c, contraction, CE);\r
- start++;\r
- }\r
- } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {\r
- while(start < limit) {\r
- c.expansions.add(start++);\r
- }\r
- }\r
- }\r
- }\r
- }\r
- \r
- /**\r
- * Gets unicode sets containing contractions and/or expansions of a collator\r
- * @param contractions if not null, set to contain contractions\r
- * @param expansions if not null, set to contain expansions\r
- * @param addPrefixes add the prefix contextual elements to contractions\r
- * @throws Exception \r
- * @stable ICU 3.4\r
- */\r
- public void\r
- getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,\r
- boolean addPrefixes) throws Exception {\r
- if(contractions != null) {\r
- contractions.clear();\r
- }\r
- if(expansions != null) {\r
- expansions.clear();\r
- }\r
- String rules = getRules();\r
- try {\r
- CollationRuleParser src = new CollationRuleParser(rules);\r
- contContext c = new contContext(RuleBasedCollator.UCA_, \r
- contractions, expansions, src.m_removeSet_, addPrefixes);\r
- \r
- // Add the UCA contractions\r
- processSpecials(c);\r
- // This is collator specific. Add contractions from a collator\r
- c.coll = this;\r
- c.removedContractions = null;\r
- processSpecials(c);\r
- } catch (Exception e) {\r
- throw e;\r
- }\r
- }\r
- \r
- /**\r
- * <p>\r
- * Get a Collation key for the argument String source from this\r
- * RuleBasedCollator.\r
- * </p>\r
- * <p>\r
- * General recommendation: <br>\r
- * If comparison are to be done to the same String multiple times, it would\r
- * be more efficient to generate CollationKeys for the Strings and use\r
- * CollationKey.compareTo(CollationKey) for the comparisons.\r
- * If the each Strings are compared to only once, using the method\r
- * RuleBasedCollator.compare(String, String) will have a better performance.\r
- * </p>\r
- * <p>\r
- * See the class documentation for an explanation about CollationKeys.\r
- * </p>\r
- * @param source the text String to be transformed into a collation key.\r
- * @return the CollationKey for the given String based on this\r
- * RuleBasedCollator's collation rules. If the source String is\r
- * null, a null CollationKey is returned.\r
- * @see CollationKey\r
- * @see #compare(String, String)\r
- * @see #getRawCollationKey\r
- * @stable ICU 2.8\r
- */\r
- public CollationKey getCollationKey(String source) {\r
- if (source == null) {\r
- return null;\r
- }\r
- m_utilRawCollationKey_ = getRawCollationKey(source, \r
- m_utilRawCollationKey_);\r
- return new CollationKey(source, m_utilRawCollationKey_);\r
- }\r
- \r
- /**\r
- * Gets the simpler form of a CollationKey for the String source following\r
- * the rules of this Collator and stores the result into the user provided \r
- * argument key. \r
- * If key has a internal byte array of length that's too small for the \r
- * result, the internal byte array will be grown to the exact required \r
- * size.\r
- * @param source the text String to be transformed into a RawCollationKey \r
- * @param key output RawCollationKey to store results\r
- * @return If key is null, a new instance of RawCollationKey will be \r
- * created and returned, otherwise the user provided key will be \r
- * returned.\r
- * @see #getCollationKey \r
- * @see #compare(String, String)\r
- * @see RawCollationKey\r
- * @stable ICU 2.8\r
- */\r
- public RawCollationKey getRawCollationKey(String source, \r
- RawCollationKey key)\r
- {\r
- if (source == null) {\r
- return null;\r
- }\r
- int strength = getStrength();\r
- m_utilCompare0_ = m_isCaseLevel_;\r
- //m_utilCompare1_ = true;\r
- m_utilCompare2_ = strength >= SECONDARY;\r
- m_utilCompare3_ = strength >= TERTIARY;\r
- m_utilCompare4_ = strength >= QUATERNARY;\r
- m_utilCompare5_ = strength == IDENTICAL;\r
-\r
- m_utilBytesCount0_ = 0;\r
- m_utilBytesCount1_ = 0;\r
- m_utilBytesCount2_ = 0;\r
- m_utilBytesCount3_ = 0;\r
- m_utilBytesCount4_ = 0;\r
- //m_utilBytesCount5_ = 0;\r
- //m_utilCount0_ = 0;\r
- //m_utilCount1_ = 0;\r
- m_utilCount2_ = 0;\r
- m_utilCount3_ = 0;\r
- m_utilCount4_ = 0;\r
- //m_utilCount5_ = 0;\r
- boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;\r
- // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.\r
- // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so\r
- // high.\r
- int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;\r
- byte hiragana4 = 0;\r
- if (m_isHiragana4_ && m_utilCompare4_) {\r
- // allocate one more space for hiragana, value for hiragana\r
- hiragana4 = (byte)commonBottom4;\r
- commonBottom4 ++;\r
- }\r
-\r
- int bottomCount4 = 0xFF - commonBottom4;\r
- // If we need to normalize, we'll do it all at once at the beginning!\r
- if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)\r
- != Normalizer.YES) {\r
- // if it is identical strength, we have to normalize the string to\r
- // NFD so that it will be appended correctly to the end of the sort\r
- // key\r
- source = Normalizer.decompose(source, false);\r
- }\r
- else if (getDecomposition() != NO_DECOMPOSITION\r
- && Normalizer.quickCheck(source, Normalizer.FCD,0)\r
- != Normalizer.YES) {\r
- // for the rest of the strength, if decomposition is on, FCD is\r
- // enough for us to work on.\r
- source = Normalizer.normalize(source,Normalizer.FCD);\r
- }\r
- getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,\r
- bottomCount4);\r
- if (key == null) {\r
- key = new RawCollationKey();\r
- }\r
- getSortKey(source, doFrench, commonBottom4, bottomCount4, key);\r
- return key;\r
- }\r
-\r
- /**\r
- * Return true if an uppercase character is sorted before the corresponding lowercase character.\r
- * See setCaseFirst(boolean) for details.\r
- * @see #setUpperCaseFirst\r
- * @see #setLowerCaseFirst\r
- * @see #isLowerCaseFirst\r
- * @see #setCaseFirstDefault\r
- * @return true if upper cased characters are sorted before lower cased\r
- * characters, false otherwise\r
- * @stable ICU 2.8\r
- */\r
- public boolean isUpperCaseFirst()\r
- {\r
- return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);\r
- }\r
- \r
- /**\r
- * Return true if a lowercase character is sorted before the corresponding uppercase character.\r
- * See setCaseFirst(boolean) for details.\r
- * @see #setUpperCaseFirst\r
- * @see #setLowerCaseFirst\r
- * @see #isUpperCaseFirst\r
- * @see #setCaseFirstDefault\r
- * @return true lower cased characters are sorted before upper cased\r
- * characters, false otherwise\r
- * @stable ICU 2.8\r
- */\r
- public boolean isLowerCaseFirst()\r
- {\r
- return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);\r
- }\r
-\r
- /**\r
- * Checks if the alternate handling behaviour is the UCA defined SHIFTED or\r
- * NON_IGNORABLE.\r
- * If return value is true, then the alternate handling attribute for the\r
- * Collator is SHIFTED. Otherwise if return value is false, then the\r
- * alternate handling attribute for the Collator is NON_IGNORABLE\r
- * See setAlternateHandlingShifted(boolean) for more details.\r
- * @return true or false\r
- * @see #setAlternateHandlingShifted(boolean)\r
- * @see #setAlternateHandlingDefault\r
- * @stable ICU 2.8\r
- */\r
- public boolean isAlternateHandlingShifted()\r
- {\r
- return m_isAlternateHandlingShifted_;\r
- }\r
-\r
- /**\r
- * Checks if case level is set to true.\r
- * See setCaseLevel(boolean) for details.\r
- * @return the case level mode\r
- * @see #setCaseLevelDefault\r
- * @see #isCaseLevel\r
- * @see #setCaseLevel(boolean)\r
- * @stable ICU 2.8\r
- */\r
- public boolean isCaseLevel()\r
- {\r
- return m_isCaseLevel_;\r
- }\r
-\r
- /**\r
- * Checks if French Collation is set to true.\r
- * See setFrenchCollation(boolean) for details.\r
- * @return true if French Collation is set to true, false otherwise\r
- * @see #setFrenchCollation(boolean)\r
- * @see #setFrenchCollationDefault\r
- * @stable ICU 2.8\r
- */\r
- public boolean isFrenchCollation()\r
- {\r
- return m_isFrenchCollation_;\r
- }\r
-\r
- /**\r
- * Checks if the Hiragana Quaternary mode is set on.\r
- * See setHiraganaQuaternary(boolean) for more details.\r
- * @return flag true if Hiragana Quaternary mode is on, false otherwise\r
- * @see #setHiraganaQuaternaryDefault\r
- * @see #setHiraganaQuaternary(boolean)\r
- * @stable ICU 2.8\r
- */\r
- public boolean isHiraganaQuaternary()\r
- {\r
- return m_isHiragana4_;\r
- }\r
-\r
- /** \r
- * Gets the variable top value of a Collator. \r
- * Lower 16 bits are undefined and should be ignored.\r
- * @return the variable top value of a Collator.\r
- * @see #setVariableTop\r
- * @stable ICU 2.6\r
- */\r
- public int getVariableTop()\r
- {\r
- return m_variableTopValue_ << 16;\r
- }\r
- \r
- /** \r
- * Method to retrieve the numeric collation value.\r
- * When numeric collation is turned on, this Collator generates a collation \r
- * key for the numeric value of substrings of digits. This is a way to get \r
- * '100' to sort AFTER '2'\r
- * @see #setNumericCollation\r
- * @see #setNumericCollationDefault\r
- * @return true if numeric collation is turned on, false otherwise\r
- * @stable ICU 2.8\r
- */\r
- public boolean getNumericCollation()\r
- {\r
- return m_isNumericCollation_;\r
- }\r
- \r
- // public other methods -------------------------------------------------\r
-\r
- /**\r
- * Compares the equality of two RuleBasedCollator objects.\r
- * RuleBasedCollator objects are equal if they have the same collation\r
- * rules and the same attributes.\r
- * @param obj the RuleBasedCollator to be compared to.\r
- * @return true if this RuleBasedCollator has exactly the same\r
- * collation behaviour as obj, false otherwise.\r
- * @stable ICU 2.8\r
- */\r
- public boolean equals(Object obj)\r
- {\r
- if (obj == null) {\r
- return false; // super does class check\r
- }\r
- if (this == obj) {\r
- return true;\r
- }\r
- if (getClass() != obj.getClass()) {\r
- return false;\r
- }\r
- RuleBasedCollator other = (RuleBasedCollator)obj;\r
- // all other non-transient information is also contained in rules.\r
- if (getStrength() != other.getStrength()\r
- || getDecomposition() != other.getDecomposition()\r
- || other.m_caseFirst_ != m_caseFirst_\r
- || other.m_caseSwitch_ != m_caseSwitch_\r
- || other.m_isAlternateHandlingShifted_\r
- != m_isAlternateHandlingShifted_\r
- || other.m_isCaseLevel_ != m_isCaseLevel_\r
- || other.m_isFrenchCollation_ != m_isFrenchCollation_\r
- || other.m_isHiragana4_ != m_isHiragana4_) {\r
- return false;\r
- }\r
- boolean rules = m_rules_ == other.m_rules_;\r
- if (!rules && (m_rules_ != null && other.m_rules_ != null)) {\r
- rules = m_rules_.equals(other.m_rules_);\r
- }\r
- if (!rules || !ICUDebug.enabled("collation")) {\r
- return rules;\r
- }\r
- if (m_addition3_ != other.m_addition3_\r
- || m_bottom3_ != other.m_bottom3_\r
- || m_bottomCount3_ != other.m_bottomCount3_\r
- || m_common3_ != other.m_common3_\r
- || m_isSimple3_ != other.m_isSimple3_\r
- || m_mask3_ != other.m_mask3_\r
- || m_minContractionEnd_ != other.m_minContractionEnd_\r
- || m_minUnsafe_ != other.m_minUnsafe_\r
- || m_top3_ != other.m_top3_\r
- || m_topCount3_ != other.m_topCount3_\r
- || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {\r
- return false;\r
- }\r
- if (!m_trie_.equals(other.m_trie_)) {\r
- // we should use the trie iterator here, but then this part is\r
- // only used in the test.\r
- for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)\r
- {\r
- int v = m_trie_.getCodePointValue(i);\r
- int otherv = other.m_trie_.getCodePointValue(i);\r
- if (v != otherv) {\r
- int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);\r
- if (mask == (otherv & 0xff000000)) {\r
- v &= 0xffffff;\r
- otherv &= 0xffffff;\r
- if (mask == 0xf1000000) {\r
- v -= (m_expansionOffset_ << 4);\r
- otherv -= (other.m_expansionOffset_ << 4);\r
- }\r
- else if (mask == 0xf2000000) {\r
- v -= m_contractionOffset_;\r
- otherv -= other.m_contractionOffset_;\r
- }\r
- if (v == otherv) {\r
- continue;\r
- }\r
- }\r
- return false;\r
- }\r
- }\r
- }\r
- if (Arrays.equals(m_contractionCE_, other.m_contractionCE_)\r
- && Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)\r
- && Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)\r
- && Arrays.equals(m_expansion_, other.m_expansion_)\r
- && Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {\r
- // not comparing paddings\r
- for (int i = 0; i < m_expansionEndCE_.length; i ++) {\r
- if (m_expansionEndCEMaxSize_[i]\r
- != other.m_expansionEndCEMaxSize_[i]) {\r
- return false;\r
- }\r
- return true;\r
- }\r
- }\r
- return false;\r
- }\r
-\r
- /**\r
- * Generates a unique hash code for this RuleBasedCollator.\r
- * @return the unique hash code for this Collator\r
- * @stable ICU 2.8\r
- */\r
- public int hashCode()\r
- {\r
- String rules = getRules();\r
- if (rules == null) {\r
- rules = "";\r
- }\r
- return rules.hashCode();\r
- }\r
-\r
- /**\r
- * Compares the source text String to the target text String according to\r
- * the collation rules, strength and decomposition mode for this\r
- * RuleBasedCollator.\r
- * Returns an integer less than,\r
- * equal to or greater than zero depending on whether the source String is\r
- * less than, equal to or greater than the target String. See the Collator\r
- * class description for an example of use.\r
- * </p>\r
- * <p>\r
- * General recommendation: <br>\r
- * If comparison are to be done to the same String multiple times, it would\r
- * be more efficient to generate CollationKeys for the Strings and use\r
- * CollationKey.compareTo(CollationKey) for the comparisons.\r
- * If speed performance is critical and object instantiation is to be \r
- * reduced, further optimization may be achieved by generating a simpler \r
- * key of the form RawCollationKey and reusing this RawCollationKey \r
- * object with the method RuleBasedCollator.getRawCollationKey. Internal \r
- * byte representation can be directly accessed via RawCollationKey and\r
- * stored for future use. Like CollationKey, RawCollationKey provides a\r
- * method RawCollationKey.compareTo for key comparisons.\r
- * If the each Strings are compared to only once, using the method\r
- * RuleBasedCollator.compare(String, String) will have a better performance.\r
- * </p>\r
- * @param source the source text String.\r
- * @param target the target text String.\r
- * @return Returns an integer value. Value is less than zero if source is\r
- * less than target, value is zero if source and target are equal,\r
- * value is greater than zero if source is greater than target.\r
- * @see CollationKey\r
- * @see #getCollationKey\r
- * @stable ICU 2.8\r
- */\r
- public int compare(String source, String target)\r
- {\r
- if (source == target) {\r
- return 0;\r
- }\r
-\r
- // Find the length of any leading portion that is equal\r
- int offset = getFirstUnmatchedOffset(source, target);\r
- //return compareRegular(source, target, offset);\r
- if(latinOneUse_) {\r
- if ((offset < source.length() \r
- && source.charAt(offset) > ENDOFLATINONERANGE_) \r
- || (offset < target.length() \r
- && target.charAt(offset) > ENDOFLATINONERANGE_)) { \r
- // source or target start with non-latin-1\r
- return compareRegular(source, target, offset);\r
- } else {\r
- return compareUseLatin1(source, target, offset);\r
- }\r
- } else {\r
- return compareRegular(source, target, offset);\r
- }\r
- }\r
- \r
- // package private inner interfaces --------------------------------------\r
-\r
- /**\r
- * Attribute values to be used when setting the Collator options\r
- */\r
- static interface AttributeValue\r
- {\r
- /**\r
- * Indicates that the default attribute value will be used.\r
- * See individual attribute for details on its default value.\r
- */\r
- static final int DEFAULT_ = -1;\r
- /**\r
- * Primary collation strength\r
- */\r
- static final int PRIMARY_ = Collator.PRIMARY;\r
- /**\r
- * Secondary collation strength\r
- */\r
- static final int SECONDARY_ = Collator.SECONDARY;\r
- /**\r
- * Tertiary collation strength\r
- */\r
- static final int TERTIARY_ = Collator.TERTIARY;\r
- /**\r
- * Default collation strength\r
- */\r
- static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;\r
- /**\r
- * Internal use for strength checks in Collation elements\r
- */\r
- static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;\r
- /**\r
- * Quaternary collation strength\r
- */\r
- static final int QUATERNARY_ = 3;\r
- /**\r
- * Identical collation strength\r
- */\r
- static final int IDENTICAL_ = Collator.IDENTICAL;\r
- /**\r
- * Internal use for strength checks\r
- */\r
- static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;\r
- /**\r
- * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,\r
- * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE\r
- */\r
- static final int OFF_ = 16;\r
- /**\r
- * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,\r
- * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE\r
- */\r
- static final int ON_ = 17;\r
- /**\r
- * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted\r
- */\r
- static final int SHIFTED_ = 20;\r
- /**\r
- * Valid for ALTERNATE_HANDLING. Alternate handling will be non\r
- * ignorable\r
- */\r
- static final int NON_IGNORABLE_ = 21;\r
- /**\r
- * Valid for CASE_FIRST - lower case sorts before upper case\r
- */\r
- static final int LOWER_FIRST_ = 24;\r
- /**\r
- * Upper case sorts before lower case\r
- */\r
- static final int UPPER_FIRST_ = 25;\r
- /**\r
- * Number of attribute values\r
- */\r
- static final int LIMIT_ = 29;\r
- }\r
-\r
- /**\r
- * Attributes that collation service understands. All the attributes can\r
- * take DEFAULT value, as well as the values specific to each one.\r
- */\r
- static interface Attribute\r
- {\r
- /**\r
- * Attribute for direction of secondary weights - used in French.\r
- * Acceptable values are ON, which results in secondary weights being\r
- * considered backwards and OFF which treats secondary weights in the\r
- * order they appear.\r
- */\r
- static final int FRENCH_COLLATION_ = 0;\r
- /**\r
- * Attribute for handling variable elements. Acceptable values are\r
- * NON_IGNORABLE (default) which treats all the codepoints with\r
- * non-ignorable primary weights in the same way, and SHIFTED which\r
- * causes codepoints with primary weights that are equal or below the\r
- * variable top value to be ignored on primary level and moved to the\r
- * quaternary level.\r
- */\r
- static final int ALTERNATE_HANDLING_ = 1;\r
- /**\r
- * Controls the ordering of upper and lower case letters. Acceptable\r
- * values are OFF (default), which orders upper and lower case letters\r
- * in accordance to their tertiary weights, UPPER_FIRST which forces\r
- * upper case letters to sort before lower case letters, and\r
- * LOWER_FIRST which does the opposite.\r
- */\r
- static final int CASE_FIRST_ = 2;\r
- /**\r
- * Controls whether an extra case level (positioned before the third\r
- * level) is generated or not. Acceptable values are OFF (default),\r
- * when case level is not generated, and ON which causes the case\r
- * level to be generated. Contents of the case level are affected by\r
- * the value of CASE_FIRST attribute. A simple way to ignore accent\r
- * differences in a string is to set the strength to PRIMARY and\r
- * enable case level.\r
- */\r
- static final int CASE_LEVEL_ = 3;\r
- /**\r
- * Controls whether the normalization check and necessary\r
- * normalizations are performed. When set to OFF (default) no\r
- * normalization check is performed. The correctness of the result is\r
- * guaranteed only if the input data is in so-called FCD form (see\r
- * users manual for more info). When set to ON, an incremental check\r
- * is performed to see whether the input data is in the FCD form. If\r
- * the data is not in the FCD form, incremental NFD normalization is\r
- * performed.\r
- */\r
- static final int NORMALIZATION_MODE_ = 4;\r
- /**\r
- * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,\r
- * QUATERNARY or IDENTICAL. The usual strength for most locales\r
- * (except Japanese) is tertiary. Quaternary strength is useful when\r
- * combined with shifted setting for alternate handling attribute and\r
- * for JIS x 4061 collation, when it is used to distinguish between\r
- * Katakana and Hiragana (this is achieved by setting the\r
- * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is\r
- * affected only by the number of non ignorable code points in the\r
- * string. Identical strength is rarely useful, as it amounts to\r
- * codepoints of the NFD form of the string.\r
- */\r
- static final int STRENGTH_ = 5;\r
- /**\r
- * When turned on, this attribute positions Hiragana before all\r
- * non-ignorables on quaternary level. This is a sneaky way to produce\r
- * JIS sort order.\r
- */\r
- static final int HIRAGANA_QUATERNARY_MODE_ = 6;\r
- /**\r
- * Attribute count\r
- */\r
- static final int LIMIT_ = 7;\r
- }\r
-\r
- /**\r
- * DataManipulate singleton\r
- */\r
- static class DataManipulate implements Trie.DataManipulate\r
- {\r
- // public methods ----------------------------------------------------\r
-\r
- /**\r
- * Internal method called to parse a lead surrogate's ce for the offset\r
- * to the next trail surrogate data.\r
- * @param ce collation element of the lead surrogate\r
- * @return data offset or 0 for the next trail surrogate\r
- * @stable ICU 2.8\r
- */\r
- public final int getFoldingOffset(int ce)\r
- {\r
- if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {\r
- return (ce & 0xFFFFFF);\r
- }\r
- return 0;\r
- }\r
-\r
- /**\r
- * Get singleton object\r
- */\r
- public static final DataManipulate getInstance()\r
- {\r
- if (m_instance_ == null) {\r
- m_instance_ = new DataManipulate();\r
- }\r
- return m_instance_;\r
- }\r
-\r
- // private data member ----------------------------------------------\r
-\r
- /**\r
- * Singleton instance\r
- */\r
- private static DataManipulate m_instance_;\r
-\r
- // private constructor ----------------------------------------------\r
-\r
- /**\r
- * private to prevent initialization\r
- */\r
- private DataManipulate()\r
- {\r
- }\r
- }\r
-\r
- /**\r
- * UCAConstants\r
- */\r
- static final class UCAConstants\r
- {\r
- int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000\r
- int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000\r
- int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705\r
- int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000\r
- int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500\r
- int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05\r
- int FIRST_VARIABLE_[] = new int[2]; // 0x05070505\r
- int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505\r
- int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505\r
- int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505\r
- int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303\r
- int FIRST_IMPLICIT_[] = new int[2];\r
- int LAST_IMPLICIT_[] = new int[2];\r
- int FIRST_TRAILING_[] = new int[2];\r
- int LAST_TRAILING_[] = new int[2];\r
- int PRIMARY_TOP_MIN_;\r
- int PRIMARY_IMPLICIT_MIN_; // 0xE8000000\r
- int PRIMARY_IMPLICIT_MAX_; // 0xF0000000\r
- int PRIMARY_TRAILING_MIN_; // 0xE8000000\r
- int PRIMARY_TRAILING_MAX_; // 0xF0000000\r
- int PRIMARY_SPECIAL_MIN_; // 0xE8000000\r
- int PRIMARY_SPECIAL_MAX_; // 0xF0000000\r
- }\r
-\r
- // package private data member -------------------------------------------\r
-\r
    static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
    static final byte BYTE_COMMON_ = (byte)0x05;
    static final int COMMON_TOP_2_ = 0x86; // int for unsignedness
    static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
    static final int COMMON_BOTTOM_3 = 0x05;
    /**
     * Case strength mask
     */
    static final int CE_CASE_BIT_MASK_ = 0xC0;
    /**
     * Shift and mask used to extract the tag from a special collation element
     * (see getTag and the trie-value comparison in equals).
     */
    static final int CE_TAG_SHIFT_ = 24;
    static final int CE_TAG_MASK_ = 0x0F000000;

    /**
     * High-nibble flag marking a collation element as "special"
     */
    static final int CE_SPECIAL_FLAG_ = 0xF0000000;
    /**
     * Lead surrogate that is tailored and doesn't start a contraction
     */
    static final int CE_SURROGATE_TAG_ = 5;
    /**
     * Mask to get the primary strength of the collation element
     */
    static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
    /**
     * Mask to get the secondary strength of the collation element
     */
    static final int CE_SECONDARY_MASK_ = 0xFF00;
    /**
     * Mask to get the tertiary strength of the collation element
     */
    static final int CE_TERTIARY_MASK_ = 0xFF;
    /**
     * Primary strength shift
     */
    static final int CE_PRIMARY_SHIFT_ = 16;
    /**
     * Secondary strength shift
     */
    static final int CE_SECONDARY_SHIFT_ = 8;
    /**
     * Continuation marker
     */
    static final int CE_CONTINUATION_MARKER_ = 0xC0;

    /**
     * Size of collator raw data headers and options before the expansion
     * data. This is used when expansion ces are to be retrieved. ICU4C uses
     * the expansion offset starting from UCollator.UColHeader, hence ICU4J
     * will have to minus that off to get the right expansion ce offset. In
     * number of ints.
     */
    int m_expansionOffset_;
    /**
     * Size of collator raw data headers, options and expansions before
     * contraction data. This is used when contraction ces are to be retrieved.
     * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
     * ICU4J will have to minus that off to get the right contraction ce
     * offset. In number of chars.
     */
    int m_contractionOffset_;
    /**
     * Flag indicator if Jamo is special
     */
    boolean m_isJamoSpecial_;
-\r
- // Collator options ------------------------------------------------------\r
- \r
    // Default option values. NOTE(review): presumably captured from the
    // locale/UCA data at build time and restored by the set*Default()
    // methods — confirm against those setters.
    int m_defaultVariableTopValue_;
    boolean m_defaultIsFrenchCollation_;
    boolean m_defaultIsAlternateHandlingShifted_;
    int m_defaultCaseFirst_;
    boolean m_defaultIsCaseLevel_;
    int m_defaultDecomposition_;
    int m_defaultStrength_;
    boolean m_defaultIsHiragana4_;
    boolean m_defaultIsNumericCollation_;

    /**
     * Value of the variable top
     */
    int m_variableTopValue_;
    /**
     * Attribute for special Hiragana
     */
    boolean m_isHiragana4_;
    /**
     * Case sorting customization
     */
    int m_caseFirst_;
    /**
     * Numeric collation option
     */
    boolean m_isNumericCollation_;

    // end Collator options --------------------------------------------------

    /**
     * Expansion table
     */
    int m_expansion_[];
    /**
     * Contraction index table
     */
    char m_contractionIndex_[];
    /**
     * Contraction CE table
     */
    int m_contractionCE_[];
    /**
     * Data trie
     */
    IntTrie m_trie_;
    /**
     * Table to store all collation elements that are the last element of an
     * expansion. This is for use in StringSearch.
     */
    int m_expansionEndCE_[];
    /**
     * Table to store the maximum size of any expansions that end with the
     * corresponding collation element in m_expansionEndCE_. For use in
     * StringSearch too
     */
    byte m_expansionEndCEMaxSize_[];
    /**
     * Heuristic table to store information on whether a char character is
     * considered "unsafe". "Unsafe" character are combining marks or those
     * belonging to some contraction sequence from the offset 1 onwards.
     * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
     * unsafe. If we have another contraction "ZA" with the one above, then
     * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
     */
    byte m_unsafe_[];
    /**
     * Table to store information on whether a codepoint can occur as the last
     * character in a contraction
     */
    byte m_contractionEnd_[];
    /**
     * Original collation rules
     */
    String m_rules_;
    /**
     * The smallest "unsafe" codepoint
     */
    char m_minUnsafe_;
    /**
     * The smallest codepoint that could be the end of a contraction
     */
    char m_minContractionEnd_;
    /**
     * General version of the collator
     */
    VersionInfo m_version_;
    /**
     * UCA version
     */
    VersionInfo m_UCA_version_;
    /**
     * UCD version
     */
    VersionInfo m_UCD_version_;
-\r
- /**\r
- * UnicodeData.txt property object\r
- */\r
- static final RuleBasedCollator UCA_;\r
- /**\r
- * UCA Constants\r
- */\r
- static final UCAConstants UCA_CONSTANTS_;\r
- /**\r
- * Table for UCA and builder use\r
- */\r
- static final char UCA_CONTRACTIONS_[];\r
-\r
- private static boolean UCA_INIT_COMPLETE;\r
-\r
- /**\r
- * Implicit generator\r
- */\r
- static final ImplicitCEGenerator impCEGen_;\r
-// /**\r
-// * Implicit constants\r
-// */\r
-// static final int IMPLICIT_BASE_BYTE_;\r
-// static final int IMPLICIT_LIMIT_BYTE_;\r
-// static final int IMPLICIT_4BYTE_BOUNDARY_;\r
-// static final int LAST_MULTIPLIER_;\r
-// static final int LAST2_MULTIPLIER_;\r
-// static final int IMPLICIT_BASE_3BYTE_;\r
-// static final int IMPLICIT_BASE_4BYTE_;\r
-// static final int BYTES_TO_AVOID_ = 3;\r
-// static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;\r
-// static final int LAST_COUNT_ = OTHER_COUNT_ / 2;\r
-// /**\r
-// * Room for intervening, without expanding to 5 bytes\r
-// */\r
-// static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;\r
-// static final int IMPLICIT_3BYTE_COUNT_ = 1;\r
-// \r
- static final byte SORT_LEVEL_TERMINATOR_ = 1;\r
-\r
-// These are values from UCA required for\r
-// implicit generation and supressing sort key compression\r
-// they should regularly be in the UCA, but if one\r
-// is running without UCA, it could be a problem\r
- static final int maxRegularPrimary = 0xA0;\r
- static final int minImplicitPrimary = 0xE0;\r
- static final int maxImplicitPrimary = 0xE4;\r
-\r
-\r
    // block to initialise character property database
    static
    {
        // take pains to let static class init succeed, otherwise the class itself won't exist and
        // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
        // we can't load the UCA data.

        RuleBasedCollator iUCA_ = null;
        UCAConstants iUCA_CONSTANTS_ = null;
        char iUCA_CONTRACTIONS_[] = null;
        ImplicitCEGenerator iimpCEGen_ = null;
        try
        {
            // !!! note what's going on here...
            // even though the static init of the class is not yet complete, we
            // instantiate an instance of the class. So we'd better be sure that
            // instantiation doesn't rely on the static initialization that's
            // not complete yet!
            iUCA_ = new RuleBasedCollator();
            iUCA_CONSTANTS_ = new UCAConstants();
            iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);

            // called before doing canonical closure for the UCA.
            iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
            //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
            iUCA_.init();
            ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
            iUCA_.m_rules_ = (String)rb.getObject("UCARules");
        }
        catch (MissingResourceException ex)
        {
            // deliberately swallowed: a failure here must not abort class
            // init; constructors report it later via checkUCA().
// throw ex;
        }
        catch (IOException e)
        {
            // deliberately swallowed, same as above.
            // e.printStackTrace();
// throw new MissingResourceException(e.getMessage(),"","");
        }

        // Publish whatever loaded (possibly all null on failure).
        UCA_ = iUCA_;
        UCA_CONSTANTS_ = iUCA_CONSTANTS_;
        UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
        impCEGen_ = iimpCEGen_;

        UCA_INIT_COMPLETE = true;
    }
-\r
-\r
- private static void checkUCA() throws MissingResourceException {\r
- if (UCA_INIT_COMPLETE && UCA_ == null) {\r
- throw new MissingResourceException("Collator UCA data unavailable", "", "");\r
- }\r
- }\r
- \r
- // package private constructors ------------------------------------------\r
-\r
- /**\r
- * <p>Private contructor for use by subclasses.\r
- * Public access to creating Collators is handled by the API\r
- * Collator.getInstance() or RuleBasedCollator(String rules).\r
- * </p>\r
- * <p>\r
- * This constructor constructs the UCA collator internally\r
- * </p>\r
- */\r
- RuleBasedCollator()\r
- {\r
- checkUCA();\r
- initUtility(false);\r
- }\r
-\r
- /**\r
- * Constructors a RuleBasedCollator from the argument locale.\r
- * If no resource bundle is associated with the locale, UCA is used\r
- * instead.\r
- * @param locale\r
- */\r
- RuleBasedCollator(ULocale locale)\r
- {\r
- checkUCA();\r
- ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);\r
- initUtility(false);\r
- if (rb != null) {\r
- try {\r
- // Use keywords, if supplied for lookup\r
- String collkey = locale.getKeywordValue("collation");\r
- if(collkey == null) {\r
- collkey = rb.getStringWithFallback("collations/default");\r
- }\r
- \r
- // collations/default will always give a string back\r
- // keyword for the real collation data\r
- // if "collations/collkey" will return null if collkey == null \r
- ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);\r
- if (elements != null) {\r
- // TODO: Determine actual & valid locale correctly\r
- ULocale uloc = rb.getULocale();\r
- setLocale(uloc, uloc);\r
-\r
- m_rules_ = elements.getString("Sequence");\r
- ByteBuffer buf = elements.get("%%CollationBin").getBinary();\r
- // %%CollationBin\r
- if(buf!=null){\r
- // m_rules_ = (String)rules[1][1];\r
- byte map[] = buf.array();\r
- CollatorReader.initRBC(this, map);\r
- /*\r
- BufferedInputStream input =\r
- new BufferedInputStream(\r
- new ByteArrayInputStream(map));\r
- /*\r
- CollatorReader reader = new CollatorReader(input, false);\r
- if (map.length > MIN_BINARY_DATA_SIZE_) {\r
- reader.read(this, null);\r
- }\r
- else {\r
- reader.readHeader(this);\r
- reader.readOptions(this);\r
- // duplicating UCA_'s data\r
- setWithUCATables();\r
- }\r
- */\r
- // at this point, we have read in the collator\r
- // now we need to check whether the binary image has\r
- // the right UCA and other versions\r
- if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||\r
- !m_UCD_version_.equals(UCA_.m_UCD_version_)) {\r
- init(m_rules_);\r
- return;\r
- }\r
- init();\r
- return;\r
- }\r
- else { \r
- init(m_rules_);\r
- return;\r
- }\r
- }\r
- }\r
- catch (Exception e) {\r
- // e.printStackTrace();\r
- // if failed use UCA.\r
- }\r
- }\r
- setWithUCAData();\r
- }\r
-\r
- // package private methods -----------------------------------------------\r
-\r
- /**\r
- * Sets this collator to use the tables in UCA. Note options not taken\r
- * care of here.\r
- */\r
- final void setWithUCATables()\r
- {\r
- m_contractionOffset_ = UCA_.m_contractionOffset_;\r
- m_expansionOffset_ = UCA_.m_expansionOffset_;\r
- m_expansion_ = UCA_.m_expansion_;\r
- m_contractionIndex_ = UCA_.m_contractionIndex_;\r
- m_contractionCE_ = UCA_.m_contractionCE_;\r
- m_trie_ = UCA_.m_trie_;\r
- m_expansionEndCE_ = UCA_.m_expansionEndCE_;\r
- m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;\r
- m_unsafe_ = UCA_.m_unsafe_;\r
- m_contractionEnd_ = UCA_.m_contractionEnd_;\r
- m_minUnsafe_ = UCA_.m_minUnsafe_;\r
- m_minContractionEnd_ = UCA_.m_minContractionEnd_;\r
- }\r
-\r
- /**\r
- * Sets this collator to use the all options and tables in UCA.\r
- */\r
- final void setWithUCAData()\r
- {\r
- latinOneFailed_ = true;\r
-\r
- m_addition3_ = UCA_.m_addition3_;\r
- m_bottom3_ = UCA_.m_bottom3_;\r
- m_bottomCount3_ = UCA_.m_bottomCount3_;\r
- m_caseFirst_ = UCA_.m_caseFirst_;\r
- m_caseSwitch_ = UCA_.m_caseSwitch_;\r
- m_common3_ = UCA_.m_common3_;\r
- m_contractionOffset_ = UCA_.m_contractionOffset_;\r
- setDecomposition(UCA_.getDecomposition());\r
- m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;\r
- m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;\r
- m_defaultIsAlternateHandlingShifted_\r
- = UCA_.m_defaultIsAlternateHandlingShifted_;\r
- m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;\r
- m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;\r
- m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;\r
- m_defaultStrength_ = UCA_.m_defaultStrength_;\r
- m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;\r
- m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;\r
- m_expansionOffset_ = UCA_.m_expansionOffset_;\r
- m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;\r
- m_isCaseLevel_ = UCA_.m_isCaseLevel_;\r
- m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;\r
- m_isHiragana4_ = UCA_.m_isHiragana4_;\r
- m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;\r
- m_isSimple3_ = UCA_.m_isSimple3_;\r
- m_mask3_ = UCA_.m_mask3_;\r
- m_minContractionEnd_ = UCA_.m_minContractionEnd_;\r
- m_minUnsafe_ = UCA_.m_minUnsafe_;\r
- m_rules_ = UCA_.m_rules_;\r
- setStrength(UCA_.getStrength());\r
- m_top3_ = UCA_.m_top3_;\r
- m_topCount3_ = UCA_.m_topCount3_;\r
- m_variableTopValue_ = UCA_.m_variableTopValue_;\r
- m_isNumericCollation_ = UCA_.m_isNumericCollation_;\r
- setWithUCATables();\r
- latinOneFailed_ = false;\r
- }\r
-\r
- /**\r
- * Test whether a char character is potentially "unsafe" for use as a\r
- * collation starting point. "Unsafe" characters are combining marks or\r
- * those belonging to some contraction sequence from the offset 1 onwards.\r
- * E.g. if "ABC" is the only contraction, then 'B' and\r
- * 'C' are considered unsafe. If we have another contraction "ZA" with\r
- * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.\r
- * @param ch character to determin\r
- * @return true if ch is unsafe, false otherwise\r
- */\r
- final boolean isUnsafe(char ch)\r
- {\r
- if (ch < m_minUnsafe_) {\r
- return false;\r
- }\r
- \r
- if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {\r
- if (UTF16.isLeadSurrogate(ch) \r
- || UTF16.isTrailSurrogate(ch)) {\r
- // Trail surrogate are always considered unsafe.\r
- return true;\r
- }\r
- ch &= HEURISTIC_OVERFLOW_MASK_;\r
- ch += HEURISTIC_OVERFLOW_OFFSET_;\r
- }\r
- int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];\r
- return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;\r
- }\r
-\r
- /**\r
- * Approximate determination if a char character is at a contraction end.\r
- * Guaranteed to be true if a character is at the end of a contraction,\r
- * otherwise it is not deterministic.\r
- * @param ch character to be determined\r
- */\r
- final boolean isContractionEnd(char ch)\r
- {\r
- if (UTF16.isTrailSurrogate(ch)) {\r
- return true;\r
- }\r
-\r
- if (ch < m_minContractionEnd_) {\r
- return false;\r
- }\r
-\r
- if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {\r
- ch &= HEURISTIC_OVERFLOW_MASK_;\r
- ch += HEURISTIC_OVERFLOW_OFFSET_;\r
- }\r
- int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];\r
- return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;\r
- }\r
-\r
- /**\r
- * Retrieve the tag of a special ce\r
- * @param ce ce to test\r
- * @return tag of ce\r
- */\r
- static int getTag(int ce)\r
- {\r
- return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;\r
- }\r
-\r
- /**\r
- * Checking if ce is special\r
- * @param ce to check\r
- * @return true if ce is special\r
- */\r
- static boolean isSpecial(int ce)\r
- {\r
- return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;\r
- }\r
-\r
- /**\r
- * Checks if the argument ce is a continuation\r
- * @param ce collation element to test\r
- * @return true if ce is a continuation\r
- */\r
- static final boolean isContinuation(int ce)\r
- {\r
- return ce != CollationElementIterator.NULLORDER\r
- && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;\r
- }\r
-\r
- // private inner classes ------------------------------------------------\r
-\r
- // private variables -----------------------------------------------------\r
-\r
- /**\r
- * The smallest natural unsafe or contraction end char character before\r
- * tailoring.\r
- * This is a combining mark.\r
- */\r
- private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;\r
- /**\r
- * Heuristic table table size. Size is 32 bytes, 1 bit for each\r
- * latin 1 char, and some power of two for hashing the rest of the chars.\r
- * Size in bytes.\r
- */\r
- private static final char HEURISTIC_SIZE_ = 1056;\r
- /**\r
- * Mask value down to "some power of two" - 1,\r
- * number of bits, not num of bytes.\r
- */\r
- private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;\r
- /**\r
- * Unsafe character shift\r
- */\r
- private static final int HEURISTIC_SHIFT_ = 3;\r
- /**\r
- * Unsafe character addition for character too large, it has to be folded\r
- * then incremented.\r
- */\r
- private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;\r
- /**\r
- * Mask value to get offset in heuristic table.\r
- */\r
- private static final char HEURISTIC_MASK_ = 7;\r
-\r
    // tertiary-weight manipulation state derived from the case-first /
    // case-level options (values assigned elsewhere in this class)
    private int m_caseSwitch_;
    private int m_common3_;
    private int m_mask3_;
    /**
     * When switching case, we need to add or subtract different values.
     */
    private int m_addition3_;
    /**
     * Upper range when compressing runs of common tertiary weights
     */
    private int m_top3_;
    /**
     * Lower range when compressing runs of common tertiary weights
     */
    private int m_bottom3_;
    private int m_topCount3_;
    private int m_bottomCount3_;
    /**
     * Case first constants
     */
    private static final int CASE_SWITCH_ = 0xC0;
    private static final int NO_CASE_SWITCH_ = 0;
    /**
     * Case level constants
     */
    private static final int CE_REMOVE_CASE_ = 0x3F;
    private static final int CE_KEEP_CASE_ = 0xFF;
    /**
     * Case strength mask
     */
    private static final int CE_CASE_MASK_3_ = 0xFF;
    /**
     * Sortkey size factors for the secondary and tertiary levels.
     * Values can be changed.
     */
    private static final double PROPORTION_2_ = 0.5;
    private static final double PROPORTION_3_ = 0.667;
-\r
    // These values come from the UCA ----------------------------------------

    /**
     * Magic special byte values from the fractional UCA.
     */
    //private static final byte BYTE_ZERO_ = 0x0;
    //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
    //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
    private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
    /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
    //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
    static final byte CODAN_PLACEHOLDER = 0x27;
    //private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C;
    private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D;
    private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
    // secondary-compression ranges, split between top and bottom by
    // PROPORTION_2_
    private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
    private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
    private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
    private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
    private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
    private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
    private static final int COMMON_BOTTOM_3_ = 0x05;
    private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
    private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
                                                              COMMON_BOTTOM_3_;
    private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
    private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
    private static final int COMMON_2_ = COMMON_BOTTOM_2_;
    private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
    private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
    //private static final int COMMON_4_ = (byte)0xFF;
-\r
-\r
-\r
- /*\r
- * Minimum size required for the binary collation data in bytes.\r
- * Size of UCA header + size of options to 4 bytes\r
- */\r
- //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;\r
-\r
- /**\r
- * If this collator is to generate only simple tertiaries for fast path\r
- */\r
- private boolean m_isSimple3_;\r
-\r
- /**\r
- * French collation sorting flag\r
- */\r
- private boolean m_isFrenchCollation_;\r
- /**\r
- * Flag indicating if shifted is requested for Quaternary alternate\r
- * handling. If this is not true, the default for alternate handling will\r
- * be non-ignorable.\r
- */\r
- private boolean m_isAlternateHandlingShifted_;\r
- /**\r
- * Extra case level for sorting\r
- */\r
- private boolean m_isCaseLevel_;\r
-\r
- private static final int SORT_BUFFER_INIT_SIZE_ = 128;\r
- private static final int SORT_BUFFER_INIT_SIZE_1_ =\r
- SORT_BUFFER_INIT_SIZE_ << 3;\r
- private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;\r
- private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;\r
- private static final int SORT_BUFFER_INIT_SIZE_CASE_ =\r
- SORT_BUFFER_INIT_SIZE_ >> 2;\r
- private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;\r
-\r
- private static final int CE_CONTINUATION_TAG_ = 0xC0;\r
- private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;\r
-\r
- private static final int LAST_BYTE_MASK_ = 0xFF;\r
-\r
- //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;\r
- //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;\r
-\r
- private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;\r
- private static final byte SORT_CASE_SHIFT_START_ = (byte)7;\r
-\r
- /**\r
- * CE buffer size\r
- */\r
- private static final int CE_BUFFER_SIZE_ = 512;\r
-\r
- // variables for Latin-1 processing\r
- boolean latinOneUse_ = false;\r
- boolean latinOneRegenTable_ = false;\r
- boolean latinOneFailed_ = false;\r
-\r
- int latinOneTableLen_ = 0;\r
- int latinOneCEs_[] = null;\r
- /**\r
- * Bunch of utility iterators\r
- */\r
- private StringUCharacterIterator m_srcUtilIter_;\r
- private CollationElementIterator m_srcUtilColEIter_;\r
- private StringUCharacterIterator m_tgtUtilIter_;\r
- private CollationElementIterator m_tgtUtilColEIter_;\r
- /**\r
- * Utility comparison flags\r
- */\r
- private boolean m_utilCompare0_;\r
- //private boolean m_utilCompare1_;\r
- private boolean m_utilCompare2_;\r
- private boolean m_utilCompare3_;\r
- private boolean m_utilCompare4_;\r
- private boolean m_utilCompare5_;\r
- /**\r
- * Utility byte buffer\r
- */\r
- private byte m_utilBytes0_[];\r
- private byte m_utilBytes1_[];\r
- private byte m_utilBytes2_[];\r
- private byte m_utilBytes3_[];\r
- private byte m_utilBytes4_[];\r
- //private byte m_utilBytes5_[];\r
- private RawCollationKey m_utilRawCollationKey_;\r
-\r
- private int m_utilBytesCount0_;\r
- private int m_utilBytesCount1_;\r
- private int m_utilBytesCount2_;\r
- private int m_utilBytesCount3_;\r
- private int m_utilBytesCount4_;\r
- //private int m_utilBytesCount5_;\r
- //private int m_utilCount0_;\r
- //private int m_utilCount1_;\r
- private int m_utilCount2_;\r
- private int m_utilCount3_;\r
- private int m_utilCount4_;\r
- //private int m_utilCount5_;\r
-\r
- private int m_utilFrenchStart_;\r
- private int m_utilFrenchEnd_;\r
-\r
- /**\r
- * Preparing the CE buffers. will be filled during the primary phase\r
- */\r
- private int m_srcUtilCEBuffer_[];\r
- private int m_tgtUtilCEBuffer_[];\r
- private int m_srcUtilCEBufferSize_;\r
- private int m_tgtUtilCEBufferSize_;\r
-\r
- private int m_srcUtilContOffset_;\r
- private int m_tgtUtilContOffset_;\r
-\r
- private int m_srcUtilOffset_;\r
- private int m_tgtUtilOffset_;\r
-\r
- // private methods -------------------------------------------------------\r
-\r
- private void init(String rules) throws Exception\r
- {\r
- setWithUCAData();\r
- CollationParsedRuleBuilder builder\r
- = new CollationParsedRuleBuilder(rules);\r
- builder.setRules(this);\r
- m_rules_ = rules;\r
- init();\r
- initUtility(false);\r
- }\r
- \r
    /**
     * Compares two strings level by level (primary, secondary, case,
     * tertiary, quaternary/Hiragana, identical) from the given offset,
     * using the shared utility buffers and the current strength settings.
     * @param source source string
     * @param target target string
     * @param offset index at which comparison starts — presumably the
     *        prefixes before it already compared equal; TODO confirm
     *        with callers
     * @return negative, zero or positive for source &lt;, ==, &gt; target
     */
    private final int compareRegular(String source, String target, int offset) {
        if (m_srcUtilIter_ == null) {
            initUtility(true);
        }
        int strength = getStrength();
        // setting up the collator parameters
        m_utilCompare0_ = m_isCaseLevel_;
        //m_utilCompare1_ = true;
        m_utilCompare2_ = strength >= SECONDARY;
        m_utilCompare3_ = strength >= TERTIARY;
        m_utilCompare4_ = strength >= QUATERNARY;
        m_utilCompare5_ = strength == IDENTICAL;
        boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
        boolean doShift4 = m_isAlternateHandlingShifted_ && m_utilCompare4_;
        boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;

        // Hiragana and shifted cannot both be handled by the incremental
        // path; fall back to full sort-key comparison
        if (doHiragana4 && doShift4) {
            String sourcesub = source.substring(offset);
            String targetsub = target.substring(offset);
            return compareBySortKeys(sourcesub, targetsub);
        }

        // This is the lowest primary value that will not be ignored if shifted
        int lowestpvalue = m_isAlternateHandlingShifted_
                                            ? m_variableTopValue_ << 16 : 0;
        m_srcUtilCEBufferSize_ = 0;
        m_tgtUtilCEBufferSize_ = 0;
        int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
                                      target, offset);
        if (m_srcUtilCEBufferSize_ == -1
            && m_tgtUtilCEBufferSize_ == -1) {
            // since the cebuffer is cleared when we have determined that
            // either source is greater than target or vice versa, the return
            // result is the comparison result and not the hiragana result
            return result;
        }

        int hiraganaresult = result;

        if (m_utilCompare2_) {
            result = doSecondaryCompare(doFrench);
            if (result != 0) {
                return result;
            }
        }
        // doing the case bit
        if (m_utilCompare0_) {
            result = doCaseCompare();
            if (result != 0) {
                return result;
            }
        }
        // Tertiary level
        if (m_utilCompare3_) {
            result = doTertiaryCompare();
            if (result != 0) {
                return result;
            }
        }

        if (doShift4) { // checkQuad
            result = doQuaternaryCompare(lowestpvalue);
            if (result != 0) {
                return result;
            }
        }
        else if (doHiragana4 && hiraganaresult != 0) {
            // If we're fine on quaternaries, we might be different
            // on Hiragana. This, however, might fail us in shifted.
            return hiraganaresult;
        }

        // For IDENTICAL comparisons, we use a bitwise character comparison
        // as a tiebreaker if all else is equal.
        // Getting here should be quite rare - strings are not identical -
        // that is checked first, but compared == through all other checks.
        if (m_utilCompare5_) {
            return doIdenticalCompare(source, target, offset, true);
        }
        return 0;
    }
-\r
- /**\r
- * Gets the 2 bytes of primary order and adds it to the primary byte array\r
- * @param ce current ce\r
- * @param notIsContinuation flag indicating if the current bytes belong to\r
- * a continuation ce\r
- * @param doShift flag indicating if ce is to be shifted\r
- * @param leadPrimary lead primary used for compression\r
- * @param commonBottom4 common byte value for Quaternary\r
- * @param bottomCount4 smallest byte value for Quaternary\r
- * @return the new lead primary for compression\r
- */\r
- private final int doPrimaryBytes(int ce, boolean notIsContinuation,\r
- boolean doShift, int leadPrimary,\r
- int commonBottom4, int bottomCount4)\r
- {\r
-\r
- int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned\r
- int p1 = ce >>> 8; // comparison\r
- if (doShift) {\r
- if (m_utilCount4_ > 0) {\r
- while (m_utilCount4_ > bottomCount4) {\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- (byte)(commonBottom4 + bottomCount4));\r
- m_utilBytesCount4_ ++;\r
- m_utilCount4_ -= bottomCount4;\r
- }\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- (byte)(commonBottom4\r
- + (m_utilCount4_ - 1)));\r
- m_utilBytesCount4_ ++;\r
- m_utilCount4_ = 0;\r
- }\r
- // dealing with a variable and we're treating them as shifted\r
- // This is a shifted ignorable\r
- if (p1 != 0) {\r
- // we need to check this since we could be in continuation\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- (byte)p1);\r
- m_utilBytesCount4_ ++;\r
- }\r
- if (p2 != 0) {\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- (byte)p2);\r
- m_utilBytesCount4_ ++;\r
- }\r
- }\r
- else {\r
- // Note: This code assumes that the table is well built\r
- // i.e. not having 0 bytes where they are not supposed to be.\r
- // Usually, we'll have non-zero primary1 & primary2, except\r
- // in cases of LatinOne and friends, when primary2 will be\r
- // regular and simple sortkey calc\r
- if (p1 != CollationElementIterator.IGNORABLE) {\r
- if (notIsContinuation) {\r
- if (leadPrimary == p1) {\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_, (byte)p2);\r
- m_utilBytesCount1_ ++;\r
- }\r
- else {\r
- if (leadPrimary != 0) {\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_,\r
- ((p1 > leadPrimary)\r
- ? BYTE_UNSHIFTED_MAX_\r
- : BYTE_UNSHIFTED_MIN_)); \r
- m_utilBytesCount1_ ++;\r
- }\r
- if (p2 == CollationElementIterator.IGNORABLE) {\r
- // one byter, not compressed\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_,\r
- (byte)p1);\r
- m_utilBytesCount1_ ++;\r
- leadPrimary = 0;\r
- }\r
- else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_\r
- || (p1 > maxRegularPrimary\r
- //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]\r
- // >>> 24)\r
- && p1 < minImplicitPrimary\r
- //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]\r
- // >>> 24)\r
- )) {\r
- // not compressible\r
- leadPrimary = 0;\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_,\r
- (byte)p1);\r
- m_utilBytesCount1_ ++;\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_,\r
- (byte)p2);\r
- m_utilBytesCount1_ ++;\r
- }\r
- else { // compress\r
- leadPrimary = p1;\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_,\r
- (byte)p1);\r
- m_utilBytesCount1_ ++;\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_, (byte)p2);\r
- m_utilBytesCount1_ ++;\r
- }\r
- }\r
- }\r
- else {\r
- // continuation, add primary to the key, no compression\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_, (byte)p1);\r
- m_utilBytesCount1_ ++;\r
- if (p2 != CollationElementIterator.IGNORABLE) {\r
- m_utilBytes1_ = append(m_utilBytes1_,\r
- m_utilBytesCount1_, (byte)p2);\r
- // second part\r
- m_utilBytesCount1_ ++;\r
- }\r
- }\r
- }\r
- }\r
- return leadPrimary;\r
- }\r
-\r
- /**\r
- * Gets the secondary byte and adds it to the secondary byte array\r
- * @param ce current ce\r
- * @param notIsContinuation flag indicating if the current bytes belong to\r
- * a continuation ce\r
- * @param doFrench flag indicator if french sort is to be performed\r
- */\r
- private final void doSecondaryBytes(int ce, boolean notIsContinuation,\r
- boolean doFrench)\r
- {\r
- int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison\r
- if (s != 0) {\r
- if (!doFrench) {\r
- // This is compression code.\r
- if (s == COMMON_2_ && notIsContinuation) {\r
- m_utilCount2_ ++;\r
- }\r
- else {\r
- if (m_utilCount2_ > 0) {\r
- if (s > COMMON_2_) { // not necessary for 4th level.\r
- while (m_utilCount2_ > TOP_COUNT_2_) {\r
- m_utilBytes2_ = append(m_utilBytes2_,\r
- m_utilBytesCount2_,\r
- (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));\r
- m_utilBytesCount2_ ++;\r
- m_utilCount2_ -= TOP_COUNT_2_;\r
- }\r
- m_utilBytes2_ = append(m_utilBytes2_,\r
- m_utilBytesCount2_,\r
- (byte)(COMMON_TOP_2_\r
- - (m_utilCount2_ - 1)));\r
- m_utilBytesCount2_ ++;\r
- }\r
- else {\r
- while (m_utilCount2_ > BOTTOM_COUNT_2_) {\r
- m_utilBytes2_ = append(m_utilBytes2_,\r
- m_utilBytesCount2_,\r
- (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));\r
- m_utilBytesCount2_ ++;\r
- m_utilCount2_ -= BOTTOM_COUNT_2_;\r
- }\r
- m_utilBytes2_ = append(m_utilBytes2_,\r
- m_utilBytesCount2_,\r
- (byte)(COMMON_BOTTOM_2_\r
- + (m_utilCount2_ - 1)));\r
- m_utilBytesCount2_ ++;\r
- }\r
- m_utilCount2_ = 0;\r
- }\r
- m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,\r
- (byte)s);\r
- m_utilBytesCount2_ ++;\r
- }\r
- }\r
- else {\r
- m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,\r
- (byte)s);\r
- m_utilBytesCount2_ ++;\r
- // Do the special handling for French secondaries\r
- // We need to get continuation elements and do intermediate\r
- // restore\r
- // abc1c2c3de with french secondaries need to be edc1c2c3ba\r
- // NOT edc3c2c1ba\r
- if (notIsContinuation) {\r
- if (m_utilFrenchStart_ != -1) {\r
- // reverse secondaries from frenchStartPtr up to\r
- // frenchEndPtr\r
- reverseBuffer(m_utilBytes2_);\r
- m_utilFrenchStart_ = -1;\r
- }\r
- }\r
- else {\r
- if (m_utilFrenchStart_ == -1) {\r
- m_utilFrenchStart_ = m_utilBytesCount2_ - 2;\r
- }\r
- m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;\r
- }\r
- }\r
- }\r
- }\r
-\r
- /**\r
- * Reverse the argument buffer\r
- * @param buffer to reverse\r
- */\r
- private void reverseBuffer(byte buffer[])\r
- {\r
- int start = m_utilFrenchStart_;\r
- int end = m_utilFrenchEnd_;\r
- while (start < end) {\r
- byte b = buffer[start];\r
- buffer[start ++] = buffer[end];\r
- buffer[end --] = b;\r
- }\r
- }\r
-\r
- /**\r
- * Insert the case shifting byte if required\r
- * @param caseshift value\r
- * @return new caseshift value\r
- */\r
- private final int doCaseShift(int caseshift)\r
- {\r
- if (caseshift == 0) {\r
- m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,\r
- SORT_CASE_BYTE_START_);\r
- m_utilBytesCount0_ ++;\r
- caseshift = SORT_CASE_SHIFT_START_;\r
- }\r
- return caseshift;\r
- }\r
-\r
- /**\r
- * Performs the casing sort\r
- * @param tertiary byte in ints for easy comparison\r
- * @param notIsContinuation flag indicating if the current bytes belong to\r
- * a continuation ce\r
- * @param caseshift\r
- * @return the new value of case shift\r
- */\r
- private final int doCaseBytes(int tertiary, boolean notIsContinuation,\r
- int caseshift)\r
- {\r
- caseshift = doCaseShift(caseshift);\r
-\r
- if (notIsContinuation && tertiary != 0) {\r
- byte casebits = (byte)(tertiary & 0xC0);\r
- if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {\r
- if (casebits == 0) {\r
- m_utilBytes0_[m_utilBytesCount0_ - 1]\r
- |= (1 << (-- caseshift));\r
- }\r
- else {\r
- // second bit\r
- caseshift = doCaseShift(caseshift - 1);\r
- m_utilBytes0_[m_utilBytesCount0_ - 1]\r
- |= ((casebits >> 6) & 1) << (-- caseshift);\r
- }\r
- }\r
- else {\r
- if (casebits != 0) {\r
- m_utilBytes0_[m_utilBytesCount0_ - 1]\r
- |= 1 << (-- caseshift);\r
- // second bit\r
- caseshift = doCaseShift(caseshift);\r
- m_utilBytes0_[m_utilBytesCount0_ - 1]\r
- |= ((casebits >> 7) & 1) << (-- caseshift);\r
- }\r
- else {\r
- caseshift --;\r
- }\r
- }\r
- }\r
-\r
- return caseshift;\r
- }\r
-\r
- /**\r
- * Gets the tertiary byte and adds it to the tertiary byte array\r
- * @param tertiary byte in int for easy comparison\r
- * @param notIsContinuation flag indicating if the current bytes belong to\r
- * a continuation ce\r
- */\r
- private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)\r
- {\r
- if (tertiary != 0) {\r
- // This is compression code.\r
- // sequence size check is included in the if clause\r
- if (tertiary == m_common3_ && notIsContinuation) {\r
- m_utilCount3_ ++;\r
- }\r
- else {\r
- int common3 = m_common3_ & LAST_BYTE_MASK_;\r
- if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {\r
- tertiary += m_addition3_;\r
- }\r
- else if (tertiary <= common3\r
- && m_common3_ == COMMON_UPPER_FIRST_3_) {\r
- tertiary -= m_addition3_;\r
- }\r
- if (m_utilCount3_ > 0) {\r
- if (tertiary > common3) {\r
- while (m_utilCount3_ > m_topCount3_) {\r
- m_utilBytes3_ = append(m_utilBytes3_,\r
- m_utilBytesCount3_,\r
- (byte)(m_top3_ - m_topCount3_));\r
- m_utilBytesCount3_ ++;\r
- m_utilCount3_ -= m_topCount3_;\r
- }\r
- m_utilBytes3_ = append(m_utilBytes3_,\r
- m_utilBytesCount3_,\r
- (byte)(m_top3_\r
- - (m_utilCount3_ - 1)));\r
- m_utilBytesCount3_ ++;\r
- }\r
- else {\r
- while (m_utilCount3_ > m_bottomCount3_) {\r
- m_utilBytes3_ = append(m_utilBytes3_,\r
- m_utilBytesCount3_,\r
- (byte)(m_bottom3_ + m_bottomCount3_));\r
- m_utilBytesCount3_ ++;\r
- m_utilCount3_ -= m_bottomCount3_;\r
- }\r
- m_utilBytes3_ = append(m_utilBytes3_,\r
- m_utilBytesCount3_,\r
- (byte)(m_bottom3_\r
- + (m_utilCount3_ - 1)));\r
- m_utilBytesCount3_ ++;\r
- }\r
- m_utilCount3_ = 0;\r
- }\r
- m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,\r
- (byte)tertiary);\r
- m_utilBytesCount3_ ++;\r
- }\r
- }\r
- }\r
-\r
- /**\r
- * Gets the Quaternary byte and adds it to the Quaternary byte array\r
- * @param isCodePointHiragana flag indicator if the previous codepoint\r
- * we dealt with was Hiragana\r
- * @param commonBottom4 smallest common Quaternary byte\r
- * @param bottomCount4 smallest Quaternary byte\r
- * @param hiragana4 hiragana Quaternary byte\r
- */\r
- private final void doQuaternaryBytes(boolean isCodePointHiragana,\r
- int commonBottom4, int bottomCount4,\r
- byte hiragana4)\r
- {\r
- if (isCodePointHiragana) { // This was Hiragana, need to note it\r
- if (m_utilCount4_ > 0) { // Close this part\r
- while (m_utilCount4_ > bottomCount4) {\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- (byte)(commonBottom4\r
- + bottomCount4));\r
- m_utilBytesCount4_ ++;\r
- m_utilCount4_ -= bottomCount4;\r
- }\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- (byte)(commonBottom4\r
- + (m_utilCount4_ - 1)));\r
- m_utilBytesCount4_ ++;\r
- m_utilCount4_ = 0;\r
- }\r
- m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,\r
- hiragana4); // Add the Hiragana\r
- m_utilBytesCount4_ ++;\r
- }\r
- else { // This wasn't Hiragana, so we can continue adding stuff\r
- m_utilCount4_ ++;\r
- }\r
- }\r
-\r
- /**\r
- * Iterates through the argument string for all ces.\r
- * Split the ces into their relevant primaries, secondaries etc.\r
- * @param source normalized string\r
- * @param doFrench flag indicator if special handling of French has to be\r
- * done\r
- * @param hiragana4 offset for Hiragana quaternary\r
- * @param commonBottom4 smallest common quaternary byte\r
- * @param bottomCount4 smallest quaternary byte\r
- */\r
- private final void getSortKeyBytes(String source, boolean doFrench,\r
- byte hiragana4, int commonBottom4,\r
- int bottomCount4)\r
-\r
- {\r
- if (m_srcUtilIter_ == null) {\r
- initUtility(true);\r
- }\r
- int backupDecomposition = getDecomposition();\r
- setDecomposition(NO_DECOMPOSITION); // have to revert to backup later\r
- m_srcUtilIter_.setText(source);\r
- m_srcUtilColEIter_.setText(m_srcUtilIter_);\r
- m_utilFrenchStart_ = -1;\r
- m_utilFrenchEnd_ = -1;\r
-\r
- // scriptorder not implemented yet\r
- // const uint8_t *scriptOrder = coll->scriptOrder;\r
-\r
- boolean doShift = false;\r
- boolean notIsContinuation = false;\r
-\r
- int leadPrimary = 0; // int for easier comparison\r
- int caseShift = 0;\r
-\r
- while (true) {\r
- int ce = m_srcUtilColEIter_.next();\r
- if (ce == CollationElementIterator.NULLORDER) {\r
- break;\r
- }\r
-\r
- if (ce == CollationElementIterator.IGNORABLE) {\r
- continue;\r
- }\r
-\r
- notIsContinuation = !isContinuation(ce);\r
-\r
- /*\r
- * if (notIsContinuation) {\r
- if (scriptOrder != NULL) {\r
- primary1 = scriptOrder[primary1];\r
- }\r
- }*/\r
- boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;\r
- // actually we can just check that the first byte is 0\r
- // generation stuffs the order left first\r
- boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)\r
- <= m_variableTopValue_;\r
- doShift = (m_isAlternateHandlingShifted_\r
- && ((notIsContinuation && isSmallerThanVariableTop\r
- && !isPrimaryByteIgnorable) // primary byte not 0\r
- || (!notIsContinuation && doShift))\r
- || (doShift && isPrimaryByteIgnorable));\r
- if (doShift && isPrimaryByteIgnorable) {\r
- // amendment to the UCA says that primary ignorables and other\r
- // ignorables should be removed if following a shifted code\r
- // point\r
- // if we were shifted and we got an ignorable code point\r
- // we should just completely ignore it\r
- continue;\r
- }\r
- leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,\r
- leadPrimary, commonBottom4,\r
- bottomCount4);\r
- if (doShift) {\r
- continue;\r
- }\r
- if (m_utilCompare2_) {\r
- doSecondaryBytes(ce, notIsContinuation, doFrench);\r
- }\r
-\r
- int t = ce & LAST_BYTE_MASK_;\r
- if (!notIsContinuation) {\r
- t = ce & CE_REMOVE_CONTINUATION_MASK_;\r
- }\r
-\r
- if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {\r
- // do the case level if we need to do it. We don't want to calculate\r
- // case level for primary ignorables if we have only primary strength and case level\r
- // otherwise we would break well formedness of CEs \r
- caseShift = doCaseBytes(t, notIsContinuation, caseShift);\r
- }\r
- else if (notIsContinuation) {\r
- t ^= m_caseSwitch_;\r
- }\r
-\r
- t &= m_mask3_;\r
-\r
- if (m_utilCompare3_) {\r
- doTertiaryBytes(t, notIsContinuation);\r
- }\r
-\r
- if (m_utilCompare4_ && notIsContinuation) { // compare quad\r
- doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,\r
- commonBottom4, bottomCount4, hiragana4);\r
- }\r
- }\r
- setDecomposition(backupDecomposition); // reverts to original\r
- if (m_utilFrenchStart_ != -1) {\r
- // one last round of checks\r
- reverseBuffer(m_utilBytes2_);\r
- }\r
- }\r
-\r
- /**\r
- * From the individual strength byte results the final compact sortkey\r
- * will be calculated.\r
- * @param source text string\r
- * @param doFrench flag indicating that special handling of French has to\r
- * be done\r
- * @param commonBottom4 smallest common quaternary byte\r
- * @param bottomCount4 smallest quaternary byte\r
- * @param key output RawCollationKey to store results, key cannot be null\r
- */\r
- private final void getSortKey(String source, boolean doFrench,\r
- int commonBottom4, \r
- int bottomCount4,\r
- RawCollationKey key)\r
- {\r
- // we have done all the CE's, now let's put them together to form\r
- // a key\r
- if (m_utilCompare2_) {\r
- doSecondary(doFrench);\r
- }\r
- // adding case level should be independent of secondary level\r
- if (m_utilCompare0_) {\r
- doCase();\r
- }\r
- if (m_utilCompare3_) {\r
- doTertiary();\r
- if (m_utilCompare4_) {\r
- doQuaternary(commonBottom4, bottomCount4);\r
- if (m_utilCompare5_) {\r
- doIdentical(source);\r
- }\r
-\r
- }\r
- }\r
- m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);\r
- m_utilBytesCount1_ ++;\r
-\r
- key.set(m_utilBytes1_, 0, m_utilBytesCount1_);\r
- }\r
-\r
    /**
     * Packs the French bytes.
     * Copies the collected secondary bytes (m_utilBytes2_) into the primary
     * buffer in REVERSED order (French accent ordering), run-length
     * compressing runs of the common secondary weight (COMMON_2_) as it goes:
     * a run followed by a byte above the common weight is encoded downward
     * from COMMON_TOP_2_, a run followed by a byte at/below it upward from
     * COMMON_BOTTOM_2_, so compressed keys still sort correctly bytewise.
     */
    private final void doFrench()
    {
        for (int i = 0; i < m_utilBytesCount2_; i ++) {
            // iterate the secondary bytes back to front
            byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
            // This is compression code.
            if (s == COMMON_2_) {
                ++ m_utilCount2_;
            }
            else {
                if (m_utilCount2_ > 0) {
                    // getting the unsigned value
                    if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
                        // not necessary for 4th level.
                        // flush the run downward from the top of the range
                        while (m_utilCount2_ > TOP_COUNT_2_) {
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
                            m_utilBytesCount1_ ++;
                            m_utilCount2_ -= TOP_COUNT_2_;
                        }
                        m_utilBytes1_ = append(m_utilBytes1_,
                                               m_utilBytesCount1_,
                                               (byte)(COMMON_TOP_2_
                                                      - (m_utilCount2_ - 1)));
                        m_utilBytesCount1_ ++;
                    }
                    else {
                        // flush the run upward from the bottom of the range
                        while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
                            m_utilBytesCount1_ ++;
                            m_utilCount2_ -= BOTTOM_COUNT_2_;
                        }
                        m_utilBytes1_ = append(m_utilBytes1_,
                                               m_utilBytesCount1_,
                                               (byte)(COMMON_BOTTOM_2_
                                                      + (m_utilCount2_ - 1)));
                        m_utilBytesCount1_ ++;
                    }
                    m_utilCount2_ = 0;
                }
                m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
                m_utilBytesCount1_ ++;
            }
        }
        // flush any run of common weights left at the end of the input
        if (m_utilCount2_ > 0) {
            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                                       (byte)(COMMON_BOTTOM_2_
                                              + BOTTOM_COUNT_2_));
                m_utilBytesCount1_ ++;
                m_utilCount2_ -= BOTTOM_COUNT_2_;
            }
            m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                                   (byte)(COMMON_BOTTOM_2_
                                          + (m_utilCount2_ - 1)));
            m_utilBytesCount1_ ++;
        }
    }
-\r
    /**
     * Compacts the secondary bytes and stores them into the primary array.
     * Flushes any pending run of common secondary weights into the secondary
     * buffer, writes the level separator, then either hands off to the
     * French (reversed) copy or block-copies the secondary bytes as-is.
     * @param doFrench flag indicator that French has to be handled specially
     */
    private final void doSecondary(boolean doFrench)
    {
        if (m_utilCount2_ > 0) {
            // run-length-encode the outstanding run of common weights
            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
                                       (byte)(COMMON_BOTTOM_2_
                                              + BOTTOM_COUNT_2_));
                m_utilBytesCount2_ ++;
                m_utilCount2_ -= BOTTOM_COUNT_2_;
            }
            m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
                                   (byte)(COMMON_BOTTOM_2_ +
                                          (m_utilCount2_ - 1)));
            m_utilBytesCount2_ ++;
        }

        // separator between the primary and secondary level bytes
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;

        if (doFrench) { // do the reverse copy
            doFrench();
        }
        else {
            // grow the primary buffer if the secondary bytes won't fit
            if (m_utilBytes1_.length <= m_utilBytesCount1_
                + m_utilBytesCount2_) {
                m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                         m_utilBytesCount2_);
            }
            System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
                             m_utilBytesCount1_, m_utilBytesCount2_);
            m_utilBytesCount1_ += m_utilBytesCount2_;
        }
    }
-\r
- /**\r
- * Increase buffer size\r
- * @param buffer array of bytes\r
- * @param size of the byte array\r
- * @param incrementsize size to increase\r
- * @return the new buffer\r
- */\r
- private static final byte[] increase(byte buffer[], int size,\r
- int incrementsize)\r
- {\r
- byte result[] = new byte[buffer.length + incrementsize];\r
- System.arraycopy(buffer, 0, result, 0, size);\r
- return result;\r
- }\r
-\r
- /**\r
- * Increase buffer size\r
- * @param buffer array of ints\r
- * @param size of the byte array\r
- * @param incrementsize size to increase\r
- * @return the new buffer\r
- */\r
- private static final int[] increase(int buffer[], int size,\r
- int incrementsize)\r
- {\r
- int result[] = new int[buffer.length + incrementsize];\r
- System.arraycopy(buffer, 0, result, 0, size);\r
- return result;\r
- }\r
-\r
    /**
     * Compacts the case bytes and stores them into the primary array.
     * Writes the level separator, grows the primary buffer if necessary,
     * then block-copies the collected case bytes (m_utilBytes0_) after it.
     */
    private final void doCase()
    {
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        // make room for the case bytes if the primary buffer is too small
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     m_utilBytesCount0_);
        }
        System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
                         m_utilBytesCount0_);
        m_utilBytesCount1_ += m_utilBytesCount0_;
    }
-\r
    /**
     * Compacts the tertiary bytes and stores them into the primary array.
     * Flushes any pending run of common tertiary weights — downward from
     * m_top3_ when case bits push the common weight up (note the deliberate
     * {@code >=} in that branch), upward from m_bottom3_ otherwise — then
     * appends the level separator and the tertiary bytes.
     */
    private final void doTertiary()
    {
        if (m_utilCount3_ > 0) {
            if (m_common3_ != COMMON_BOTTOM_3_) {
                // encode the run downward from the top of the range
                while (m_utilCount3_ >= m_topCount3_) {
                    m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                           (byte)(m_top3_ - m_topCount3_));
                    m_utilBytesCount3_ ++;
                    m_utilCount3_ -= m_topCount3_;
                }
                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                       (byte)(m_top3_ - m_utilCount3_));
                m_utilBytesCount3_ ++;
            }
            else {
                // encode the run upward from the bottom of the range
                while (m_utilCount3_ > m_bottomCount3_) {
                    m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                           (byte)(m_bottom3_
                                                  + m_bottomCount3_));
                    m_utilBytesCount3_ ++;
                    m_utilCount3_ -= m_bottomCount3_;
                }
                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                       (byte)(m_bottom3_
                                              + (m_utilCount3_ - 1)));
                m_utilBytesCount3_ ++;
            }
        }
        // separator, then copy the tertiary bytes into the primary buffer
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     m_utilBytesCount3_);
        }
        System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
                         m_utilBytesCount3_);
        m_utilBytesCount1_ += m_utilBytesCount3_;
    }
-\r
    /**
     * Compacts the quaternary bytes and stores them into the primary array.
     * Flushes any pending run of common quaternary weights upward from
     * commonbottom4, then appends the level separator and the collected
     * quaternary bytes.
     * @param commonbottom4 smallest common quaternary byte
     * @param bottomcount4 number of compressible values above commonbottom4
     */
    private final void doQuaternary(int commonbottom4, int bottomcount4)
    {
        if (m_utilCount4_ > 0) {
            // run-length-encode the outstanding run of common weights
            while (m_utilCount4_ > bottomcount4) {
                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                       (byte)(commonbottom4 + bottomcount4));
                m_utilBytesCount4_ ++;
                m_utilCount4_ -= bottomcount4;
            }
            m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                   (byte)(commonbottom4
                                          + (m_utilCount4_ - 1)));
            m_utilBytesCount4_ ++;
        }
        // separator, then copy the quaternary bytes into the primary buffer
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     m_utilBytesCount4_);
        }
        System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
                         m_utilBytesCount4_);
        m_utilBytesCount1_ += m_utilBytesCount4_;
    }
-\r
    /**
     * Deals with the identical sort.
     * Appends the BOCSU version of the source string to the ends of the
     * byte buffer, preceded by a level separator. BOCU.compress returns the
     * new total length, which becomes the new byte count.
     * @param source text string
     */
    private final void doIdentical(String source)
    {
        int isize = BOCU.getCompressionLength(source);
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        // reserve room for the compressed form up front
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     1 + isize);
        }
        m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
                                           m_utilBytesCount1_);
    }
-\r
    /**
     * Gets the offset of the first unmatched characters in source and target.
     * This method returns the offset of the start of a contraction or a
     * combining sequence, if the first difference is in the middle of such a
     * sequence.
     * @param source string
     * @param target string
     * @return offset of the first unmatched characters in source and target.
     */
    private final int getFirstUnmatchedOffset(String source, String target)
    {
        int result = 0;
        int slength = source.length();
        int tlength = target.length();
        int minlength = slength;
        if (minlength > tlength) {
            minlength = tlength;
        }
        // advance past the identical prefix
        while (result < minlength
               && source.charAt(result) == target.charAt(result)) {
            result ++;
        }
        if (result > 0) {
            // There is an identical portion at the beginning of the two
            // strings. If the identical portion ends within a contraction or a
            // combining character sequence, back up to the start of that
            // sequence.
            char schar = 0;
            char tchar = 0;
            if (result < minlength) {
                schar = source.charAt(result); // first differing chars
                tchar = target.charAt(result);
            }
            else {
                // one string is a prefix of the other; inspect the char just
                // before the end, or the first extra char of the longer one
                schar = source.charAt(minlength - 1);
                if (isUnsafe(schar)) {
                    tchar = schar;
                }
                else if (slength == tlength) {
                    return result;
                }
                else if (slength < tlength) {
                    tchar = target.charAt(result);
                }
                else {
                    schar = source.charAt(result);
                }
            }
            if (isUnsafe(schar) || isUnsafe(tchar))
            {
                // We are stopped in the middle of a contraction or combining
                // sequence.
                // Look backwards for the part of the string for the start of
                // the sequence
                // It doesn't matter which string we scan, since they are the
                // same in this region.
                do {
                    result --;
                }
                while (result > 0 && isUnsafe(source.charAt(result)));
            }
        }
        return result;
    }
-\r
- /**\r
- * Appending an byte to an array of bytes and increases it if we run out of\r
- * space\r
- * @param array of byte arrays\r
- * @param appendindex index in the byte array to append\r
- * @param value to append\r
- * @return array if array size can accomodate the new value, otherwise\r
- * a bigger array will be created and returned\r
- */\r
- private static final byte[] append(byte array[], int appendindex,\r
- byte value)\r
- {\r
- try {\r
- array[appendindex] = value;\r
- }\r
- catch (ArrayIndexOutOfBoundsException e) {\r
- array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);\r
- array[appendindex] = value;\r
- }\r
- return array;\r
- }\r
-\r
- /**\r
- * This is a trick string compare function that goes in and uses sortkeys\r
- * to compare. It is used when compare gets in trouble and needs to bail\r
- * out.\r
- * @param source text string\r
- * @param target text string\r
- */\r
- private final int compareBySortKeys(String source, String target)\r
-\r
- {\r
- m_utilRawCollationKey_ = getRawCollationKey(source, \r
- m_utilRawCollationKey_);\r
- // this method is very seldom called\r
- RawCollationKey targetkey = getRawCollationKey(target, null);\r
- return m_utilRawCollationKey_.compareTo(targetkey);\r
- }\r
-\r
    /**
     * Performs the primary comparisons, and fills up the CE buffer at the
     * same time.
     * The return value toggles between the comparison result and the hiragana
     * result. If either the source is greater than target or vice versa, the
     * return result is the comparison result, ie 1 or -1, furthermore the
     * cebuffers will be cleared when that happens. If the primary comparisons
     * are equal, we'll have to continue with secondary comparison. In this case
     * the cebuffer will not be cleared and the return result will be the
     * hiragana result.
     * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
     *                    observed
     * @param lowestpvalue the lowest primary value that will not be ignored if
     *                     alternate handling is shifted
     * @param source text string
     * @param target text string
     * @param textoffset offset in text to start the comparison
     * @return comparison result if a primary difference is found, otherwise
     *         hiragana result
     */
    private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
                                       String source, String target,
                                       int textoffset)
    {
        // Preparing the context objects for iterating over strings
        m_srcUtilIter_.setText(source);
        m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
        m_tgtUtilIter_.setText(target);
        m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);

        // Non shifted primary processing is quite simple
        if (!m_isAlternateHandlingShifted_) {
            int hiraganaresult = 0;
            while (true) {
                int sorder = 0;
                // We fetch CEs until we hit a non ignorable primary or end.
                // Every fetched CE (including ignorables) is buffered for the
                // weaker-level passes that may follow.
                do {
                    sorder = m_srcUtilColEIter_.next();
                    m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
                                                m_srcUtilCEBufferSize_, sorder);
                    m_srcUtilCEBufferSize_ ++;
                    sorder &= CE_PRIMARY_MASK_;
                } while (sorder == CollationElementIterator.IGNORABLE);

                int torder = 0;
                do {
                    torder = m_tgtUtilColEIter_.next();
                    m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
                                                m_tgtUtilCEBufferSize_, torder);
                    m_tgtUtilCEBufferSize_ ++;
                    torder &= CE_PRIMARY_MASK_;
                } while (torder == CollationElementIterator.IGNORABLE);

                // if both primaries are the same
                if (sorder == torder) {
                    // and there are no more CEs, we advance to the next level
                    // see if we are at the end of either string
                    if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                        if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
                            != CollationElementIterator.NULLORDER) {
                            return -1;
                        }
                        break;
                    }
                    else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
                             == CollationElementIterator.NULLORDER) {
                        return 1;
                    }
                    // remember the first hiragana mismatch as a weak
                    // tie-breaker, returned only if primaries stay equal
                    if (doHiragana4 && hiraganaresult == 0
                        && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
                        m_tgtUtilColEIter_.m_isCodePointHiragana_) {
                        if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
                            hiraganaresult = -1;
                        }
                        else {
                            hiraganaresult = 1;
                        }
                    }
                }
                else {
                    // if two primaries are different, we are done
                    return endPrimaryCompare(sorder, torder);
                }
            }
            // no primary difference... do the rest from the buffers
            return hiraganaresult;
        }
        else { // shifted - do a slightly more complicated processing :)
            while (true) {
                int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
                                                        lowestpvalue, true);
                int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
                                                        lowestpvalue, false);
                if (sorder == torder) {
                    if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                        break;
                    }
                    else {
                        continue;
                    }
                }
                else {
                    return endPrimaryCompare(sorder, torder);
                }
            } // no primary difference... do the rest from the buffers
        }
        return 0;
    }
-\r
- /**\r
- * This is used only for primary strength when we know that sorder is\r
- * already different from torder.\r
- * Compares sorder and torder, returns -1 if sorder is less than torder.\r
- * Clears the cebuffer at the same time.\r
- * @param sorder source strength order\r
- * @param torder target strength order\r
- * @return the comparison result of sorder and torder\r
- */\r
- private final int endPrimaryCompare(int sorder, int torder)\r
- {\r
- // if we reach here, the ce offset accessed is the last ce\r
- // appended to the buffer\r
- boolean isSourceNullOrder = (m_srcUtilCEBuffer_[\r
- m_srcUtilCEBufferSize_ - 1]\r
- == CollationElementIterator.NULLORDER);\r
- boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[\r
- m_tgtUtilCEBufferSize_ - 1]\r
- == CollationElementIterator.NULLORDER);\r
- m_srcUtilCEBufferSize_ = -1;\r
- m_tgtUtilCEBufferSize_ = -1;\r
- if (isSourceNullOrder) {\r
- return -1;\r
- }\r
- if (isTargetNullOrder) {\r
- return 1;\r
- }\r
- // getting rid of the sign\r
- sorder >>>= CE_PRIMARY_SHIFT_;\r
- torder >>>= CE_PRIMARY_SHIFT_;\r
- if (sorder < torder) {\r
- return -1;\r
- }\r
- return 1;\r
- }\r
-\r
    /**
     * Calculates the next primary shifted value and fills up cebuffer with the
     * next non-ignorable ce.
     * @param coleiter collation element iterator
     * @param lowestpvalue lowest primary shifted value that will not be
     *                     ignored
     * @param isSrc true to accumulate into the source CE buffer, false for
     *              the target buffer
     * @return result next modified ce (primary weight only)
     */
    private final int getPrimaryShiftedCompareCE(
                                                 CollationElementIterator coleiter,
                                                 int lowestpvalue, boolean isSrc)
    {
        boolean shifted = false;
        int result = CollationElementIterator.IGNORABLE;
        // work on a local alias of the buffer; written back on exit
        int cebuffer[] = m_srcUtilCEBuffer_;
        int cebuffersize = m_srcUtilCEBufferSize_;
        if (!isSrc) {
            cebuffer = m_tgtUtilCEBuffer_;
            cebuffersize = m_tgtUtilCEBufferSize_;
        }
        while (true) {
            result = coleiter.next();
            if (result == CollationElementIterator.NULLORDER) {
                cebuffer = append(cebuffer, cebuffersize, result);
                cebuffersize ++;
                break;
            }
            else if (result == CollationElementIterator.IGNORABLE
                     || (shifted
                         && (result & CE_PRIMARY_MASK_)
                         == CollationElementIterator.IGNORABLE)) {
                // UCA amendment - ignore ignorables that follow shifted code
                // points
                continue;
            }
            else if (isContinuation(result)) {
                if ((result & CE_PRIMARY_MASK_)
                    != CollationElementIterator.IGNORABLE) {
                    // There is primary value
                    if (shifted) {
                        result = (result & CE_PRIMARY_MASK_)
                            | CE_CONTINUATION_MARKER_;
                        // preserve interesting continuation
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        continue;
                    }
                    else {
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        break;
                    }
                }
                else { // Just lower level values
                    // keep them for the weaker levels unless shifted
                    if (!shifted) {
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                    }
                }
            }
            else { // regular
                if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
                                            lowestpvalue) > 0) {
                    // above the variable-top: a real primary, stop here
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize ++;
                    break;
                }
                else {
                    if ((result & CE_PRIMARY_MASK_) != 0) {
                        // variable CE: shift its primary to the quaternary
                        // level and keep scanning
                        shifted = true;
                        result &= CE_PRIMARY_MASK_;
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        continue;
                    }
                    else {
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        shifted = false;
                        continue;
                    }
                }
            }
        }
        // write the (possibly re-allocated) buffer and size back
        if (isSrc) {
            m_srcUtilCEBuffer_ = cebuffer;
            m_srcUtilCEBufferSize_ = cebuffersize;
        }
        else {
            m_tgtUtilCEBuffer_ = cebuffer;
            m_tgtUtilCEBufferSize_ = cebuffersize;
        }
        result &= CE_PRIMARY_MASK_;
        return result;
    }
-\r
- /**\r
- * Appending an int to an array of ints and increases it if we run out of\r
- * space\r
- * @param array of int arrays\r
- * @param appendindex index at which value will be appended\r
- * @param value to append\r
- * @return array if size is not increased, otherwise a new array will be\r
- * returned\r
- */\r
- private static final int[] append(int array[], int appendindex, int value)\r
- {\r
- if (appendindex + 1 >= array.length) {\r
- array = increase(array, appendindex, CE_BUFFER_SIZE_);\r
- }\r
- array[appendindex] = value;\r
- return array;\r
- }\r
-\r
    /**
     * Does secondary strength comparison based on the collected ces.
     * @param doFrench flag indicates if French ordering is to be done
     * @return the secondary strength comparison result: -1, 0 or 1
     */
    private final int doSecondaryCompare(boolean doFrench)
    {
        // now, we're gonna reexamine collected CEs
        if (!doFrench) { // normal
            int soffset = 0;
            int toffset = 0;
            while (true) {
                // skip CEs whose secondary weight is ignorable; the buffers
                // end with NULLORDER, whose secondary bits are non-zero, so
                // these loops always terminate
                int sorder = CollationElementIterator.IGNORABLE;
                while (sorder == CollationElementIterator.IGNORABLE) {
                    sorder = m_srcUtilCEBuffer_[soffset ++]
                        & CE_SECONDARY_MASK_;
                }
                int torder = CollationElementIterator.IGNORABLE;
                while (torder == CollationElementIterator.IGNORABLE) {
                    torder = m_tgtUtilCEBuffer_[toffset ++]
                        & CE_SECONDARY_MASK_;
                }

                if (sorder == torder) {
                    // equal secondaries: check whether either side ended
                    if (m_srcUtilCEBuffer_[soffset - 1]
                        == CollationElementIterator.NULLORDER) {
                        if (m_tgtUtilCEBuffer_[toffset - 1]
                            != CollationElementIterator.NULLORDER) {
                            return -1;
                        }
                        break;
                    }
                    else if (m_tgtUtilCEBuffer_[toffset - 1]
                             == CollationElementIterator.NULLORDER) {
                        return 1;
                    }
                }
                else {
                    // a finished string sorts before anything still pending
                    if (m_srcUtilCEBuffer_[soffset - 1] ==
                        CollationElementIterator.NULLORDER) {
                        return -1;
                    }
                    if (m_tgtUtilCEBuffer_[toffset - 1] ==
                        CollationElementIterator.NULLORDER) {
                        return 1;
                    }
                    return (sorder < torder) ? -1 : 1;
                }
            }
        }
        else { // do the French
            // walk the CE buffers backwards (skipping the trailing
            // NULLORDER), tracking continuations separately
            m_srcUtilContOffset_ = 0;
            m_tgtUtilContOffset_ = 0;
            m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
            m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
            while (true) {
                int sorder = getSecondaryFrenchCE(true);
                int torder = getSecondaryFrenchCE(false);
                if (sorder == torder) {
                    if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
                        || (m_srcUtilOffset_ >= 0
                            && m_srcUtilCEBuffer_[m_srcUtilOffset_]
                            == CollationElementIterator.NULLORDER)) {
                        break;
                    }
                }
                else {
                    return (sorder < torder) ? -1 : 1;
                }
            }
        }
        return 0;
    }
-\r
    /**
     * Calculates the next secondary french CE.
     * Walks the CE buffer backwards, but continuation sequences must still be
     * read forwards, so on hitting a continuation the walk jumps to the
     * sequence start and replays it left-to-right before resuming.
     * @param isSrc flag indicator if we are calculating the src ces
     * @return result next modified ce (secondary weight only)
     */
    private final int getSecondaryFrenchCE(boolean isSrc)
    {
        int result = CollationElementIterator.IGNORABLE;
        // work on local aliases of the side's cursor state; written back below
        int offset = m_srcUtilOffset_;
        int continuationoffset = m_srcUtilContOffset_;
        int cebuffer[] = m_srcUtilCEBuffer_;
        if (!isSrc) {
            offset = m_tgtUtilOffset_;
            continuationoffset = m_tgtUtilContOffset_;
            cebuffer = m_tgtUtilCEBuffer_;
        }

        while (result == CollationElementIterator.IGNORABLE
               && offset >= 0) {
            if (continuationoffset == 0) {
                result = cebuffer[offset];
                // back up over any continuation CEs to their base CE
                while (isContinuation(cebuffer[offset --])){
                }
                // after this, sorder is at the start of continuation,
                // and offset points before that
                if (isContinuation(cebuffer[offset + 1])) {
                    // save offset for later
                    continuationoffset = offset;
                    offset += 2;
                }
            }
            else {
                // replaying a continuation sequence forwards
                result = cebuffer[offset ++];
                if (!isContinuation(result)) {
                    // we have finished with this continuation
                    offset = continuationoffset;
                    // reset the pointer to before continuation
                    continuationoffset = 0;
                    continue;
                }
            }
            result &= CE_SECONDARY_MASK_; // remove continuation bit
        }
        if (isSrc) {
            m_srcUtilOffset_ = offset;
            m_srcUtilContOffset_ = continuationoffset;
        }
        else {
            m_tgtUtilOffset_ = offset;
            m_tgtUtilContOffset_ = continuationoffset;
        }
        return result;
    }
-\r
- /**\r
- * Does case strength comparison based on the collected ces.\r
- * @return the case strength comparison result\r
- */\r
- private final int doCaseCompare()\r
- {\r
- int soffset = 0;\r
- int toffset = 0;\r
- while (true) {\r
- int sorder = CollationElementIterator.IGNORABLE;\r
- int torder = CollationElementIterator.IGNORABLE;\r
- while ((sorder & CE_REMOVE_CASE_)\r
- == CollationElementIterator.IGNORABLE) {\r
- sorder = m_srcUtilCEBuffer_[soffset ++];\r
- if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {\r
- // primary ignorables should not be considered on the case level when the strength is primary\r
- // otherwise, the CEs stop being well-formed\r
- sorder &= CE_CASE_MASK_3_;\r
- sorder ^= m_caseSwitch_;\r
- }\r
- else {\r
- sorder = CollationElementIterator.IGNORABLE;\r
- }\r
- }\r
-\r
- while ((torder & CE_REMOVE_CASE_)\r
- == CollationElementIterator.IGNORABLE) {\r
- torder = m_tgtUtilCEBuffer_[toffset ++];\r
- if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {\r
- // primary ignorables should not be considered on the case level when the strength is primary\r
- // otherwise, the CEs stop being well-formed\r
- torder &= CE_CASE_MASK_3_;\r
- torder ^= m_caseSwitch_;\r
- }\r
- else {\r
- torder = CollationElementIterator.IGNORABLE;\r
- }\r
- }\r
-\r
- sorder &= CE_CASE_BIT_MASK_;\r
- torder &= CE_CASE_BIT_MASK_;\r
- if (sorder == torder) {\r
- // checking end of strings\r
- if (m_srcUtilCEBuffer_[soffset - 1]\r
- == CollationElementIterator.NULLORDER) {\r
- if (m_tgtUtilCEBuffer_[toffset - 1] \r
- != CollationElementIterator.NULLORDER) {\r
- return -1;\r
- }\r
- break;\r
- }\r
- else if (m_tgtUtilCEBuffer_[toffset - 1]\r
- == CollationElementIterator.NULLORDER) {\r
- return 1;\r
- }\r
- }\r
- else {\r
- if (m_srcUtilCEBuffer_[soffset - 1]\r
- == CollationElementIterator.NULLORDER) {\r
- return -1;\r
- }\r
- if (m_tgtUtilCEBuffer_[soffset - 1]\r
- == CollationElementIterator.NULLORDER) {\r
- return 1;\r
- }\r
- return (sorder < torder) ? -1 : 1;\r
- }\r
- }\r
- return 0;\r
- }\r
-\r
    /**
     * Does tertiary strength comparison based on the collected ces.
     * @return the tertiary strength comparison result: -1, 0 or 1
     */
    private final int doTertiaryCompare()
    {
        int soffset = 0;
        int toffset = 0;
        while (true) {
            int sorder = CollationElementIterator.IGNORABLE;
            int torder = CollationElementIterator.IGNORABLE;
            // fetch the next source CE with a non-ignorable tertiary weight;
            // case bits are flipped on non-continuations, stripped otherwise
            while ((sorder & CE_REMOVE_CASE_)
                   == CollationElementIterator.IGNORABLE) {
                sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
                if (!isContinuation(sorder)) {
                    sorder ^= m_caseSwitch_;
                }
                else {
                    sorder &= CE_REMOVE_CASE_;
                }
            }

            // same for the target side
            while ((torder & CE_REMOVE_CASE_)
                   == CollationElementIterator.IGNORABLE) {
                torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
                if (!isContinuation(torder)) {
                    torder ^= m_caseSwitch_;
                }
                else {
                    torder &= CE_REMOVE_CASE_;
                }
            }

            if (sorder == torder) {
                // equal tertiaries: check whether either side ended
                if (m_srcUtilCEBuffer_[soffset - 1]
                    == CollationElementIterator.NULLORDER) {
                    if (m_tgtUtilCEBuffer_[toffset - 1]
                        != CollationElementIterator.NULLORDER) {
                        return -1;
                    }
                    break;
                }
                else if (m_tgtUtilCEBuffer_[toffset - 1]
                         == CollationElementIterator.NULLORDER) {
                    return 1;
                }
            }
            else {
                if (m_srcUtilCEBuffer_[soffset - 1] ==
                    CollationElementIterator.NULLORDER) {
                    return -1;
                }
                if (m_tgtUtilCEBuffer_[toffset - 1] ==
                    CollationElementIterator.NULLORDER) {
                    return 1;
                }
                return (sorder < torder) ? -1 : 1;
            }
        }
        return 0;
    }
-\r
    /**
     * Does quaternary strength comparison based on the collected ces.
     * Shifted (variable) CEs contribute their primary weight at this level;
     * everything else contributes the maximum weight so that only variable
     * weights differentiate here.
     * @param lowestpvalue the lowest primary value that will not be ignored if
     *                     alternate handling is shifted
     * @return the quaternary strength comparison result: -1, 0 or 1
     */
    private final int doQuaternaryCompare(int lowestpvalue)
    {
        boolean sShifted = true;
        boolean tShifted = true;
        int soffset = 0;
        int toffset = 0;
        while (true) {
            int sorder = CollationElementIterator.IGNORABLE;
            int torder = CollationElementIterator.IGNORABLE;
            // next source quaternary: skip ignorables and continuations of
            // non-shifted CEs
            while (sorder == CollationElementIterator.IGNORABLE
                   || (isContinuation(sorder) && !sShifted)) {
                sorder = m_srcUtilCEBuffer_[soffset ++];
                if (isContinuation(sorder)) {
                    if (!sShifted) {
                        continue;
                    }
                }
                else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
                         || (sorder & CE_PRIMARY_MASK_)
                         == CollationElementIterator.IGNORABLE) {
                    // non continuation
                    // non-variable CE: counts as the maximum weight here
                    sorder = CE_PRIMARY_MASK_;
                    sShifted = false;
                }
                else {
                    sShifted = true;
                }
            }
            sorder >>>= CE_PRIMARY_SHIFT_;
            // same for the target side
            while (torder == CollationElementIterator.IGNORABLE
                   || (isContinuation(torder) && !tShifted)) {
                torder = m_tgtUtilCEBuffer_[toffset ++];
                if (isContinuation(torder)) {
                    if (!tShifted) {
                        continue;
                    }
                }
                else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
                         || (torder & CE_PRIMARY_MASK_)
                         == CollationElementIterator.IGNORABLE) {
                    // non continuation
                    torder = CE_PRIMARY_MASK_;
                    tShifted = false;
                }
                else {
                    tShifted = true;
                }
            }
            torder >>>= CE_PRIMARY_SHIFT_;

            if (sorder == torder) {
                // equal quaternaries: check whether either side ended
                if (m_srcUtilCEBuffer_[soffset - 1]
                    == CollationElementIterator.NULLORDER) {
                    if (m_tgtUtilCEBuffer_[toffset - 1]
                        != CollationElementIterator.NULLORDER) {
                        return -1;
                    }
                    break;
                }
                else if (m_tgtUtilCEBuffer_[toffset - 1]
                         == CollationElementIterator.NULLORDER) {
                    return 1;
                }
            }
            else {
                if (m_srcUtilCEBuffer_[soffset - 1] ==
                    CollationElementIterator.NULLORDER) {
                    return -1;
                }
                if (m_tgtUtilCEBuffer_[toffset - 1] ==
                    CollationElementIterator.NULLORDER) {
                    return 1;
                }
                return (sorder < torder) ? -1 : 1;
            }
        }
        return 0;
    }
-\r
    /**
     * Internal function. Does byte level string compare. Used by strcoll if
     * strength == identical and strings are otherwise equal. This is a rare
     * case. Comparison must be done on NFD normalized strings. FCD is not good
     * enough.
     * @param source text
     * @param target text
     * @param offset of the first difference in the text strings
     * @param normalize flag indicating if we are to normalize the text before
     *                  comparison
     * @return 1 if source is greater than target, -1 less than and 0 if equals
     */
    private static final int doIdenticalCompare(String source, String target,
                                                int offset, boolean normalize)
    {
        if (normalize) {
            // NFD-normalize each side that is not already NFD; the prefix
            // match found earlier is void after normalization, so restart
            // the comparison from offset 0
            if (Normalizer.quickCheck(source, Normalizer.NFD,0)
                != Normalizer.YES) {
                source = Normalizer.decompose(source, false);
            }

            if (Normalizer.quickCheck(target, Normalizer.NFD,0)
                != Normalizer.YES) {
                target = Normalizer.decompose(target, false);
            }
            offset = 0;
        }

        return doStringCompare(source, target, offset);
    }
-\r
- /**\r
- * Compares string for their codepoint order.\r
- * This comparison handles surrogate characters and place them after the\r
- * all non surrogate characters.\r
- * @param source text\r
- * @param target text\r
- * @param offset start offset for comparison\r
- * @return 1 if source is greater than target, -1 less than and 0 if equals\r
- */\r
- private static final int doStringCompare(String source,\r
- String target,\r
- int offset)\r
- {\r
- // compare identical prefixes - they do not need to be fixed up\r
- char schar = 0;\r
- char tchar = 0;\r
- int slength = source.length();\r
- int tlength = target.length();\r
- int minlength = Math.min(slength, tlength);\r
- while (offset < minlength) {\r
- schar = source.charAt(offset);\r
- tchar = target.charAt(offset ++);\r
- if (schar != tchar) {\r
- break;\r
- }\r
- }\r
-\r
- if (schar == tchar && offset == minlength) {\r
- if (slength > minlength) {\r
- return 1;\r
- }\r
- if (tlength > minlength) {\r
- return -1;\r
- }\r
- return 0;\r
- }\r
-\r
- // if both values are in or above the surrogate range, Fix them up.\r
- if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE\r
- && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {\r
- schar = fixupUTF16(schar);\r
- tchar = fixupUTF16(tchar);\r
- }\r
-\r
- // now c1 and c2 are in UTF-32-compatible order\r
- return (schar < tchar) ? -1 : 1; // schar and tchar has to be different\r
- }\r
-\r
- /**\r
- * Rotate surrogates to the top to get code point order\r
- */\r
- private static final char fixupUTF16(char ch)\r
- {\r
- if (ch >= 0xe000) {\r
- ch -= 0x800;\r
- }\r
- else {\r
- ch += 0x2000;\r
- }\r
- return ch;\r
- }\r
-\r
    /**
     * Resets the internal case data members and compression values.
     * Derives the case switch, tertiary masks/bounds, tertiary compression
     * counts, the fast "simple tertiary" flag, and decides whether the
     * latin-1 fast path can be used for the current attribute settings.
     */
    private void updateInternalState()
    {
        if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
            m_caseSwitch_ = CASE_SWITCH_;
        }
        else {
            m_caseSwitch_ = NO_CASE_SWITCH_;
        }

        if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
            // case is handled on its own level (or not at all), so strip
            // case bits from tertiary weights
            m_mask3_ = CE_REMOVE_CASE_;
            m_common3_ = COMMON_NORMAL_3_;
            m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
            m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
            m_bottom3_ = COMMON_BOTTOM_3_;
        }
        else {
            // case is folded into the tertiary level
            m_mask3_ = CE_KEEP_CASE_;
            m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
            if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
                m_common3_ = COMMON_UPPER_FIRST_3_;
                m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
                m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
            } else {
                m_common3_ = COMMON_NORMAL_3_;
                m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
                m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
            }
        }

        // Set the compression values
        int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
        // we multiply double with int, but need only int
        m_topCount3_ = (int)(PROPORTION_3_ * total3);
        m_bottomCount3_ = total3 - m_topCount3_;

        if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
            && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
            m_isSimple3_ = true;
        }
        else {
            m_isSimple3_ = false;
        }
        // the latin-1 fast path only applies to plain, non-numeric,
        // non-shifted collation up to tertiary strength; once building the
        // table fails we never try again
        if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
           && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
            if(latinOneCEs_ == null || latinOneRegenTable_) {
                if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
                    latinOneUse_ = true;
                } else {
                    latinOneUse_ = false;
                    latinOneFailed_ = true;
                }
                latinOneRegenTable_ = false;
            } else { // latin1Table exists and it doesn't need to be regenerated, just use it
                latinOneUse_ = true;
            }
        } else {
            latinOneUse_ = false;
        }

    }
-\r
- /**\r
- * Initializes the RuleBasedCollator\r
- */\r
- private final void init()\r
- {\r
- for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;\r
- m_minUnsafe_ ++) {\r
- // Find the smallest unsafe char.\r
- if (isUnsafe(m_minUnsafe_)) {\r
- break;\r
- }\r
- }\r
-\r
- for (m_minContractionEnd_ = 0;\r
- m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;\r
- m_minContractionEnd_ ++) {\r
- // Find the smallest contraction-ending char.\r
- if (isContractionEnd(m_minContractionEnd_)) {\r
- break;\r
- }\r
- }\r
- latinOneFailed_ = true;\r
- setStrength(m_defaultStrength_);\r
- setDecomposition(m_defaultDecomposition_);\r
- m_variableTopValue_ = m_defaultVariableTopValue_;\r
- m_isFrenchCollation_ = m_defaultIsFrenchCollation_;\r
- m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;\r
- m_isCaseLevel_ = m_defaultIsCaseLevel_;\r
- m_caseFirst_ = m_defaultCaseFirst_;\r
- m_isHiragana4_ = m_defaultIsHiragana4_;\r
- m_isNumericCollation_ = m_defaultIsNumericCollation_;\r
- latinOneFailed_ = false;\r
- updateInternalState();\r
- }\r
-\r
- /**\r
- * Initializes utility iterators and byte buffer used by compare\r
- */\r
- private final void initUtility(boolean allocate) {\r
- if (allocate) {\r
- if (m_srcUtilIter_ == null) {\r
- m_srcUtilIter_ = new StringUCharacterIterator();\r
- m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this);\r
- m_tgtUtilIter_ = new StringUCharacterIterator();\r
- m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this);\r
- m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case\r
- m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary\r
- m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary\r
- m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary\r
- m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary\r
- m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];\r
- m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];\r
- }\r
- } else {\r
- m_srcUtilIter_ = null;\r
- m_srcUtilColEIter_ = null;\r
- m_tgtUtilIter_ = null;\r
- m_tgtUtilColEIter_ = null;\r
- m_utilBytes0_ = null;\r
- m_utilBytes1_ = null;\r
- m_utilBytes2_ = null;\r
- m_utilBytes3_ = null;\r
- m_utilBytes4_ = null;\r
- m_srcUtilCEBuffer_ = null;\r
- m_tgtUtilCEBuffer_ = null;\r
- }\r
- }\r
-\r
- // Consts for Latin-1 special processing\r
- private static final int ENDOFLATINONERANGE_ = 0xFF;\r
- private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50);\r
- private static final int BAIL_OUT_CE_ = 0xFF000000;\r
-\r
- /**\r
- * Generate latin-1 tables\r
- */\r
-\r
- private class shiftValues {\r
- int primShift = 24;\r
- int secShift = 24;\r
- int terShift = 24;\r
- }\r
-\r
- private final void\r
- addLatinOneEntry(char ch, int CE, shiftValues sh) {\r
- int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;\r
- boolean reverseSecondary = false;\r
- if(!isContinuation(CE)) {\r
- tertiary = ((CE & m_mask3_));\r
- tertiary ^= m_caseSwitch_;\r
- reverseSecondary = true;\r
- } else {\r
- tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));\r
- tertiary &= CE_REMOVE_CASE_;\r
- reverseSecondary = false;\r
- }\r
-\r
- secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);\r
- primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);\r
- primary1 = (CE >>> 8);\r
-\r
- if(primary1 != 0) {\r
- latinOneCEs_[ch] |= (primary1 << sh.primShift);\r
- sh.primShift -= 8;\r
- }\r
- if(primary2 != 0) {\r
- if(sh.primShift < 0) {\r
- latinOneCEs_[ch] = BAIL_OUT_CE_;\r
- latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;\r
- latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;\r
- return;\r
- }\r
- latinOneCEs_[ch] |= (primary2 << sh.primShift);\r
- sh.primShift -= 8;\r
- }\r
- if(secondary != 0) {\r
- if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary\r
- latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary\r
- latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);\r
- } else { // normal case\r
- latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);\r
- }\r
- sh.secShift -= 8;\r
- }\r
- if(tertiary != 0) {\r
- latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);\r
- sh.terShift -= 8;\r
- }\r
- }\r
-\r
- private final void\r
- resizeLatinOneTable(int newSize) {\r
- int newTable[] = new int[3*newSize];\r
- int sizeToCopy = ((newSize<latinOneTableLen_)?newSize:latinOneTableLen_);\r
- //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.\r
- System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);\r
- System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);\r
- System.arraycopy(latinOneCEs_, 2*latinOneTableLen_, newTable, 2*newSize, sizeToCopy);\r
- latinOneTableLen_ = newSize;\r
- latinOneCEs_ = newTable;\r
- }\r
-\r
- private final boolean setUpLatinOne() {\r
- if(latinOneCEs_ == null || m_reallocLatinOneCEs_) {\r
- latinOneCEs_ = new int[3*LATINONETABLELEN_];\r
- latinOneTableLen_ = LATINONETABLELEN_;\r
- m_reallocLatinOneCEs_ = false;\r
- } else {\r
- Arrays.fill(latinOneCEs_, 0);\r
- }\r
- if(m_ContInfo_ == null) {\r
- m_ContInfo_ = new ContractionInfo();\r
- }\r
- char ch = 0;\r
- //StringBuffer sCh = new StringBuffer();\r
- //CollationElementIterator it = getCollationElementIterator(sCh.toString());\r
- CollationElementIterator it = getCollationElementIterator("");\r
-\r
- shiftValues s = new shiftValues();\r
- int CE = 0;\r
- char contractionOffset = ENDOFLATINONERANGE_+1;\r
-\r
- for(ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {\r
- s.primShift = 24; s.secShift = 24; s.terShift = 24;\r
- if(ch < 0x100) {\r
- CE = m_trie_.getLatin1LinearValue(ch);\r
- } else {\r
- CE = m_trie_.getLeadValue(ch);\r
- if(CE == CollationElementIterator.CE_NOT_FOUND_) {\r
- CE = UCA_.m_trie_.getLeadValue(ch);\r
- }\r
- }\r
- if(!isSpecial(CE)) {\r
- addLatinOneEntry(ch, CE, s);\r
- } else {\r
- switch (RuleBasedCollator.getTag(CE)) {\r
- case CollationElementIterator.CE_EXPANSION_TAG_:\r
- case CollationElementIterator.CE_DIGIT_TAG_:\r
- //sCh.delete(0, sCh.length());\r
- //sCh.append(ch);\r
- //it.setText(sCh.toString());\r
- it.setText(UCharacter.toString(ch));\r
- while((CE = it.next()) != CollationElementIterator.NULLORDER) {\r
- if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {\r
- latinOneCEs_[ch] = BAIL_OUT_CE_;\r
- latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;\r
- latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;\r
- break;\r
- }\r
- addLatinOneEntry(ch, CE, s);\r
- }\r
- break;\r
- case CollationElementIterator.CE_CONTRACTION_TAG_:\r
- // here is the trick\r
- // F2 is contraction. We do something very similar to contractions\r
- // but have two indices, one in the real contraction table and the\r
- // other to where we stuffed things. This hopes that we don't have\r
- // many contractions (this should work for latin-1 tables).\r
- {\r
- if((CE & 0x00FFF000) != 0) {\r
- latinOneFailed_ = true;\r
- return false;\r
- }\r
-\r
- int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; //getContractionOffset(CE)]\r
-\r
- CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table\r
-\r
- latinOneCEs_[ch] = CE;\r
- latinOneCEs_[latinOneTableLen_+ch] = CE;\r
- latinOneCEs_[2*latinOneTableLen_+ch] = CE;\r
-\r
- // We're going to jump into contraction table, pick the elements\r
- // and use them\r
- do {\r
- //CE = *(contractionCEs + (UCharOffset - contractionIndex));\r
- CE = m_contractionCE_[UCharOffset];\r
- if(isSpecial(CE) \r
- && getTag(CE) \r
- == CollationElementIterator.CE_EXPANSION_TAG_) {\r
- int i; /* general counter */\r
- //uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to expansion table */\r
- int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE);\r
- int size = CE & 0xF; // getExpansionCount(CE);\r
- //CE = *CEOffset++;\r
- if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */\r
- for(i = 0; i<size; i++) {\r
- if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {\r
- latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;\r
- latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;\r
- latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;\r
- break;\r
- }\r
- addLatinOneEntry(contractionOffset, m_expansion_[offset+i], s);\r
- }\r
- } else { /* else, we do */\r
- while(m_expansion_[offset] != 0) {\r
- if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {\r
- latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;\r
- latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;\r
- latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;\r
- break;\r
- }\r
- addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);\r
- }\r
- }\r
- contractionOffset++;\r
- } else if(!isSpecial(CE)) {\r
- addLatinOneEntry(contractionOffset++, CE, s);\r
- } else {\r
- latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;\r
- latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;\r
- latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;\r
- contractionOffset++;\r
- }\r
- UCharOffset++;\r
- s.primShift = 24; s.secShift = 24; s.terShift = 24;\r
- if(contractionOffset == latinOneTableLen_) { // we need to reallocate\r
- resizeLatinOneTable(2*latinOneTableLen_);\r
- }\r
- } while(m_contractionIndex_[UCharOffset] != 0xFFFF);\r
- }\r
- break;\r
- case CollationElementIterator.CE_SPEC_PROC_TAG_:\r
- {\r
- // 0xB7 is a precontext character defined in UCA5.1, a special\r
- // handle is implemeted in order to save LatinOne table for\r
- // most locales.\r
- if (ch == 0xb7) {\r
- addLatinOneEntry(ch, CE, s);\r
- }\r
- else {\r
- latinOneFailed_ = true;\r
- return false;\r
- }\r
- }\r
- break;\r
- default:\r
- latinOneFailed_ = true;\r
- return false;\r
- }\r
- }\r
- }\r
- // compact table\r
- if(contractionOffset < latinOneTableLen_) {\r
- resizeLatinOneTable(contractionOffset);\r
- }\r
- return true;\r
- }\r
-\r
- private class ContractionInfo {\r
- int index;\r
- }\r
-\r
- ContractionInfo m_ContInfo_;\r
-\r
- private int\r
- getLatinOneContraction(int strength, int CE, String s) {\r
- //int strength, int CE, String s, Integer ind) {\r
- int len = s.length();\r
- //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);\r
- int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;\r
- int offset = 1;\r
- int latinOneOffset = (CE & 0x00FFF000) >>> 12;\r
- char schar = 0, tchar = 0;\r
-\r
- for(;;) {\r
- /*\r
- if(len == -1) {\r
- if(s[*index] == 0) { // end of string\r
- return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);\r
- } else {\r
- schar = s[*index];\r
- }\r
- } else {\r
- */\r
- if(m_ContInfo_.index == len) {\r
- return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);\r
- } else {\r
- schar = s.charAt(m_ContInfo_.index);\r
- }\r
- //}\r
-\r
- while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */\r
- offset++;\r
- }\r
-\r
- if (schar == tchar) {\r
- m_ContInfo_.index++;\r
- return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);\r
- }\r
- else\r
- {\r
- if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {\r
- return BAIL_OUT_CE_;\r
- }\r
- // skip completely ignorables\r
- int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);\r
- if(isZeroCE == 0) { // we have to ignore completely ignorables\r
- m_ContInfo_.index++;\r
- continue;\r
- }\r
-\r
- return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);\r
- }\r
- }\r
- }\r
-\r
-\r
- /**\r
- * This is a fast strcoll, geared towards text in Latin-1.\r
- * It supports contractions of size two, French secondaries\r
- * and case switching. You can use it with strengths primary\r
- * to tertiary. It does not support shifted and case level.\r
- * It relies on the table build by setupLatin1Table. If it\r
- * doesn't understand something, it will go to the regular\r
- * strcoll.\r
- */\r
- private final int\r
- compareUseLatin1(String source, String target, int startOffset)\r
- {\r
- int sLen = source.length();\r
- int tLen = target.length();\r
-\r
- int strength = getStrength();\r
-\r
- int sIndex = startOffset, tIndex = startOffset;\r
- char sChar = 0, tChar = 0;\r
- int sOrder=0, tOrder=0;\r
-\r
- boolean endOfSource = false;\r
-\r
- //uint32_t *elements = coll->latinOneCEs;\r
-\r
- boolean haveContractions = false; // if we have contractions in our string\r
- // we cannot do French secondary\r
-\r
- int offset = latinOneTableLen_;\r
-\r
- // Do the primary level\r
- primLoop:\r
- for(;;) {\r
- while(sOrder==0) { // this loop skips primary ignorables\r
- // sOrder=getNextlatinOneCE(source);\r
- if(sIndex==sLen) {\r
- endOfSource = true;\r
- break;\r
- }\r
- sChar=source.charAt(sIndex++); //[sIndex++];\r
- //}\r
- if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out\r
- //fprintf(stderr, "R");\r
- return compareRegular(source, target, startOffset);\r
- }\r
- sOrder = latinOneCEs_[sChar];\r
- if(isSpecial(sOrder)) { // if we got a special\r
- // specials can basically be either contractions or bail-out signs. If we get anything\r
- // else, we'll bail out anywasy\r
- if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {\r
- m_ContInfo_.index = sIndex;\r
- sOrder = getLatinOneContraction(0, sOrder, source);\r
- sIndex = m_ContInfo_.index;\r
- haveContractions = true; // if there are contractions, we cannot do French secondary\r
- // However, if there are contractions in the table, but we always use just one char,\r
- // we might be able to do French. This should be checked out.\r
- }\r
- if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {\r
- //fprintf(stderr, "S");\r
- return compareRegular(source, target, startOffset);\r
- }\r
- }\r
- }\r
-\r
- while(tOrder==0) { // this loop skips primary ignorables\r
- // tOrder=getNextlatinOneCE(target);\r
- if(tIndex==tLen) {\r
- if(endOfSource) {\r
- break primLoop;\r
- } else {\r
- return 1;\r
- }\r
- }\r
- tChar=target.charAt(tIndex++); //[tIndex++];\r
- if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out\r
- //fprintf(stderr, "R");\r
- return compareRegular(source, target, startOffset);\r
- }\r
- tOrder = latinOneCEs_[tChar];\r
- if(isSpecial(tOrder)) {\r
- // Handling specials, see the comments for source\r
- if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {\r
- m_ContInfo_.index = tIndex;\r
- tOrder = getLatinOneContraction(0, tOrder, target);\r
- tIndex = m_ContInfo_.index;\r
- haveContractions = true;\r
- }\r
- if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {\r
- //fprintf(stderr, "S");\r
- return compareRegular(source, target, startOffset);\r
- }\r
- }\r
- }\r
- if(endOfSource) { // source is finished, but target is not, say the result.\r
- return -1;\r
- }\r
-\r
- if(sOrder == tOrder) { // if we have same CEs, we continue the loop\r
- sOrder = 0; tOrder = 0;\r
- continue;\r
- } else {\r
- // compare current top bytes\r
- if(((sOrder^tOrder)&0xFF000000)!=0) {\r
- // top bytes differ, return difference\r
- if(sOrder >>> 8 < tOrder >>> 8) {\r
- return -1;\r
- } else {\r
- return 1;\r
- }\r
- // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);\r
- // since we must return enum value\r
- }\r
-\r
- // top bytes match, continue with following bytes\r
- sOrder<<=8;\r
- tOrder<<=8;\r
- }\r
- }\r
-\r
- // after primary loop, we definitely know the sizes of strings,\r
- // so we set it and use simpler loop for secondaries and tertiaries\r
- //sLen = sIndex; tLen = tIndex;\r
- if(strength >= SECONDARY) {\r
- // adjust the table beggining\r
- //latinOneCEs_ += coll->latinOneTableLen;\r
- endOfSource = false;\r
-\r
- if(!m_isFrenchCollation_) { // non French\r
- // This loop is a simplified copy of primary loop\r
- // at this point we know that whole strings are latin-1, so we don't\r
- // check for that. We also know that we only have contractions as\r
- // specials.\r
- //sIndex = 0; tIndex = 0;\r
- sIndex = startOffset; tIndex = startOffset;\r
- secLoop:\r
- for(;;) {\r
- while(sOrder==0) {\r
- if(sIndex==sLen) {\r
- endOfSource = true;\r
- break;\r
- }\r
- sChar=source.charAt(sIndex++); //[sIndex++];\r
- sOrder = latinOneCEs_[offset+sChar];\r
- if(isSpecial(sOrder)) {\r
- m_ContInfo_.index = sIndex;\r
- sOrder = getLatinOneContraction(1, sOrder, source);\r
- sIndex = m_ContInfo_.index;\r
- }\r
- }\r
-\r
- while(tOrder==0) {\r
- if(tIndex==tLen) {\r
- if(endOfSource) {\r
- break secLoop;\r
- } else {\r
- return 1;\r
- }\r
- }\r
- tChar=target.charAt(tIndex++); //[tIndex++];\r
- tOrder = latinOneCEs_[offset+tChar];\r
- if(isSpecial(tOrder)) {\r
- m_ContInfo_.index = tIndex;\r
- tOrder = getLatinOneContraction(1, tOrder, target);\r
- tIndex = m_ContInfo_.index;\r
- }\r
- }\r
- if(endOfSource) {\r
- return -1;\r
- }\r
-\r
- if(sOrder == tOrder) {\r
- sOrder = 0; tOrder = 0;\r
- continue;\r
- } else {\r
- // see primary loop for comments on this\r
- if(((sOrder^tOrder)&0xFF000000)!=0) {\r
- if(sOrder >>> 8 < tOrder >>> 8) {\r
- return -1;\r
- } else {\r
- return 1;\r
- }\r
- }\r
- sOrder<<=8;\r
- tOrder<<=8;\r
- }\r
- }\r
- } else { // French\r
- if(haveContractions) { // if we have contractions, we have to bail out\r
- // since we don't really know how to handle them here\r
- return compareRegular(source, target, startOffset);\r
- }\r
- // For French, we go backwards\r
- sIndex = sLen; tIndex = tLen;\r
- secFLoop:\r
- for(;;) {\r
- while(sOrder==0) {\r
- if(sIndex==startOffset) {\r
- endOfSource = true;\r
- break;\r
- }\r
- sChar=source.charAt(--sIndex); //[--sIndex];\r
- sOrder = latinOneCEs_[offset+sChar];\r
- // don't even look for contractions\r
- }\r
-\r
- while(tOrder==0) {\r
- if(tIndex==startOffset) {\r
- if(endOfSource) {\r
- break secFLoop;\r
- } else {\r
- return 1;\r
- }\r
- }\r
- tChar=target.charAt(--tIndex); //[--tIndex];\r
- tOrder = latinOneCEs_[offset+tChar];\r
- // don't even look for contractions\r
- }\r
- if(endOfSource) {\r
- return -1;\r
- }\r
-\r
- if(sOrder == tOrder) {\r
- sOrder = 0; tOrder = 0;\r
- continue;\r
- } else {\r
- // see the primary loop for comments\r
- if(((sOrder^tOrder)&0xFF000000)!=0) {\r
- if(sOrder >>> 8 < tOrder >>> 8) {\r
- return -1;\r
- } else {\r
- return 1;\r
- }\r
- }\r
- sOrder<<=8;\r
- tOrder<<=8;\r
- }\r
- }\r
- }\r
- }\r
-\r
- if(strength >= TERTIARY) {\r
- // tertiary loop is the same as secondary (except no French)\r
- offset += latinOneTableLen_;\r
- //sIndex = 0; tIndex = 0;\r
- sIndex = startOffset; tIndex = startOffset;\r
- endOfSource = false;\r
- for(;;) {\r
- while(sOrder==0) {\r
- if(sIndex==sLen) {\r
- endOfSource = true;\r
- break;\r
- }\r
- sChar=source.charAt(sIndex++); //[sIndex++];\r
- sOrder = latinOneCEs_[offset+sChar];\r
- if(isSpecial(sOrder)) {\r
- m_ContInfo_.index = sIndex;\r
- sOrder = getLatinOneContraction(2, sOrder, source);\r
- sIndex = m_ContInfo_.index;\r
- }\r
- }\r
- while(tOrder==0) {\r
- if(tIndex==tLen) {\r
- if(endOfSource) {\r
- return 0; // if both strings are at the end, they are equal\r
- } else {\r
- return 1;\r
- }\r
- }\r
- tChar=target.charAt(tIndex++); //[tIndex++];\r
- tOrder = latinOneCEs_[offset+tChar];\r
- if(isSpecial(tOrder)) {\r
- m_ContInfo_.index = tIndex;\r
- tOrder = getLatinOneContraction(2, tOrder, target);\r
- tIndex = m_ContInfo_.index;\r
- }\r
- }\r
- if(endOfSource) {\r
- return -1;\r
- }\r
- if(sOrder == tOrder) {\r
- sOrder = 0; tOrder = 0;\r
- continue;\r
- } else {\r
- if(((sOrder^tOrder)&0xff000000)!=0) {\r
- if(sOrder >>> 8 < tOrder >>> 8) {\r
- return -1;\r
- } else {\r
- return 1;\r
- }\r
- }\r
- sOrder<<=8;\r
- tOrder<<=8;\r
- }\r
- }\r
- }\r
- return 0;\r
- }\r
- /** \r
- * Get the version of this collator object.\r
- * @return the version object associated with this collator\r
- * @stable ICU 2.8\r
- */\r
- public VersionInfo getVersion() {\r
- /* RunTime version */\r
- int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();\r
- /* Builder version*/\r
- int bdVersion = m_version_.getMajor();\r
-\r
- /* Charset Version. Need to get the version from cnv files\r
- * makeconv should populate cnv files with version and\r
- * an api has to be provided in ucnv.h to obtain this version\r
- */\r
- int csVersion = 0;\r
-\r
- /* combine the version info */\r
- int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF;\r
- \r
- /* Tailoring rules */\r
- return VersionInfo.getInstance(cmbVersion>>8, \r
- cmbVersion & 0xFF, \r
- m_version_.getMinor(), \r
- UCA_.m_UCA_version_.getMajor());\r
-\r
-// versionInfo[0] = (uint8_t)(cmbVersion>>8);\r
-// versionInfo[1] = (uint8_t)cmbVersion;\r
-// versionInfo[2] = coll->image->version[1];\r
-// versionInfo[3] = coll->UCA->image->UCAVersion[0];\r
- }\r
- \r
- /** \r
- * Get the UCA version of this collator object.\r
- * @return the version object associated with this collator\r
- * @stable ICU 2.8\r
- */\r
- public VersionInfo getUCAVersion() {\r
- return UCA_.m_UCA_version_;\r
- }\r
-\r
- private transient boolean m_reallocLatinOneCEs_;\r
-}\r
+//##header J2SE15
+/**
+*******************************************************************************
+* Copyright (C) 1996-2009, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+import java.io.IOException;
+import java.text.CharacterIterator;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.MissingResourceException;
+
+//#if defined(FOUNDATION10) || defined(J2SE13) || defined(ECLIPSE_FRAGMENT)
+//##import com.ibm.icu.impl.ByteBuffer;
+//#else
+import java.nio.ByteBuffer;
+//#endif
+
+import com.ibm.icu.impl.BOCU;
+import com.ibm.icu.impl.ICUDebug;
+import com.ibm.icu.impl.ICUResourceBundle;
+import com.ibm.icu.impl.ImplicitCEGenerator;
+import com.ibm.icu.impl.IntTrie;
+import com.ibm.icu.impl.StringUCharacterIterator;
+import com.ibm.icu.impl.Trie;
+import com.ibm.icu.impl.TrieIterator;
+import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.util.RangeValueIterator;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.UResourceBundle;
+import com.ibm.icu.util.VersionInfo;
+
+/**
+ * <p>RuleBasedCollator is a concrete subclass of Collator. It allows
+ * customization of the Collator via user-specified rule sets.
+ * RuleBasedCollator is designed to be fully compliant to the <a
+ * href="http://www.unicode.org/unicode/reports/tr10/">Unicode
+ * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>
+ *
+ * <p>Users are strongly encouraged to read <a
+ * href="http://www.icu-project.org/userguide/Collate_Intro.html">
+ * the users guide</a> for more information about the collation
+ * service before using this class.</p>
+ *
+ * <p>Create a RuleBasedCollator from a locale by calling the
+ * getInstance(Locale) factory method in the base class Collator.
+ * Collator.getInstance(Locale) creates a RuleBasedCollator object
+ * based on the collation rules defined by the argument locale. If a
 * customized collation ordering or attributes is required, use the
+ * RuleBasedCollator(String) constructor with the appropriate
+ * rules. The customized RuleBasedCollator will base its ordering on
+ * UCA, while re-adjusting the attributes and orders of the characters
+ * in the specified rule accordingly.</p>
+ *
+ * <p>RuleBasedCollator provides correct collation orders for most
+ * locales supported in ICU. If specific data for a locale is not
 * available, the orders eventually fall back to the <a
+ * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation
+ * order </a>.</p>
+ *
+ * <p>For information about the collation rule syntax and details
+ * about customization, please refer to the
+ * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
+ * Collation customization</a> section of the user's guide.</p>
+ *
+ * <p><strong>Note</strong> that there are some differences between
+ * the Collation rule syntax used in Java and ICU4J:
+ *
+ * <ul>
+ * <li>According to the JDK documentation:
+ * <i>
+ * <p>
+ * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule
+ * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a
+ * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the
+ * range \U0EC0-\U0EC4 precedes a Lao consonant of the range
+ * \U0E81-\U0EAE then the
+ * vowel is placed after the consonant for collation purposes.
+ * </p>
+ * <p>
+ * If a rule is without the modifier '!', the Thai/Lao vowel-consonant
+ * swapping is not turned on.
+ * </p>
+ * </i>
+ * <p>
+ * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao
+ * vowel-consonant swapping, since the UCA clearly states that it has to be
+ * supported to ensure a correct sorting order. If a '!' is encountered, it is
+ * ignored.
+ * </p>
+ * <li>As mentioned in the documentation of the base class Collator,
+ * compatibility decomposition mode is not supported.
+ * </ul>
+ * <p>
+ * <strong>Examples</strong>
+ * </p>
+ * <p>
+ * Creating Customized RuleBasedCollators:
+ * <blockquote>
+ * <pre>
+ * String simple = "& a < b < c < d";
+ * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
+ *
+ * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
+ * + "< f , F < g , G < h , H < i , I < j , "
+ * + "J < k , K < l , L < m , M < n , N < "
+ * + "o , O < p , P < q , Q < r , R < s , S < "
+ * + "t , T < u , U < v , V < w , W < x , X "
+ * + "< y , Y < z , Z < \u00E5 = a\u030A "
+ * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
+ * + ", \u00C6 < \u00F8 , \u00D8";
+ * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
+ * </pre>
+ * </blockquote>
+ *
+ * Concatenating rules to combine <code>Collator</code>s:
+ * <blockquote>
+ * <pre>
+ * // Create an en_US Collator object
+ * RuleBasedCollator en_USCollator = (RuleBasedCollator)
+ * Collator.getInstance(new Locale("en", "US", ""));
+ * // Create a da_DK Collator object
+ * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
+ * Collator.getInstance(new Locale("da", "DK", ""));
+ * // Combine the two
+ * // First, get the collation rules from en_USCollator
+ * String en_USRules = en_USCollator.getRules();
+ * // Second, get the collation rules from da_DKCollator
+ * String da_DKRules = da_DKCollator.getRules();
+ * RuleBasedCollator newCollator =
+ * new RuleBasedCollator(en_USRules + da_DKRules);
+ * // newCollator has the combined rules
+ * </pre>
+ * </blockquote>
+ *
+ * Making changes to an existing RuleBasedCollator to create a new
+ * <code>Collator</code> object, by appending changes to the existing rule:
+ * <blockquote>
+ * <pre>
+ * // Create a new Collator object with additional rules
+ * String addRules = "& C < ch, cH, Ch, CH";
+ * RuleBasedCollator myCollator =
+ * new RuleBasedCollator(en_USCollator.getRules() + addRules);
+ * // myCollator contains the new rules
+ * </pre>
+ * </blockquote>
+ *
+ * How to change the order of non-spacing accents:
+ * <blockquote>
+ * <pre>
+ * // old rule with main accents
+ * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
+ * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
+ * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
+ * + "; \u030B ; \u030C ; \u030D ; \u030E "
+ * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
+ * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
+ * + "< b , B < c, C < e, E & C < d , D";
+ * // change the order of accent characters
+ * String addOn = "& \u0300 ; \u0308 ; \u0302";
+ * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
+ * </pre>
+ * </blockquote>
+ *
+ * Putting in a new primary ordering before the default setting,
+ * e.g. sort English characters before or after Japanese characters in the Japanese
+ * <code>Collator</code>:
+ * <blockquote>
+ * <pre>
+ * // get en_US Collator rules
+ * RuleBasedCollator en_USCollator
+ * = (RuleBasedCollator)Collator.getInstance(Locale.US);
+ * // add a few Japanese characters to sort before English characters
+ * // suppose the last character before the first base letter 'a' in
+ * // the English collation rule is \u2212
+ * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
+ * + "\u3044";
+ * RuleBasedCollator myJapaneseCollator
+ * = new RuleBasedCollator(en_USCollator.getRules() + jaString);
+ * </pre>
+ * </blockquote>
+ * </p>
+ * <p>
+ * This class is not subclassable
+ * </p>
+ * @author Syn Wee Quek
+ * @stable ICU 2.8
+ */
+public final class RuleBasedCollator extends Collator
+{
+ // public constructors ---------------------------------------------------
+
+ /**
+ * <p>
+ * Constructor that takes the argument rules for
+ * customization. The collator will be based on UCA,
+ * with the attributes and re-ordering of the characters specified in the
+ * argument rules.
+ * </p>
+ * <p>See the user guide's section on
+ * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
+ * Collation Customization</a> for details on the rule syntax.
+ * </p>
+ * @param rules the collation rules to build the collation table from.
+ * @exception ParseException and IOException thrown. ParseException thrown
+ * when argument rules have an invalid syntax. IOException
 * thrown when an error occurred while reading internal data.
+ * @stable ICU 2.8
+ */
+ public RuleBasedCollator(String rules) throws Exception
+ {
+ checkUCA();
+ if (rules == null) {
+ throw new IllegalArgumentException(
+ "Collation rules can not be null");
+ }
+ init(rules);
+ }
+
+ // public methods --------------------------------------------------------
+
+ /**
+ * Clones the RuleBasedCollator
+ * @return a new instance of this RuleBasedCollator object
+ * @stable ICU 2.8
+ */
+ public Object clone() throws CloneNotSupportedException
+ {
+ RuleBasedCollator result = (RuleBasedCollator)super.clone();
+ if (latinOneCEs_ != null) {
+ result.m_reallocLatinOneCEs_ = true;
+ result.m_ContInfo_ = new ContractionInfo();
+ }
+
+ // since all collation data in the RuleBasedCollator do not change
+ // we can safely assign the result.fields to this collator
+ result.initUtility(false); // let the new clone have their own util
+ // iterators
+ return result;
+ }
+
+ /**
+ * Return a CollationElementIterator for the given String.
+ * @see CollationElementIterator
+ * @stable ICU 2.8
+ */
+ public CollationElementIterator getCollationElementIterator(String source)
+ {
+ return new CollationElementIterator(source, this);
+ }
+
+ /**
+ * Return a CollationElementIterator for the given CharacterIterator.
+ * The source iterator's integrity will be preserved since a new copy
+ * will be created for use.
+ * @see CollationElementIterator
+ * @stable ICU 2.8
+ */
+ public CollationElementIterator getCollationElementIterator(
+ CharacterIterator source)
+ {
+ CharacterIterator newsource = (CharacterIterator)source.clone();
+ return new CollationElementIterator(newsource, this);
+ }
+
+ /**
+ * Return a CollationElementIterator for the given UCharacterIterator.
+ * The source iterator's integrity will be preserved since a new copy
+ * will be created for use.
+ * @see CollationElementIterator
+ * @stable ICU 2.8
+ */
+ public CollationElementIterator getCollationElementIterator(
+ UCharacterIterator source)
+ {
+ return new CollationElementIterator(source, this);
+ }
+
+ // public setters --------------------------------------------------------
+
+ /**
+ * Sets the Hiragana Quaternary mode to be on or off.
+ * When the Hiragana Quaternary mode is turned on, the collator
+ * positions Hiragana characters before all non-ignorable characters in
+ * QUATERNARY strength. This is to produce a correct JIS collation order,
+ * distinguishing between Katakana and Hiragana characters.
+ * @param flag true if Hiragana Quaternary mode is to be on, false
+ * otherwise
+ * @see #setHiraganaQuaternaryDefault
+ * @see #isHiraganaQuaternary
+ * @stable ICU 2.8
+ */
+ public void setHiraganaQuaternary(boolean flag)
+ {
+ m_isHiragana4_ = flag;
+ updateInternalState();
+ }
+
+ /**
+ * Sets the Hiragana Quaternary mode to the initial mode set during
+ * construction of the RuleBasedCollator.
+ * See setHiraganaQuaternary(boolean) for more details.
+ * @see #setHiraganaQuaternary(boolean)
+ * @see #isHiraganaQuaternary
+ * @stable ICU 2.8
+ */
+ public void setHiraganaQuaternaryDefault()
+ {
+ m_isHiragana4_ = m_defaultIsHiragana4_;
+ updateInternalState();
+ }
+
+ /**
+ * Sets whether uppercase characters sort before lowercase
+ * characters or vice versa, in strength TERTIARY. The default
+ * mode is false, and so lowercase characters sort before uppercase
+ * characters.
+ * If true, sort upper case characters first.
+ * @param upperfirst true to sort uppercase characters before
+ * lowercase characters, false to sort lowercase
+ * characters before uppercase characters
+ * @see #isLowerCaseFirst
+ * @see #isUpperCaseFirst
+ * @see #setLowerCaseFirst
+ * @see #setCaseFirstDefault
+ * @stable ICU 2.8
+ */
+ public void setUpperCaseFirst(boolean upperfirst)
+ {
+ if (upperfirst) {
+ if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
+ latinOneRegenTable_ = true;
+ }
+ m_caseFirst_ = AttributeValue.UPPER_FIRST_;
+ }
+ else {
+ if(m_caseFirst_ != AttributeValue.OFF_) {
+ latinOneRegenTable_ = true;
+ }
+ m_caseFirst_ = AttributeValue.OFF_;
+ }
+ updateInternalState();
+ }
+
+ /**
+ * Sets the orders of lower cased characters to sort before upper cased
+ * characters, in strength TERTIARY. The default
+ * mode is false.
+ * If true is set, the RuleBasedCollator will sort lower cased characters
+ * before the upper cased ones.
+ * Otherwise, if false is set, the RuleBasedCollator will ignore case
+ * preferences.
+ * @param lowerfirst true for sorting lower cased characters before
+ * upper cased characters, false to ignore case
+ * preferences.
+ * @see #isLowerCaseFirst
+ * @see #isUpperCaseFirst
+ * @see #setUpperCaseFirst
+ * @see #setCaseFirstDefault
+ * @stable ICU 2.8
+ */
+ public void setLowerCaseFirst(boolean lowerfirst)
+ {
+ if (lowerfirst) {
+ if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
+ latinOneRegenTable_ = true;
+ }
+ m_caseFirst_ = AttributeValue.LOWER_FIRST_;
+ }
+ else {
+ if(m_caseFirst_ != AttributeValue.OFF_) {
+ latinOneRegenTable_ = true;
+ }
+ m_caseFirst_ = AttributeValue.OFF_;
+ }
+ updateInternalState();
+ }
+
+ /**
+ * Sets the case first mode to the initial mode set during
+ * construction of the RuleBasedCollator.
+ * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
+ * details.
+ * @see #isLowerCaseFirst
+ * @see #isUpperCaseFirst
+ * @see #setLowerCaseFirst(boolean)
+ * @see #setUpperCaseFirst(boolean)
+ * @stable ICU 2.8
+ */
+ public final void setCaseFirstDefault()
+ {
+ if(m_caseFirst_ != m_defaultCaseFirst_) {
+ latinOneRegenTable_ = true;
+ }
+ m_caseFirst_ = m_defaultCaseFirst_;
+ updateInternalState();
+ }
+
+ /**
+ * Sets the alternate handling mode to the initial mode set during
+ * construction of the RuleBasedCollator.
+ * See setAlternateHandling(boolean) for more details.
+ * @see #setAlternateHandlingShifted(boolean)
+ * @see #isAlternateHandlingShifted()
+ * @stable ICU 2.8
+ */
+ public void setAlternateHandlingDefault()
+ {
+ m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
+ updateInternalState();
+ }
+
+ /**
+ * Sets the case level mode to the initial mode set during
+ * construction of the RuleBasedCollator.
+ * See setCaseLevel(boolean) for more details.
+ * @see #setCaseLevel(boolean)
+ * @see #isCaseLevel
+ * @stable ICU 2.8
+ */
+ public void setCaseLevelDefault()
+ {
+ m_isCaseLevel_ = m_defaultIsCaseLevel_;
+ updateInternalState();
+ }
+
+ /**
+ * Sets the decomposition mode to the initial mode set during construction
+ * of the RuleBasedCollator.
+ * See setDecomposition(int) for more details.
+ * @see #getDecomposition
+ * @see #setDecomposition(int)
+ * @stable ICU 2.8
+ */
+ public void setDecompositionDefault()
+ {
+ setDecomposition(m_defaultDecomposition_);
+ updateInternalState();
+ }
+
+ /**
+ * Sets the French collation mode to the initial mode set during
+ * construction of the RuleBasedCollator.
+ * See setFrenchCollation(boolean) for more details.
+ * @see #isFrenchCollation
+ * @see #setFrenchCollation(boolean)
+ * @stable ICU 2.8
+ */
+ public void setFrenchCollationDefault()
+ {
+ if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
+ latinOneRegenTable_ = true;
+ }
+ m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
+ updateInternalState();
+ }
+
+ /**
+ * Sets the collation strength to the initial mode set during the
+ * construction of the RuleBasedCollator.
+ * See setStrength(int) for more details.
+ * @see #setStrength(int)
+ * @see #getStrength
+ * @stable ICU 2.8
+ */
+ public void setStrengthDefault()
+ {
+ setStrength(m_defaultStrength_);
+ updateInternalState();
+ }
+
+ /**
+ * Method to set numeric collation to its default value.
+ * When numeric collation is turned on, this Collator generates a collation
+ * key for the numeric value of substrings of digits. This is a way to get
+ * '100' to sort AFTER '2'
+ * @see #getNumericCollation
+ * @see #setNumericCollation
+ * @stable ICU 2.8
+ */
+ public void setNumericCollationDefault()
+ {
+ setNumericCollation(m_defaultIsNumericCollation_);
+ updateInternalState();
+ }
+
+ /**
+ * Sets the mode for the direction of SECONDARY weights to be used in
+ * French collation.
+ * The default value is false, which treats SECONDARY weights in the order
+ * they appear.
+ * If set to true, the SECONDARY weights will be sorted backwards.
+ * See the section on
+ * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
+ * French collation</a> for more information.
+ * @param flag true to set the French collation on, false to set it off
+ * @stable ICU 2.8
+ * @see #isFrenchCollation
+ * @see #setFrenchCollationDefault
+ */
+ public void setFrenchCollation(boolean flag)
+ {
+ if(m_isFrenchCollation_ != flag) {
+ latinOneRegenTable_ = true;
+ }
+ m_isFrenchCollation_ = flag;
+ updateInternalState();
+ }
+
+ /**
+ * Sets the alternate handling for QUATERNARY strength to be either
+ * shifted or non-ignorable.
+ * See the UCA definition on
+ * <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">
+ * Alternate Weighting</a>.
+ * This attribute will only be effective when QUATERNARY strength is set.
+ * The default value for this mode is false, corresponding to the
+ * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the
+ * RuleBasedCollator will treats all the codepoints with non-ignorable
+ * primary weights in the same way.
+ * If the mode is set to true, the behaviour corresponds to SHIFTED defined
+ * in UCA, this causes codepoints with PRIMARY orders that are equal or
+ * below the variable top value to be ignored in PRIMARY order and
+ * moved to the QUATERNARY order.
+ * @param shifted true if SHIFTED behaviour for alternate handling is
+ * desired, false for the NON_IGNORABLE behaviour.
+ * @see #isAlternateHandlingShifted
+ * @see #setAlternateHandlingDefault
+ * @stable ICU 2.8
+ */
+ public void setAlternateHandlingShifted(boolean shifted)
+ {
+ m_isAlternateHandlingShifted_ = shifted;
+ updateInternalState();
+ }
+
+ /**
+ * <p>
+ * When case level is set to true, an additional weight is formed
+ * between the SECONDARY and TERTIARY weight, known as the case level.
+ * The case level is used to distinguish large and small Japanese Kana
+ * characters. Case level could also be used in other situations.
+ * For example to distinguish certain Pinyin characters.
+ * The default value is false, which means the case level is not generated.
+ * The contents of the case level are affected by the case first
+ * mode. A simple way to ignore accent differences in a string is to set
+ * the strength to PRIMARY and enable case level.
+ * </p>
+ * <p>
+ * See the section on
+ * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
+ * case level</a> for more information.
+ * </p>
+ * @param flag true if case level sorting is required, false otherwise
+ * @stable ICU 2.8
+ * @see #setCaseLevelDefault
+ * @see #isCaseLevel
+ */
+ public void setCaseLevel(boolean flag)
+ {
+ m_isCaseLevel_ = flag;
+ updateInternalState();
+ }
+
+ /**
+ * <p>
+ * Sets this Collator's strength property. The strength property
+ * determines the minimum level of difference considered significant
+ * during comparison.
+ * </p>
+ * <p>See the Collator class description for an example of use.</p>
+ * @param newStrength the new strength value.
+ * @see #getStrength
+ * @see #setStrengthDefault
+ * @see #PRIMARY
+ * @see #SECONDARY
+ * @see #TERTIARY
+ * @see #QUATERNARY
+ * @see #IDENTICAL
+ * @exception IllegalArgumentException If the new strength value is not one
+ * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
+ * @stable ICU 2.8
+ */
+ public void setStrength(int newStrength)
+ {
+ super.setStrength(newStrength);
+ updateInternalState();
+ }
+
    /**
     * <p>
     * Variable top is a two byte primary value which causes all the codepoints
     * with primary values that are less or equal than the variable top to be
     * shifted when alternate handling is set to SHIFTED.
     * </p>
     * <p>
     * Sets the variable top to a collation element value of a string supplied.
     * </p>
     * @param varTop one or more (if contraction) characters to which the
     *               variable top should be set
     * @return a int value containing the value of the variable top in upper 16
     *         bits. Lower 16 bits are undefined.
     * @exception IllegalArgumentException is thrown if varTop argument is not
     *            a valid variable top element. A variable top element is
     *            invalid when
     *            <ul>
     *            <li>it is a contraction that does not exist in the
     *                Collation order
     *            <li>when the PRIMARY strength collation element for the
     *                variable top has more than two bytes
     *            <li>when the varTop argument is null or zero in length.
     *            </ul>
     * @see #getVariableTop
     * @see RuleBasedCollator#setAlternateHandlingShifted
     * @stable ICU 2.6
     */
    public int setVariableTop(String varTop)
    {
        if (varTop == null || varTop.length() == 0) {
            throw new IllegalArgumentException(
                "Variable top argument string can not be null or zero in length.");
        }
        // Lazily create the shared utility iterators on first use.
        if (m_srcUtilIter_ == null) {
            initUtility(true);
        }

        m_srcUtilColEIter_.setText(varTop);
        int ce = m_srcUtilColEIter_.next();

        // here we check if we have consumed all characters
        // you can put in either one character or a contraction
        // you shouldn't put more...
        if (m_srcUtilColEIter_.getOffset() != varTop.length()
            || ce == CollationElementIterator.NULLORDER) {
            throw new IllegalArgumentException(
                "Variable top argument string is a contraction that does not exist "
                + "in the Collation order");
        }

        int nextCE = m_srcUtilColEIter_.next();

        // A second collation element is only acceptable when it is a
        // continuation with an all-zero primary, i.e. the whole primary
        // weight fits in the first CE's two bytes.
        if ((nextCE != CollationElementIterator.NULLORDER)
            && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
            throw new IllegalArgumentException(
                "Variable top argument string can only have a single collation "
                + "element that has less than or equal to two PRIMARY strength "
                + "bytes");
        }

        // Store only the upper 16 bits of the primary weight internally.
        m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;

        return ce & CE_PRIMARY_MASK_;
    }
+
+ /**
+ * Sets the variable top to a collation element value supplied.
+ * Variable top is set to the upper 16 bits.
+ * Lower 16 bits are ignored.
+ * @param varTop Collation element value, as returned by setVariableTop or
+ * getVariableTop
+ * @see #getVariableTop
+ * @see #setVariableTop(String)
+ * @stable ICU 2.6
+ */
+ public void setVariableTop(int varTop)
+ {
+ m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
+ }
+
+ /**
+ * When numeric collation is turned on, this Collator generates a collation
+ * key for the numeric value of substrings of digits. This is a way to get
+ * '100' to sort AFTER '2'
+ * @param flag true to turn numeric collation on and false to turn it off
+ * @see #getNumericCollation
+ * @see #setNumericCollationDefault
+ * @stable ICU 2.8
+ */
+ public void setNumericCollation(boolean flag)
+ {
+ // sort substrings of digits as numbers
+ m_isNumericCollation_ = flag;
+ updateInternalState();
+ }
+
+ // public getters --------------------------------------------------------
+
+ /**
+ * Gets the collation rules for this RuleBasedCollator.
+ * Equivalent to String getRules(RuleOption.FULL_RULES).
+ * @return returns the collation rules
+ * @see #getRules(boolean)
+ * @stable ICU 2.8
+ */
+ public String getRules()
+ {
+ return m_rules_;
+ }
+
+ /**
+ * Returns current rules. The argument defines whether full rules
+ * (UCA + tailored) rules are returned or just the tailoring.
+ * @param fullrules true if the rules that defines the full set of
+ * collation order is required, otherwise false for returning only
+ * the tailored rules
+ * @return the current rules that defines this Collator.
+ * @see #getRules()
+ * @stable ICU 2.6
+ */
+ public String getRules(boolean fullrules)
+ {
+ if (!fullrules) {
+ return m_rules_;
+ }
+ // take the UCA rules and append real rules at the end
+ return UCA_.m_rules_.concat(m_rules_);
+ }
+
+ /**
+ * Get an UnicodeSet that contains all the characters and sequences
+ * tailored in this collator.
+ * @return a pointer to a UnicodeSet object containing all the
+ * code points and sequences that may sort differently than
+ * in the UCA.
+ * @exception ParseException thrown when argument rules have an
+ * invalid syntax. IOException
+ * @stable ICU 2.4
+ */
+ public UnicodeSet getTailoredSet()
+ {
+ try {
+ CollationRuleParser src = new CollationRuleParser(getRules());
+ return src.getTailoredSet();
+ } catch(Exception e) {
+ throw new IllegalStateException("A tailoring rule should not " +
+ "have errors. Something is quite wrong!");
+ }
+ }
+
    // Parameter holder threaded through processSpecials()/addSpecial()
    // while collecting a collator's contraction and expansion strings.
    private class contContext {
        RuleBasedCollator coll;          // collator whose trie is being walked
        UnicodeSet contractions;         // out: contraction strings (may be null)
        UnicodeSet expansions;           // out: expansions (may be null)
        UnicodeSet removedContractions;  // contractions suppressed by the tailoring (may be null)
        boolean addPrefixes;             // also collect prefix (pre-context) elements
        contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
                UnicodeSet removedContractions, boolean addPrefixes) {
            this.coll = coll;
            this.contractions = contractions;
            this.expansions = expansions;
            this.removedContractions = removedContractions;
            this.addPrefixes = addPrefixes;
        }
    }
+
    /**
     * Recursively expands the contraction/prefix table entry addressed by a
     * special CE, adding every complete contraction string (and, when
     * c.addPrefixes, every prefix form) to c.contractions and, for expansion
     * CEs, to c.expansions.
     * @param c      collection context carrying the output sets and options
     * @param buffer the string assembled so far; prefix characters are
     *               inserted in front of it, contraction suffixes appended
     * @param CE     a special CE whose low 24 bits (minus the collator's
     *               contraction offset) index the contraction tables
     */
    private void
    addSpecial(contContext c, StringBuffer buffer, int CE)
    {
        StringBuffer b = new StringBuffer();
        int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
        int newCE = c.coll.m_contractionCE_[offset];
        // we might have a contraction that ends from previous level
        if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
            // A contraction ending here may itself open a prefix table.
            if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_
               && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_
               && c.addPrefixes) {
                addSpecial(c, buffer, newCE);
            }
            // Only strings longer than one codepoint are real contractions.
            if(buffer.length() > 1) {
                if(c.contractions != null) {
                    c.contractions.add(buffer.toString());
                }
                if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
                    c.expansions.add(buffer.toString());
                }
            }
        }

        offset++;
        // check whether we're doing contraction or prefix
        if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
            // Prefix table: each table character goes in FRONT of the buffer.
            // 0xFFFF terminates the table.
            while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
                b.delete(0, b.length());
                b.append(buffer);
                newCE = c.coll.m_contractionCE_[offset];
                b.insert(0, c.coll.m_contractionIndex_[offset]);
                if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
                    // Nested table: recurse with the extended string.
                    addSpecial(c, b, newCE);
                } else {
                    if(c.contractions != null) {
                        c.contractions.add(b.toString());
                    }
                    if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
                        c.expansions.add(b.toString());
                    }
                }
                offset++;
            }
        } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
            // Contraction table: each table character is APPENDED to the
            // buffer.  0xFFFF terminates the table.
            while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
                b.delete(0, b.length());
                b.append(buffer);
                newCE = c.coll.m_contractionCE_[offset];
                b.append(c.coll.m_contractionIndex_[offset]);
                if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
                    addSpecial(c, b, newCE);
                } else {
                    if(c.contractions != null) {
                        c.contractions.add(b.toString());
                    }
                    if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
                        c.expansions.add(b.toString());
                    }
                }
                offset++;
            }
        }
    }
+
+ private
+ void processSpecials(contContext c)
+ {
+ int internalBufferSize = 512;
+ TrieIterator trieiterator
+ = new TrieIterator(c.coll.m_trie_);
+ RangeValueIterator.Element element = new RangeValueIterator.Element();
+ while (trieiterator.next(element)) {
+ int start = element.start;
+ int limit = element.limit;
+ int CE = element.value;
+ StringBuffer contraction = new StringBuffer(internalBufferSize);
+
+ if(isSpecial(CE)) {
+ if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
+ while(start < limit) {
+ // if there are suppressed contractions, we don't
+ // want to add them.
+ if(c.removedContractions != null && c.removedContractions.contains(start)) {
+ start++;
+ continue;
+ }
+ // we start our contraction from middle, since we don't know if it
+ // will grow toward right or left
+ contraction.append((char) start);
+ addSpecial(c, contraction, CE);
+ start++;
+ }
+ } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
+ while(start < limit) {
+ c.expansions.add(start++);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Gets unicode sets containing contractions and/or expansions of a collator
+ * @param contractions if not null, set to contain contractions
+ * @param expansions if not null, set to contain expansions
+ * @param addPrefixes add the prefix contextual elements to contractions
+ * @throws Exception
+ * @stable ICU 3.4
+ */
+ public void
+ getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,
+ boolean addPrefixes) throws Exception {
+ if(contractions != null) {
+ contractions.clear();
+ }
+ if(expansions != null) {
+ expansions.clear();
+ }
+ String rules = getRules();
+ try {
+ CollationRuleParser src = new CollationRuleParser(rules);
+ contContext c = new contContext(RuleBasedCollator.UCA_,
+ contractions, expansions, src.m_removeSet_, addPrefixes);
+
+ // Add the UCA contractions
+ processSpecials(c);
+ // This is collator specific. Add contractions from a collator
+ c.coll = this;
+ c.removedContractions = null;
+ processSpecials(c);
+ } catch (Exception e) {
+ throw e;
+ }
+ }
+
+ /**
+ * <p>
+ * Get a Collation key for the argument String source from this
+ * RuleBasedCollator.
+ * </p>
+ * <p>
+ * General recommendation: <br>
+ * If comparison are to be done to the same String multiple times, it would
+ * be more efficient to generate CollationKeys for the Strings and use
+ * CollationKey.compareTo(CollationKey) for the comparisons.
+ * If the each Strings are compared to only once, using the method
+ * RuleBasedCollator.compare(String, String) will have a better performance.
+ * </p>
+ * <p>
+ * See the class documentation for an explanation about CollationKeys.
+ * </p>
+ * @param source the text String to be transformed into a collation key.
+ * @return the CollationKey for the given String based on this
+ * RuleBasedCollator's collation rules. If the source String is
+ * null, a null CollationKey is returned.
+ * @see CollationKey
+ * @see #compare(String, String)
+ * @see #getRawCollationKey
+ * @stable ICU 2.8
+ */
+ public CollationKey getCollationKey(String source) {
+ if (source == null) {
+ return null;
+ }
+ m_utilRawCollationKey_ = getRawCollationKey(source,
+ m_utilRawCollationKey_);
+ return new CollationKey(source, m_utilRawCollationKey_);
+ }
+
    /**
     * Gets the simpler form of a CollationKey for the String source following
     * the rules of this Collator and stores the result into the user provided
     * argument key.
     * If key has a internal byte array of length that's too small for the
     * result, the internal byte array will be grown to the exact required
     * size.
     * @param source the text String to be transformed into a RawCollationKey
     * @param key output RawCollationKey to store results
     * @return If key is null, a new instance of RawCollationKey will be
     *         created and returned, otherwise the user provided key will be
     *         returned.
     * @see #getCollationKey
     * @see #compare(String, String)
     * @see RawCollationKey
     * @stable ICU 2.8
     */
    public RawCollationKey getRawCollationKey(String source,
                                              RawCollationKey key)
    {
        if (source == null) {
            return null;
        }
        // Translate strength + case-level settings into per-level flags
        // (level 0 = case, 2 = secondary, 3 = tertiary, 4 = quaternary,
        // 5 = identical); primary is always produced.
        int strength = getStrength();
        m_utilCompare0_ = m_isCaseLevel_;
        //m_utilCompare1_ = true;
        m_utilCompare2_ = strength >= SECONDARY;
        m_utilCompare3_ = strength >= TERTIARY;
        m_utilCompare4_ = strength >= QUATERNARY;
        m_utilCompare5_ = strength == IDENTICAL;

        // Reset the per-level byte counters used while building the key.
        m_utilBytesCount0_ = 0;
        m_utilBytesCount1_ = 0;
        m_utilBytesCount2_ = 0;
        m_utilBytesCount3_ = 0;
        m_utilBytesCount4_ = 0;
        //m_utilBytesCount5_ = 0;
        //m_utilCount0_ = 0;
        //m_utilCount1_ = 0;
        m_utilCount2_ = 0;
        m_utilCount3_ = 0;
        m_utilCount4_ = 0;
        //m_utilCount5_ = 0;
        boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
        // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
        // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
        // high.
        int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
        byte hiragana4 = 0;
        if (m_isHiragana4_ && m_utilCompare4_) {
            // allocate one more space for hiragana, value for hiragana
            hiragana4 = (byte)commonBottom4;
            commonBottom4 ++;
        }

        int bottomCount4 = 0xFF - commonBottom4;
        // If we need to normalize, we'll do it all at once at the beginning!
        if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)
            != Normalizer.YES) {
            // if it is identical strength, we have to normalize the string to
            // NFD so that it will be appended correctly to the end of the sort
            // key
            source = Normalizer.decompose(source, false);
        }
        else if (getDecomposition() != NO_DECOMPOSITION
                 && Normalizer.quickCheck(source, Normalizer.FCD,0)
                 != Normalizer.YES) {
            // for the rest of the strength, if decomposition is on, FCD is
            // enough for us to work on.
            source = Normalizer.normalize(source,Normalizer.FCD);
        }
        // Produce the per-level bytes, then assemble them into the key.
        getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
                        bottomCount4);
        if (key == null) {
            key = new RawCollationKey();
        }
        getSortKey(source, doFrench, commonBottom4, bottomCount4, key);
        return key;
    }
+
+ /**
+ * Return true if an uppercase character is sorted before the corresponding lowercase character.
+ * See setCaseFirst(boolean) for details.
+ * @see #setUpperCaseFirst
+ * @see #setLowerCaseFirst
+ * @see #isLowerCaseFirst
+ * @see #setCaseFirstDefault
+ * @return true if upper cased characters are sorted before lower cased
+ * characters, false otherwise
+ * @stable ICU 2.8
+ */
+ public boolean isUpperCaseFirst()
+ {
+ return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
+ }
+
+ /**
+ * Return true if a lowercase character is sorted before the corresponding uppercase character.
+ * See setCaseFirst(boolean) for details.
+ * @see #setUpperCaseFirst
+ * @see #setLowerCaseFirst
+ * @see #isUpperCaseFirst
+ * @see #setCaseFirstDefault
+ * @return true lower cased characters are sorted before upper cased
+ * characters, false otherwise
+ * @stable ICU 2.8
+ */
+ public boolean isLowerCaseFirst()
+ {
+ return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
+ }
+
+ /**
+ * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
+ * NON_IGNORABLE.
+ * If return value is true, then the alternate handling attribute for the
+ * Collator is SHIFTED. Otherwise if return value is false, then the
+ * alternate handling attribute for the Collator is NON_IGNORABLE
+ * See setAlternateHandlingShifted(boolean) for more details.
+ * @return true or false
+ * @see #setAlternateHandlingShifted(boolean)
+ * @see #setAlternateHandlingDefault
+ * @stable ICU 2.8
+ */
+ public boolean isAlternateHandlingShifted()
+ {
+ return m_isAlternateHandlingShifted_;
+ }
+
+ /**
+ * Checks if case level is set to true.
+ * See setCaseLevel(boolean) for details.
+ * @return the case level mode
+ * @see #setCaseLevelDefault
+ * @see #isCaseLevel
+ * @see #setCaseLevel(boolean)
+ * @stable ICU 2.8
+ */
+ public boolean isCaseLevel()
+ {
+ return m_isCaseLevel_;
+ }
+
+ /**
+ * Checks if French Collation is set to true.
+ * See setFrenchCollation(boolean) for details.
+ * @return true if French Collation is set to true, false otherwise
+ * @see #setFrenchCollation(boolean)
+ * @see #setFrenchCollationDefault
+ * @stable ICU 2.8
+ */
+ public boolean isFrenchCollation()
+ {
+ return m_isFrenchCollation_;
+ }
+
+ /**
+ * Checks if the Hiragana Quaternary mode is set on.
+ * See setHiraganaQuaternary(boolean) for more details.
+ * @return flag true if Hiragana Quaternary mode is on, false otherwise
+ * @see #setHiraganaQuaternaryDefault
+ * @see #setHiraganaQuaternary(boolean)
+ * @stable ICU 2.8
+ */
+ public boolean isHiraganaQuaternary()
+ {
+ return m_isHiragana4_;
+ }
+
+ /**
+ * Gets the variable top value of a Collator.
+ * Lower 16 bits are undefined and should be ignored.
+ * @return the variable top value of a Collator.
+ * @see #setVariableTop
+ * @stable ICU 2.6
+ */
+ public int getVariableTop()
+ {
+ return m_variableTopValue_ << 16;
+ }
+
+ /**
+ * Method to retrieve the numeric collation value.
+ * When numeric collation is turned on, this Collator generates a collation
+ * key for the numeric value of substrings of digits. This is a way to get
+ * '100' to sort AFTER '2'
+ * @see #setNumericCollation
+ * @see #setNumericCollationDefault
+ * @return true if numeric collation is turned on, false otherwise
+ * @stable ICU 2.8
+ */
+ public boolean getNumericCollation()
+ {
+ return m_isNumericCollation_;
+ }
+
+ // public other methods -------------------------------------------------
+
+ /**
+ * Compares the equality of two RuleBasedCollator objects.
+ * RuleBasedCollator objects are equal if they have the same collation
+ * rules and the same attributes.
+ * @param obj the RuleBasedCollator to be compared to.
+ * @return true if this RuleBasedCollator has exactly the same
+ * collation behaviour as obj, false otherwise.
+ * @stable ICU 2.8
+ */
+ public boolean equals(Object obj)
+ {
+ if (obj == null) {
+ return false; // super does class check
+ }
+ if (this == obj) {
+ return true;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ RuleBasedCollator other = (RuleBasedCollator)obj;
+ // all other non-transient information is also contained in rules.
+ if (getStrength() != other.getStrength()
+ || getDecomposition() != other.getDecomposition()
+ || other.m_caseFirst_ != m_caseFirst_
+ || other.m_caseSwitch_ != m_caseSwitch_
+ || other.m_isAlternateHandlingShifted_
+ != m_isAlternateHandlingShifted_
+ || other.m_isCaseLevel_ != m_isCaseLevel_
+ || other.m_isFrenchCollation_ != m_isFrenchCollation_
+ || other.m_isHiragana4_ != m_isHiragana4_) {
+ return false;
+ }
+ boolean rules = m_rules_ == other.m_rules_;
+ if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
+ rules = m_rules_.equals(other.m_rules_);
+ }
+ if (!rules || !ICUDebug.enabled("collation")) {
+ return rules;
+ }
+ if (m_addition3_ != other.m_addition3_
+ || m_bottom3_ != other.m_bottom3_
+ || m_bottomCount3_ != other.m_bottomCount3_
+ || m_common3_ != other.m_common3_
+ || m_isSimple3_ != other.m_isSimple3_
+ || m_mask3_ != other.m_mask3_
+ || m_minContractionEnd_ != other.m_minContractionEnd_
+ || m_minUnsafe_ != other.m_minUnsafe_
+ || m_top3_ != other.m_top3_
+ || m_topCount3_ != other.m_topCount3_
+ || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
+ return false;
+ }
+ if (!m_trie_.equals(other.m_trie_)) {
+ // we should use the trie iterator here, but then this part is
+ // only used in the test.
+ for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)
+ {
+ int v = m_trie_.getCodePointValue(i);
+ int otherv = other.m_trie_.getCodePointValue(i);
+ if (v != otherv) {
+ int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
+ if (mask == (otherv & 0xff000000)) {
+ v &= 0xffffff;
+ otherv &= 0xffffff;
+ if (mask == 0xf1000000) {
+ v -= (m_expansionOffset_ << 4);
+ otherv -= (other.m_expansionOffset_ << 4);
+ }
+ else if (mask == 0xf2000000) {
+ v -= m_contractionOffset_;
+ otherv -= other.m_contractionOffset_;
+ }
+ if (v == otherv) {
+ continue;
+ }
+ }
+ return false;
+ }
+ }
+ }
+ if (Arrays.equals(m_contractionCE_, other.m_contractionCE_)
+ && Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
+ && Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
+ && Arrays.equals(m_expansion_, other.m_expansion_)
+ && Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
+ // not comparing paddings
+ for (int i = 0; i < m_expansionEndCE_.length; i ++) {
+ if (m_expansionEndCEMaxSize_[i]
+ != other.m_expansionEndCEMaxSize_[i]) {
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Generates a unique hash code for this RuleBasedCollator.
+ * @return the unique hash code for this Collator
+ * @stable ICU 2.8
+ */
+ public int hashCode()
+ {
+ String rules = getRules();
+ if (rules == null) {
+ rules = "";
+ }
+ return rules.hashCode();
+ }
+
+    /**
+     * Compares the source text String to the target text String according to
+     * the collation rules, strength and decomposition mode for this
+     * RuleBasedCollator.
+     * Returns an integer less than, equal to or greater than zero depending
+     * on whether the source String is less than, equal to or greater than
+     * the target String. See the Collator class description for an example
+     * of use.
+     * <p>
+     * General recommendation: <br>
+     * If comparisons are to be done with the same String multiple times, it
+     * is more efficient to generate CollationKeys for the Strings and use
+     * CollationKey.compareTo(CollationKey) for the comparisons. If object
+     * instantiation must also be reduced, generate a RawCollationKey via
+     * RuleBasedCollator.getRawCollationKey and reuse it; RawCollationKey
+     * exposes its byte representation directly and provides
+     * RawCollationKey.compareTo for key comparisons. If each String is
+     * compared only once, this method performs best.
+     * </p>
+     * @param source the source text String.
+     * @param target the target text String.
+     * @return a negative integer if source sorts before target, zero if
+     *         source and target are equal, a positive integer if source
+     *         sorts after target.
+     * @see CollationKey
+     * @see #getCollationKey
+     * @stable ICU 2.8
+     */
+    public int compare(String source, String target)
+    {
+        // Identity implies equality (also covers source == target == null).
+        if (source == target) {
+            return 0;
+        }
+
+        // Skip over any common leading portion before doing collation work.
+        int offset = getFirstUnmatchedOffset(source, target);
+
+        if (!latinOneUse_) {
+            return compareRegular(source, target, offset);
+        }
+        // The Latin-1 fast path is only valid while both strings remain
+        // within the Latin-1 range at the first point of difference.
+        boolean sourceBeyondLatin1 = offset < source.length()
+                && source.charAt(offset) > ENDOFLATINONERANGE_;
+        boolean targetBeyondLatin1 = offset < target.length()
+                && target.charAt(offset) > ENDOFLATINONERANGE_;
+        if (sourceBeyondLatin1 || targetBeyondLatin1) {
+            return compareRegular(source, target, offset);
+        }
+        return compareUseLatin1(source, target, offset);
+    }
+
+ // package private inner interfaces --------------------------------------
+
+    /**
+     * Attribute values to be used when setting the Collator options.
+     * NOTE(review): the specific numeric values (with gaps, e.g. 16/17,
+     * 20/21, 24/25) look chosen to match ICU4C's UColAttributeValue enum —
+     * confirm before renumbering.
+     */
+    static interface AttributeValue
+    {
+        /**
+         * Indicates that the default attribute value will be used.
+         * See individual attribute for details on its default value.
+         */
+        static final int DEFAULT_ = -1;
+        /**
+         * Primary collation strength
+         */
+        static final int PRIMARY_ = Collator.PRIMARY;
+        /**
+         * Secondary collation strength
+         */
+        static final int SECONDARY_ = Collator.SECONDARY;
+        /**
+         * Tertiary collation strength
+         */
+        static final int TERTIARY_ = Collator.TERTIARY;
+        /**
+         * Default collation strength
+         */
+        static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
+        /**
+         * Internal use for strength checks in Collation elements
+         */
+        static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
+        /**
+         * Quaternary collation strength
+         */
+        static final int QUATERNARY_ = 3;
+        /**
+         * Identical collation strength
+         */
+        static final int IDENTICAL_ = Collator.IDENTICAL;
+        /**
+         * Internal use for strength checks
+         */
+        static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
+        /**
+         * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
+         * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
+         */
+        static final int OFF_ = 16;
+        /**
+         * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
+         * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
+         */
+        static final int ON_ = 17;
+        /**
+         * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
+         */
+        static final int SHIFTED_ = 20;
+        /**
+         * Valid for ALTERNATE_HANDLING. Alternate handling will be non
+         * ignorable
+         */
+        static final int NON_IGNORABLE_ = 21;
+        /**
+         * Valid for CASE_FIRST - lower case sorts before upper case
+         */
+        static final int LOWER_FIRST_ = 24;
+        /**
+         * Upper case sorts before lower case
+         */
+        static final int UPPER_FIRST_ = 25;
+        /**
+         * Number of attribute values
+         */
+        static final int LIMIT_ = 29;
+    }
+
+    /**
+     * Attributes that the collation service understands. All the attributes
+     * can take DEFAULT value, as well as the values specific to each one.
+     * Values for these attributes come from {@link AttributeValue}.
+     */
+    static interface Attribute
+    {
+        /**
+         * Attribute for direction of secondary weights - used in French.
+         * Acceptable values are ON, which results in secondary weights being
+         * considered backwards and OFF which treats secondary weights in the
+         * order they appear.
+         */
+        static final int FRENCH_COLLATION_ = 0;
+        /**
+         * Attribute for handling variable elements. Acceptable values are
+         * NON_IGNORABLE (default) which treats all the codepoints with
+         * non-ignorable primary weights in the same way, and SHIFTED which
+         * causes codepoints with primary weights that are equal or below the
+         * variable top value to be ignored on primary level and moved to the
+         * quaternary level.
+         */
+        static final int ALTERNATE_HANDLING_ = 1;
+        /**
+         * Controls the ordering of upper and lower case letters. Acceptable
+         * values are OFF (default), which orders upper and lower case letters
+         * in accordance to their tertiary weights, UPPER_FIRST which forces
+         * upper case letters to sort before lower case letters, and
+         * LOWER_FIRST which does the opposite.
+         */
+        static final int CASE_FIRST_ = 2;
+        /**
+         * Controls whether an extra case level (positioned before the third
+         * level) is generated or not. Acceptable values are OFF (default),
+         * when case level is not generated, and ON which causes the case
+         * level to be generated. Contents of the case level are affected by
+         * the value of CASE_FIRST attribute. A simple way to ignore accent
+         * differences in a string is to set the strength to PRIMARY and
+         * enable case level.
+         */
+        static final int CASE_LEVEL_ = 3;
+        /**
+         * Controls whether the normalization check and necessary
+         * normalizations are performed. When set to OFF (default) no
+         * normalization check is performed. The correctness of the result is
+         * guaranteed only if the input data is in so-called FCD form (see
+         * users manual for more info). When set to ON, an incremental check
+         * is performed to see whether the input data is in the FCD form. If
+         * the data is not in the FCD form, incremental NFD normalization is
+         * performed.
+         */
+        static final int NORMALIZATION_MODE_ = 4;
+        /**
+         * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
+         * QUATERNARY or IDENTICAL. The usual strength for most locales
+         * (except Japanese) is tertiary. Quaternary strength is useful when
+         * combined with shifted setting for alternate handling attribute and
+         * for JIS x 4061 collation, when it is used to distinguish between
+         * Katakana and Hiragana (this is achieved by setting the
+         * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is
+         * affected only by the number of non ignorable code points in the
+         * string. Identical strength is rarely useful, as it amounts to
+         * codepoints of the NFD form of the string.
+         */
+        static final int STRENGTH_ = 5;
+        /**
+         * When turned on, this attribute positions Hiragana before all
+         * non-ignorables on quaternary level. This is a sneaky way to produce
+         * JIS sort order.
+         */
+        static final int HIRAGANA_QUATERNARY_MODE_ = 6;
+        /**
+         * Attribute count
+         */
+        static final int LIMIT_ = 7;
+    }
+
+    /**
+     * DataManipulate singleton handed to the collation trie so it can fold
+     * a lead surrogate's collation element into the offset of its trail
+     * surrogate data.
+     */
+    static class DataManipulate implements Trie.DataManipulate
+    {
+        // public methods ----------------------------------------------------
+
+        /**
+         * Internal method called to parse a lead surrogate's ce for the offset
+         * to the next trail surrogate data.
+         * @param ce collation element of the lead surrogate
+         * @return data offset or 0 for the next trail surrogate
+         * @stable ICU 2.8
+         */
+        public final int getFoldingOffset(int ce)
+        {
+            // Only special CEs tagged as surrogate CEs carry a folding
+            // offset in their low 24 bits; everything else folds to 0.
+            if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
+                return (ce & 0xFFFFFF);
+            }
+            return 0;
+        }
+
+        /**
+         * Get singleton object
+         */
+        public static final DataManipulate getInstance()
+        {
+            return m_instance_;
+        }
+
+        // private data member ----------------------------------------------
+
+        /**
+         * Singleton instance. Created eagerly: the previous unsynchronized
+         * lazy initialization was a check-then-act race that could create
+         * multiple instances under concurrent first use. The class is
+         * stateless and trivially cheap to construct, so eager creation in
+         * the (JVM-guaranteed thread-safe) static initializer costs nothing
+         * and makes getInstance() safe to call from any thread.
+         */
+        private static final DataManipulate m_instance_ = new DataManipulate();
+
+        // private constructor ----------------------------------------------
+
+        /**
+         * private to prevent initialization
+         */
+        private DataManipulate()
+        {
+        }
+    }
+
+    /**
+     * UCAConstants: boundary collation elements and primary ranges read from
+     * the UCA data file.
+     * NOTE(review): each two-element array presumably holds a CE and its
+     * continuation CE — confirm against CollatorReader, which fills these in.
+     */
+    static final class UCAConstants
+    {
+        int FIRST_TERTIARY_IGNORABLE_[] = new int[2];   // 0x00000000
+        int LAST_TERTIARY_IGNORABLE_[] = new int[2];    // 0x00000000
+        int FIRST_PRIMARY_IGNORABLE_[] = new int[2];    // 0x00008705
+        int FIRST_SECONDARY_IGNORABLE_[] = new int[2];  // 0x00000000
+        int LAST_SECONDARY_IGNORABLE_[] = new int[2];   // 0x00000500
+        int LAST_PRIMARY_IGNORABLE_[] = new int[2];     // 0x0000DD05
+        int FIRST_VARIABLE_[] = new int[2];             // 0x05070505
+        int LAST_VARIABLE_[] = new int[2];              // 0x13CF0505
+        int FIRST_NON_VARIABLE_[] = new int[2];         // 0x16200505
+        int LAST_NON_VARIABLE_[] = new int[2];          // 0x767C0505
+        int RESET_TOP_VALUE_[] = new int[2];            // 0x9F000303
+        int FIRST_IMPLICIT_[] = new int[2];
+        int LAST_IMPLICIT_[] = new int[2];
+        int FIRST_TRAILING_[] = new int[2];
+        int LAST_TRAILING_[] = new int[2];
+        int PRIMARY_TOP_MIN_;
+        int PRIMARY_IMPLICIT_MIN_;   // 0xE8000000
+        int PRIMARY_IMPLICIT_MAX_;   // 0xF0000000
+        int PRIMARY_TRAILING_MIN_;   // 0xE8000000
+        int PRIMARY_TRAILING_MAX_;   // 0xF0000000
+        int PRIMARY_SPECIAL_MIN_;    // 0xE8000000
+        int PRIMARY_SPECIAL_MAX_;    // 0xF0000000
+    }
+
+    // package private data member -------------------------------------------
+
+    static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
+    static final byte BYTE_COMMON_ = (byte)0x05;
+    static final int COMMON_TOP_2_ = 0x86; // int for unsignedness
+    static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
+    static final int COMMON_BOTTOM_3 = 0x05;
+    /**
+     * Case strength mask
+     */
+    static final int CE_CASE_BIT_MASK_ = 0xC0;
+    // Tag nibble of a special CE occupies bits 24..27.
+    static final int CE_TAG_SHIFT_ = 24;
+    static final int CE_TAG_MASK_ = 0x0F000000;
+
+    // Top-nibble pattern marking a collation element as "special".
+    static final int CE_SPECIAL_FLAG_ = 0xF0000000;
+    /**
+     * Lead surrogate that is tailored and doesn't start a contraction
+     */
+    static final int CE_SURROGATE_TAG_ = 5;
+    /**
+     * Mask to get the primary strength of the collation element
+     */
+    static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
+    /**
+     * Mask to get the secondary strength of the collation element
+     */
+    static final int CE_SECONDARY_MASK_ = 0xFF00;
+    /**
+     * Mask to get the tertiary strength of the collation element
+     */
+    static final int CE_TERTIARY_MASK_ = 0xFF;
+    /**
+     * Primary strength shift
+     */
+    static final int CE_PRIMARY_SHIFT_ = 16;
+    /**
+     * Secondary strength shift
+     */
+    static final int CE_SECONDARY_SHIFT_ = 8;
+    /**
+     * Continuation marker
+     */
+    static final int CE_CONTINUATION_MARKER_ = 0xC0;
+
+    /**
+     * Size of collator raw data headers and options before the expansion
+     * data. This is used when expansion ces are to be retrieved. ICU4C uses
+     * the expansion offset starting from UCollator.UColHeader, hence ICU4J
+     * will have to minus that off to get the right expansion ce offset. In
+     * number of ints.
+     */
+    int m_expansionOffset_;
+    /**
+     * Size of collator raw data headers, options and expansions before
+     * contraction data. This is used when contraction ces are to be retrieved.
+     * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
+     * ICU4J will have to minus that off to get the right contraction ce
+     * offset. In number of chars.
+     */
+    int m_contractionOffset_;
+    /**
+     * Flag indicator if Jamo is special
+     */
+    boolean m_isJamoSpecial_;
+
+    // Collator options ------------------------------------------------------
+
+    // Defaults restored by setting an attribute to AttributeValue.DEFAULT_.
+    int m_defaultVariableTopValue_;
+    boolean m_defaultIsFrenchCollation_;
+    boolean m_defaultIsAlternateHandlingShifted_;
+    int m_defaultCaseFirst_;
+    boolean m_defaultIsCaseLevel_;
+    int m_defaultDecomposition_;
+    int m_defaultStrength_;
+    boolean m_defaultIsHiragana4_;
+    boolean m_defaultIsNumericCollation_;
+
+    /**
+     * Value of the variable top
+     */
+    int m_variableTopValue_;
+    /**
+     * Attribute for special Hiragana
+     */
+    boolean m_isHiragana4_;
+    /**
+     * Case sorting customization
+     */
+    int m_caseFirst_;
+    /**
+     * Numeric collation option
+     */
+    boolean m_isNumericCollation_;
+
+    // end Collator options --------------------------------------------------
+
+    /**
+     * Expansion table
+     */
+    int m_expansion_[];
+    /**
+     * Contraction index table
+     */
+    char m_contractionIndex_[];
+    /**
+     * Contraction CE table
+     */
+    int m_contractionCE_[];
+    /**
+     * Data trie
+     */
+    IntTrie m_trie_;
+    /**
+     * Table to store all collation elements that are the last element of an
+     * expansion. This is for use in StringSearch.
+     */
+    int m_expansionEndCE_[];
+    /**
+     * Table to store the maximum size of any expansions that end with the
+     * corresponding collation element in m_expansionEndCE_. For use in
+     * StringSearch too
+     */
+    byte m_expansionEndCEMaxSize_[];
+    /**
+     * Heuristic table to store information on whether a char character is
+     * considered "unsafe". "Unsafe" character are combining marks or those
+     * belonging to some contraction sequence from the offset 1 onwards.
+     * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
+     * unsafe. If we have another contraction "ZA" with the one above, then
+     * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
+     */
+    byte m_unsafe_[];
+    /**
+     * Table to store information on whether a codepoint can occur as the last
+     * character in a contraction
+     */
+    byte m_contractionEnd_[];
+    /**
+     * Original collation rules
+     */
+    String m_rules_;
+    /**
+     * The smallest "unsafe" codepoint
+     */
+    char m_minUnsafe_;
+    /**
+     * The smallest codepoint that could be the end of a contraction
+     */
+    char m_minContractionEnd_;
+    /**
+     * General version of the collator
+     */
+    VersionInfo m_version_;
+    /**
+     * UCA version
+     */
+    VersionInfo m_UCA_version_;
+    /**
+     * UCD version
+     */
+    VersionInfo m_UCD_version_;
+
+    /**
+     * UnicodeData.txt property object; null if the UCA data could not be
+     * loaded during static initialization (see checkUCA()).
+     */
+    static final RuleBasedCollator UCA_;
+    /**
+     * UCA Constants
+     */
+    static final UCAConstants UCA_CONSTANTS_;
+    /**
+     * Table for UCA and builder use
+     */
+    static final char UCA_CONTRACTIONS_[];
+
+    // Set true at the end of the static initializer; together with a null
+    // UCA_ it signals that UCA data loading failed (see checkUCA()).
+    private static boolean UCA_INIT_COMPLETE;
+
+    /**
+     * Implicit generator
+     */
+    static final ImplicitCEGenerator impCEGen_;
+// /**
+// * Implicit constants
+// */
+// static final int IMPLICIT_BASE_BYTE_;
+// static final int IMPLICIT_LIMIT_BYTE_;
+// static final int IMPLICIT_4BYTE_BOUNDARY_;
+// static final int LAST_MULTIPLIER_;
+// static final int LAST2_MULTIPLIER_;
+// static final int IMPLICIT_BASE_3BYTE_;
+// static final int IMPLICIT_BASE_4BYTE_;
+// static final int BYTES_TO_AVOID_ = 3;
+// static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
+// static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
+// /**
+// * Room for intervening, without expanding to 5 bytes
+// */
+// static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
+// static final int IMPLICIT_3BYTE_COUNT_ = 1;
+//
+    static final byte SORT_LEVEL_TERMINATOR_ = 1;
+
+// These are values from UCA required for
+// implicit generation and suppressing sort key compression
+// they should regularly be in the UCA, but if one
+// is running without UCA, it could be a problem
+    static final int maxRegularPrimary = 0xA0;
+    static final int minImplicitPrimary = 0xE0;
+    static final int maxImplicitPrimary = 0xE4;
+
+
+    // block to initialise character property database
+    static
+    {
+        // take pains to let static class init succeed, otherwise the class itself won't exist and
+        // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
+        // we can't load the UCA data.
+        //
+        // All failures are therefore swallowed here on purpose; checkUCA()
+        // re-detects the failure (UCA_INIT_COMPLETE && UCA_ == null) and
+        // throws from the constructors instead.
+
+        RuleBasedCollator iUCA_ = null;
+        UCAConstants iUCA_CONSTANTS_ = null;
+        char iUCA_CONTRACTIONS_[] = null;
+        ImplicitCEGenerator iimpCEGen_ = null;
+        try
+        {
+            // !!! note what's going on here...
+            // even though the static init of the class is not yet complete, we
+            // instantiate an instance of the class. So we'd better be sure that
+            // instantiation doesn't rely on the static initialization that's
+            // not complete yet!
+            iUCA_ = new RuleBasedCollator();
+            iUCA_CONSTANTS_ = new UCAConstants();
+            iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);
+
+            // called before doing canonical closure for the UCA.
+            iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
+            //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
+            iUCA_.init();
+            ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
+            iUCA_.m_rules_ = (String)rb.getObject("UCARules");
+        }
+        catch (MissingResourceException ex)
+        {
+// throw ex;
+        }
+        catch (IOException e)
+        {
+            // e.printStackTrace();
+// throw new MissingResourceException(e.getMessage(),"","");
+        }
+
+        // Publish whatever was loaded (possibly all null on failure) into
+        // the final fields, then mark initialization as finished.
+        UCA_ = iUCA_;
+        UCA_CONSTANTS_ = iUCA_CONSTANTS_;
+        UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
+        impCEGen_ = iimpCEGen_;
+
+        UCA_INIT_COMPLETE = true;
+    }
+
+
+    /**
+     * Verifies that the UCA data loaded by the static initializer is
+     * actually available, failing fast so constructors do not proceed with
+     * a collator that has no UCA tables.
+     * @throws MissingResourceException if static init completed without UCA data
+     */
+    private static void checkUCA() throws MissingResourceException {
+        if (!UCA_INIT_COMPLETE || UCA_ != null) {
+            return;
+        }
+        throw new MissingResourceException("Collator UCA data unavailable", "", "");
+    }
+
+ // package private constructors ------------------------------------------
+
+    /**
+     * <p>Package-private constructor for use by subclasses.
+     * Public access to creating Collators is handled by the API
+     * Collator.getInstance() or RuleBasedCollator(String rules).
+     * </p>
+     * <p>
+     * This constructor constructs the UCA collator internally.
+     * Fails with a MissingResourceException (via checkUCA) when the UCA
+     * data could not be loaded during static initialization.
+     * </p>
+     */
+    RuleBasedCollator()
+    {
+        checkUCA();
+        initUtility(false);
+    }
+
+    /**
+     * Constructs a RuleBasedCollator from the argument locale.
+     * If no resource bundle is associated with the locale, UCA is used
+     * instead.
+     * Resolution order: locale's "collation" keyword, else the bundle's
+     * "collations/default"; a prebuilt binary image ("%%CollationBin") is
+     * preferred over rebuilding from the rule "Sequence", but is discarded
+     * if its UCA/UCD versions do not match the runtime's UCA.
+     * @param locale
+     */
+    RuleBasedCollator(ULocale locale)
+    {
+        checkUCA();
+        ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
+        initUtility(false);
+        if (rb != null) {
+            try {
+                // Use keywords, if supplied for lookup
+                String collkey = locale.getKeywordValue("collation");
+                if(collkey == null) {
+                    collkey = rb.getStringWithFallback("collations/default");
+                }
+
+                // collations/default will always give a string back
+                // keyword for the real collation data
+                // if "collations/collkey" will return null if collkey == null
+                ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
+                if (elements != null) {
+                    // TODO: Determine actual & valid locale correctly
+                    ULocale uloc = rb.getULocale();
+                    setLocale(uloc, uloc);
+
+                    m_rules_ = elements.getString("Sequence");
+                    ByteBuffer buf = elements.get("%%CollationBin").getBinary();
+                    // %%CollationBin
+                    if(buf!=null){
+                        // m_rules_ = (String)rules[1][1];
+                        byte map[] = buf.array();
+                        CollatorReader.initRBC(this, map);
+                        /*
+                        BufferedInputStream input =
+                            new BufferedInputStream(
+                                new ByteArrayInputStream(map));
+                        /*
+                        CollatorReader reader = new CollatorReader(input, false);
+                        if (map.length > MIN_BINARY_DATA_SIZE_) {
+                            reader.read(this, null);
+                        }
+                        else {
+                            reader.readHeader(this);
+                            reader.readOptions(this);
+                            // duplicating UCA_'s data
+                            setWithUCATables();
+                        }
+                        */
+                        // at this point, we have read in the collator
+                        // now we need to check whether the binary image has
+                        // the right UCA and other versions
+                        if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||
+                           !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
+                            // stale binary image: rebuild from the rules
+                            init(m_rules_);
+                            return;
+                        }
+                        init();
+                        return;
+                    }
+                    else {
+                        // no binary image: build from the rule string
+                        init(m_rules_);
+                        return;
+                    }
+                }
+            }
+            catch (Exception e) {
+                // e.printStackTrace();
+                // Deliberately broad: any failure while reading the bundle
+                // falls back to plain UCA below.
+                // if failed use UCA.
+            }
+        }
+        setWithUCAData();
+    }
+
+ // package private methods -----------------------------------------------
+
+    /**
+     * Sets this collator to share the UCA's lookup tables. Note: collation
+     * options are not copied here, only table references and the heuristic
+     * bounds derived from them.
+     */
+    final void setWithUCATables()
+    {
+        // Offsets into the raw data image.
+        m_expansionOffset_ = UCA_.m_expansionOffset_;
+        m_contractionOffset_ = UCA_.m_contractionOffset_;
+        // Core lookup structures: trie plus expansion/contraction tables.
+        m_trie_ = UCA_.m_trie_;
+        m_expansion_ = UCA_.m_expansion_;
+        m_expansionEndCE_ = UCA_.m_expansionEndCE_;
+        m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
+        m_contractionIndex_ = UCA_.m_contractionIndex_;
+        m_contractionCE_ = UCA_.m_contractionCE_;
+        m_contractionEnd_ = UCA_.m_contractionEnd_;
+        // Heuristic tables and their lower bounds.
+        m_unsafe_ = UCA_.m_unsafe_;
+        m_minUnsafe_ = UCA_.m_minUnsafe_;
+        m_minContractionEnd_ = UCA_.m_minContractionEnd_;
+    }
+
+    /**
+     * Sets this collator to use all the options and tables in UCA.
+     * The latinOneFailed_ flag brackets the copy: it is raised first and
+     * cleared only after all state has been copied — presumably to keep the
+     * Latin-1 fast path disabled while state is in flux; confirm against
+     * the latin-1 table regeneration code.
+     */
+    final void setWithUCAData()
+    {
+        latinOneFailed_ = true;
+
+        m_addition3_ = UCA_.m_addition3_;
+        m_bottom3_ = UCA_.m_bottom3_;
+        m_bottomCount3_ = UCA_.m_bottomCount3_;
+        m_caseFirst_ = UCA_.m_caseFirst_;
+        m_caseSwitch_ = UCA_.m_caseSwitch_;
+        m_common3_ = UCA_.m_common3_;
+        m_contractionOffset_ = UCA_.m_contractionOffset_;
+        setDecomposition(UCA_.getDecomposition());
+        m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
+        m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
+        m_defaultIsAlternateHandlingShifted_
+            = UCA_.m_defaultIsAlternateHandlingShifted_;
+        m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
+        m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
+        m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
+        m_defaultStrength_ = UCA_.m_defaultStrength_;
+        m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
+        m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
+        m_expansionOffset_ = UCA_.m_expansionOffset_;
+        m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
+        m_isCaseLevel_ = UCA_.m_isCaseLevel_;
+        m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
+        m_isHiragana4_ = UCA_.m_isHiragana4_;
+        m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
+        m_isSimple3_ = UCA_.m_isSimple3_;
+        m_mask3_ = UCA_.m_mask3_;
+        m_minContractionEnd_ = UCA_.m_minContractionEnd_;
+        m_minUnsafe_ = UCA_.m_minUnsafe_;
+        m_rules_ = UCA_.m_rules_;
+        setStrength(UCA_.getStrength());
+        m_top3_ = UCA_.m_top3_;
+        m_topCount3_ = UCA_.m_topCount3_;
+        m_variableTopValue_ = UCA_.m_variableTopValue_;
+        m_isNumericCollation_ = UCA_.m_isNumericCollation_;
+        setWithUCATables();
+        latinOneFailed_ = false;
+    }
+
+    /**
+     * Test whether a char character is potentially "unsafe" for use as a
+     * collation starting point. "Unsafe" characters are combining marks or
+     * those belonging to some contraction sequence from offset 1 onwards.
+     * E.g. if "ABC" is the only contraction, then 'B' and
+     * 'C' are considered unsafe. If we have another contraction "ZA" with
+     * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
+     * @param ch character to determine
+     * @return true if ch is unsafe, false otherwise
+     */
+    final boolean isUnsafe(char ch)
+    {
+        // Nothing below the smallest unsafe character can be unsafe.
+        if (ch < m_minUnsafe_) {
+            return false;
+        }
+        if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
+            // Surrogates are always treated as unsafe.
+            if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) {
+                return true;
+            }
+            // Fold characters beyond the table into the hashed overflow area.
+            ch = (char)((ch & HEURISTIC_OVERFLOW_MASK_)
+                        + HEURISTIC_OVERFLOW_OFFSET_);
+        }
+        // One bit per character: byte index from the high bits of ch, bit
+        // index from its low bits.
+        int bits = m_unsafe_[ch >> HEURISTIC_SHIFT_];
+        return ((bits >> (ch & HEURISTIC_MASK_)) & 1) != 0;
+    }
+
+    /**
+     * Approximate determination if a char character is at a contraction end.
+     * Guaranteed to be true if a character is at the end of a contraction,
+     * otherwise it is not deterministic.
+     * @param ch character to be determined
+     */
+    final boolean isContractionEnd(char ch)
+    {
+        // Trail surrogates are always treated as possible contraction ends.
+        if (UTF16.isTrailSurrogate(ch)) {
+            return true;
+        }
+        // Nothing below the smallest contraction-end character qualifies.
+        if (ch < m_minContractionEnd_) {
+            return false;
+        }
+        if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
+            // Fold characters beyond the table into the hashed overflow area.
+            ch = (char)((ch & HEURISTIC_OVERFLOW_MASK_)
+                        + HEURISTIC_OVERFLOW_OFFSET_);
+        }
+        // One bit per character, same layout as m_unsafe_.
+        int bits = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
+        return ((bits >> (ch & HEURISTIC_MASK_)) & 1) != 0;
+    }
+
+    /**
+     * Retrieve the tag of a special ce
+     * @param ce ce to test
+     * @return tag of ce
+     */
+    static int getTag(int ce)
+    {
+        // Isolate the tag nibble first, then shift it into the low bits.
+        int tagBits = ce & CE_TAG_MASK_;
+        return tagBits >> CE_TAG_SHIFT_;
+    }
+
+    /**
+     * Checking if ce is special
+     * @param ce to check
+     * @return true if ce is special
+     */
+    static boolean isSpecial(int ce)
+    {
+        // Special CEs carry the full flag pattern in their top nibble.
+        int flagBits = ce & CE_SPECIAL_FLAG_;
+        return flagBits == CE_SPECIAL_FLAG_;
+    }
+
+    /**
+     * Checks if the argument ce is a continuation
+     * @param ce collation element to test
+     * @return true if ce is a continuation
+     */
+    static final boolean isContinuation(int ce)
+    {
+        // NULLORDER is explicitly excluded before testing the continuation
+        // bit pattern.
+        if (ce == CollationElementIterator.NULLORDER) {
+            return false;
+        }
+        return (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
+    }
+
+    // private inner classes ------------------------------------------------
+
+    // private variables -----------------------------------------------------
+
+    /**
+     * The smallest natural unsafe or contraction end char character before
+     * tailoring.
+     * This is a combining mark.
+     */
+    private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
+    /**
+     * Heuristic table size in bytes, 1 bit per char: 32 bytes cover the
+     * Latin-1 chars directly, the remainder (a power of two) hashes the rest
+     * of the chars.
+     */
+    private static final char HEURISTIC_SIZE_ = 1056;
+    /**
+     * Mask value down to "some power of two" - 1,
+     * number of bits, not num of bytes.
+     */
+    private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
+    /**
+     * Unsafe character shift
+     */
+    private static final int HEURISTIC_SHIFT_ = 3;
+    /**
+     * Unsafe character addition for character too large, it has to be folded
+     * then incremented.
+     */
+    private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
+    /**
+     * Mask value to get offset in heuristic table.
+     */
+    private static final char HEURISTIC_MASK_ = 7;
+
+    private int m_caseSwitch_;
+    private int m_common3_;
+    private int m_mask3_;
+    /**
+     * When switching case, we need to add or subtract different values.
+     */
+    private int m_addition3_;
+    /**
+     * Upper range when compressing
+     */
+    private int m_top3_;
+    /**
+     * Lower range when compressing
+     */
+    private int m_bottom3_;
+    private int m_topCount3_;
+    private int m_bottomCount3_;
+    /**
+     * Case first constants
+     */
+    private static final int CASE_SWITCH_ = 0xC0;
+    private static final int NO_CASE_SWITCH_ = 0;
+    /**
+     * Case level constants
+     */
+    private static final int CE_REMOVE_CASE_ = 0x3F;
+    private static final int CE_KEEP_CASE_ = 0xFF;
+    /**
+     * Case strength mask
+     */
+    private static final int CE_CASE_MASK_3_ = 0xFF;
+    /**
+     * Sortkey size factor. Values can be changed.
+     */
+    private static final double PROPORTION_2_ = 0.5;
+    private static final double PROPORTION_3_ = 0.667;
+
+    // These values come from the UCA ----------------------------------------
+
+    /**
+     * This is an enum that lists magic special byte values from the
+     * fractional UCA
+     */
+    //private static final byte BYTE_ZERO_ = 0x0;
+    //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
+    //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
+    private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
+    /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
+    //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
+    static final byte CODAN_PLACEHOLDER = 0x27;
+    //private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C;
+    private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D;
+    private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
+    private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
+    private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
+    private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
+    private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
+    private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
+    private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
+    private static final int COMMON_BOTTOM_3_ = 0x05;
+    private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
+    private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
+        COMMON_BOTTOM_3_;
+    private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
+    private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
+    private static final int COMMON_2_ = COMMON_BOTTOM_2_;
+    private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
+    private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
+    //private static final int COMMON_4_ = (byte)0xFF;
+
+
+
+    /*
+     * Minimum size required for the binary collation data in bytes.
+     * Size of UCA header + size of options to 4 bytes
+     */
+    //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
+
+    /**
+     * If this collator is to generate only simple tertiaries for fast path
+     */
+    private boolean m_isSimple3_;
+
+    /**
+     * French collation sorting flag
+     */
+    private boolean m_isFrenchCollation_;
+    /**
+     * Flag indicating if shifted is requested for Quaternary alternate
+     * handling. If this is not true, the default for alternate handling will
+     * be non-ignorable.
+     */
+    private boolean m_isAlternateHandlingShifted_;
+    /**
+     * Extra case level for sorting
+     */
+    private boolean m_isCaseLevel_;
+
+    // Initial capacities for the per-level sort key buffers.
+    private static final int SORT_BUFFER_INIT_SIZE_ = 128;
+    private static final int SORT_BUFFER_INIT_SIZE_1_ =
+        SORT_BUFFER_INIT_SIZE_ << 3;
+    private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
+    private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
+    private static final int SORT_BUFFER_INIT_SIZE_CASE_ =
+        SORT_BUFFER_INIT_SIZE_ >> 2;
+    private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
+
+    private static final int CE_CONTINUATION_TAG_ = 0xC0;
+    private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
+
+    private static final int LAST_BYTE_MASK_ = 0xFF;
+
+    //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
+    //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
+
+    private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;
+    private static final byte SORT_CASE_SHIFT_START_ = (byte)7;
+
+    /**
+     * CE buffer size
+     */
+    private static final int CE_BUFFER_SIZE_ = 512;
+
+    // variables for Latin-1 processing
+    boolean latinOneUse_ = false;
+    boolean latinOneRegenTable_ = false;
+    boolean latinOneFailed_ = false;
+
+    int latinOneTableLen_ = 0;
+    int latinOneCEs_[] = null;
+    /**
+     * Bunch of utility iterators
+     */
+    private StringUCharacterIterator m_srcUtilIter_;
+    private CollationElementIterator m_srcUtilColEIter_;
+    private StringUCharacterIterator m_tgtUtilIter_;
+    private CollationElementIterator m_tgtUtilColEIter_;
+    /**
+     * Utility comparison flags: which levels (case/2nd/3rd/4th/identical)
+     * participate in the current comparison.
+     */
+    private boolean m_utilCompare0_;
+    //private boolean m_utilCompare1_;
+    private boolean m_utilCompare2_;
+    private boolean m_utilCompare3_;
+    private boolean m_utilCompare4_;
+    private boolean m_utilCompare5_;
+    /**
+     * Utility byte buffer
+     */
+    private byte m_utilBytes0_[];
+    private byte m_utilBytes1_[];
+    private byte m_utilBytes2_[];
+    private byte m_utilBytes3_[];
+    private byte m_utilBytes4_[];
+    //private byte m_utilBytes5_[];
+    private RawCollationKey m_utilRawCollationKey_;
+
+    private int m_utilBytesCount0_;
+    private int m_utilBytesCount1_;
+    private int m_utilBytesCount2_;
+    private int m_utilBytesCount3_;
+    private int m_utilBytesCount4_;
+    //private int m_utilBytesCount5_;
+    //private int m_utilCount0_;
+    //private int m_utilCount1_;
+    private int m_utilCount2_;
+    private int m_utilCount3_;
+    private int m_utilCount4_;
+    //private int m_utilCount5_;
+
+    private int m_utilFrenchStart_;
+    private int m_utilFrenchEnd_;
+
+    /**
+     * Preparing the CE buffers. will be filled during the primary phase
+     */
+    private int m_srcUtilCEBuffer_[];
+    private int m_tgtUtilCEBuffer_[];
+    private int m_srcUtilCEBufferSize_;
+    private int m_tgtUtilCEBufferSize_;
+
+    private int m_srcUtilContOffset_;
+    private int m_tgtUtilContOffset_;
+
+    private int m_srcUtilOffset_;
+    private int m_tgtUtilOffset_;
+
+ // private methods -------------------------------------------------------
+
+    /**
+     * Builds this collator from a rule string: starts from the full UCA
+     * options and tables, then applies the parsed tailoring on top.
+     * @param rules tailoring rules to apply
+     * @throws Exception propagated from rule parsing/building
+     */
+    private void init(String rules) throws Exception
+    {
+        setWithUCAData();
+        CollationParsedRuleBuilder builder
+            = new CollationParsedRuleBuilder(rules);
+        builder.setRules(this);
+        m_rules_ = rules;
+        init();
+        initUtility(false);
+    }
+
+    /**
+     * Full (non-Latin-1) comparison path: compares level by level —
+     * primary, then (as enabled by strength/options) secondary, case,
+     * tertiary, quaternary, and finally an identical-level tiebreak —
+     * returning as soon as a level produces a difference.
+     * @param source source string
+     * @param target target string
+     * @param offset index of the first unmatched character
+     * @return negative/zero/positive as source sorts before/equal/after target
+     */
+    private final int compareRegular(String source, String target, int offset) {
+        // Lazily create the utility iterators/buffers on first use.
+        if (m_srcUtilIter_ == null) {
+            initUtility(true);
+        }
+        int strength = getStrength();
+        // setting up the collator parameters
+        m_utilCompare0_ = m_isCaseLevel_;
+        //m_utilCompare1_ = true;
+        m_utilCompare2_ = strength >= SECONDARY;
+        m_utilCompare3_ = strength >= TERTIARY;
+        m_utilCompare4_ = strength >= QUATERNARY;
+        m_utilCompare5_ = strength == IDENTICAL;
+        boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
+        boolean doShift4 = m_isAlternateHandlingShifted_ && m_utilCompare4_;
+        boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;
+
+        // Hiragana-quaternary combined with shifted handling is delegated to
+        // full sort-key comparison of the unmatched suffixes.
+        if (doHiragana4 && doShift4) {
+            String sourcesub = source.substring(offset);
+            String targetsub = target.substring(offset);
+            return compareBySortKeys(sourcesub, targetsub);
+        }
+
+        // This is the lowest primary value that will not be ignored if shifted
+        int lowestpvalue = m_isAlternateHandlingShifted_
+            ? m_variableTopValue_ << 16 : 0;
+        m_srcUtilCEBufferSize_ = 0;
+        m_tgtUtilCEBufferSize_ = 0;
+        int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
+                                      target, offset);
+        if (m_srcUtilCEBufferSize_ == -1
+            && m_tgtUtilCEBufferSize_ == -1) {
+            // since the cebuffer is cleared when we have determined that
+            // either source is greater than target or vice versa, the return
+            // result is the comparison result and not the hiragana result
+            return result;
+        }
+
+        int hiraganaresult = result;
+
+        if (m_utilCompare2_) {
+            result = doSecondaryCompare(doFrench);
+            if (result != 0) {
+                return result;
+            }
+        }
+        // doing the case bit
+        if (m_utilCompare0_) {
+            result = doCaseCompare();
+            if (result != 0) {
+                return result;
+            }
+        }
+        // Tertiary level
+        if (m_utilCompare3_) {
+            result = doTertiaryCompare();
+            if (result != 0) {
+                return result;
+            }
+        }
+
+        if (doShift4) { // checkQuad
+            result = doQuaternaryCompare(lowestpvalue);
+            if (result != 0) {
+                return result;
+            }
+        }
+        else if (doHiragana4 && hiraganaresult != 0) {
+            // If we're fine on quaternaries, we might be different
+            // on Hiragana. This, however, might fail us in shifted.
+            return hiraganaresult;
+        }
+
+        // For IDENTICAL comparisons, we use a bitwise character comparison
+        // as a tiebreaker if all else is equal.
+        // Getting here should be quite rare - strings are not identical -
+        // that is checked first, but compared == through all other checks.
+        if (m_utilCompare5_) {
+            return doIdenticalCompare(source, target, offset, true);
+        }
+        return 0;
+    }
+
    /**
     * Gets the 2 bytes of primary order and adds it to the primary byte array
     * (m_utilBytes1_), or to the quaternary byte array (m_utilBytes4_) when
     * the CE is being shifted.
     * @param ce current ce
     * @param notIsContinuation flag indicating if the current bytes belong to
     * a continuation ce
     * @param doShift flag indicating if ce is to be shifted
     * @param leadPrimary lead primary used for compression
     * @param commonBottom4 common byte value for Quaternary
     * @param bottomCount4 smallest byte value for Quaternary
     * @return the new lead primary for compression
     */
    private final int doPrimaryBytes(int ce, boolean notIsContinuation,
                                     boolean doShift, int leadPrimary,
                                     int commonBottom4, int bottomCount4)
    {
        // note: ce is truncated to its primary weight here (ce >>>= 16)
        int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
        int p1 = ce >>> 8; // comparison
        if (doShift) {
            if (m_utilCount4_ > 0) {
                // flush the pending run of common quaternary bytes first
                while (m_utilCount4_ > bottomCount4) {
                    m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                           (byte)(commonBottom4 + bottomCount4));
                    m_utilBytesCount4_ ++;
                    m_utilCount4_ -= bottomCount4;
                }
                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                       (byte)(commonBottom4
                                              + (m_utilCount4_ - 1)));
                m_utilBytesCount4_ ++;
                m_utilCount4_ = 0;
            }
            // dealing with a variable and we're treating them as shifted
            // This is a shifted ignorable
            if (p1 != 0) {
                // we need to check this since we could be in continuation
                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                       (byte)p1);
                m_utilBytesCount4_ ++;
            }
            if (p2 != 0) {
                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                       (byte)p2);
                m_utilBytesCount4_ ++;
            }
        }
        else {
            // Note: This code assumes that the table is well built
            // i.e. not having 0 bytes where they are not supposed to be.
            // Usually, we'll have non-zero primary1 & primary2, except
            // in cases of LatinOne and friends, when primary2 will be
            // regular and simple sortkey calc
            if (p1 != CollationElementIterator.IGNORABLE) {
                if (notIsContinuation) {
                    if (leadPrimary == p1) {
                        // same lead byte as the previous CE: compressed
                        // form, only the second primary byte is emitted
                        m_utilBytes1_ = append(m_utilBytes1_,
                                               m_utilBytesCount1_, (byte)p2);
                        m_utilBytesCount1_ ++;
                    }
                    else {
                        if (leadPrimary != 0) {
                            // terminate the previous compression run with
                            // a boundary byte that preserves ordering
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   ((p1 > leadPrimary)
                                                    ? BYTE_UNSHIFTED_MAX_
                                                    : BYTE_UNSHIFTED_MIN_));
                            m_utilBytesCount1_ ++;
                        }
                        if (p2 == CollationElementIterator.IGNORABLE) {
                            // one byter, not compressed
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   (byte)p1);
                            m_utilBytesCount1_ ++;
                            leadPrimary = 0;
                        }
                        else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_
                                 || (p1 > maxRegularPrimary
                                     //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]
                                     // >>> 24)
                                     && p1 < minImplicitPrimary
                                     //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]
                                     // >>> 24)
                                     )) {
                            // not compressible
                            leadPrimary = 0;
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   (byte)p1);
                            m_utilBytesCount1_ ++;
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   (byte)p2);
                            m_utilBytesCount1_ ++;
                        }
                        else { // compress
                            leadPrimary = p1;
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_,
                                                   (byte)p1);
                            m_utilBytesCount1_ ++;
                            m_utilBytes1_ = append(m_utilBytes1_,
                                                   m_utilBytesCount1_, (byte)p2);
                            m_utilBytesCount1_ ++;
                        }
                    }
                }
                else {
                    // continuation, add primary to the key, no compression
                    m_utilBytes1_ = append(m_utilBytes1_,
                                           m_utilBytesCount1_, (byte)p1);
                    m_utilBytesCount1_ ++;
                    if (p2 != CollationElementIterator.IGNORABLE) {
                        m_utilBytes1_ = append(m_utilBytes1_,
                                               m_utilBytesCount1_, (byte)p2);
                        // second part
                        m_utilBytesCount1_ ++;
                    }
                }
            }
        }
        return leadPrimary;
    }
+
    /**
     * Gets the secondary byte and adds it to the secondary byte array
     * (m_utilBytes2_). Runs of the common secondary weight are counted in
     * m_utilCount2_ and later written out run-length compressed.
     * @param ce current ce
     * @param notIsContinuation flag indicating if the current bytes belong to
     * a continuation ce
     * @param doFrench flag indicator if french sort is to be performed
     */
    private final void doSecondaryBytes(int ce, boolean notIsContinuation,
                                        boolean doFrench)
    {
        int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
        if (s != 0) {
            if (!doFrench) {
                // This is compression code.
                if (s == COMMON_2_ && notIsContinuation) {
                    m_utilCount2_ ++;
                }
                else {
                    if (m_utilCount2_ > 0) {
                        // flush the pending run of common bytes; the run is
                        // encoded downward from the top of the common range
                        // or upward from the bottom, depending on whether
                        // the next byte sorts above or below the common one
                        if (s > COMMON_2_) { // not necessary for 4th level.
                            while (m_utilCount2_ > TOP_COUNT_2_) {
                                m_utilBytes2_ = append(m_utilBytes2_,
                                        m_utilBytesCount2_,
                                        (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
                                m_utilBytesCount2_ ++;
                                m_utilCount2_ -= TOP_COUNT_2_;
                            }
                            m_utilBytes2_ = append(m_utilBytes2_,
                                                   m_utilBytesCount2_,
                                                   (byte)(COMMON_TOP_2_
                                                        - (m_utilCount2_ - 1)));
                            m_utilBytesCount2_ ++;
                        }
                        else {
                            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                                m_utilBytes2_ = append(m_utilBytes2_,
                                        m_utilBytesCount2_,
                                        (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
                                m_utilBytesCount2_ ++;
                                m_utilCount2_ -= BOTTOM_COUNT_2_;
                            }
                            m_utilBytes2_ = append(m_utilBytes2_,
                                                   m_utilBytesCount2_,
                                                   (byte)(COMMON_BOTTOM_2_
                                                        + (m_utilCount2_ - 1)));
                            m_utilBytesCount2_ ++;
                        }
                        m_utilCount2_ = 0;
                    }
                    m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
                                           (byte)s);
                    m_utilBytesCount2_ ++;
                }
            }
            else {
                m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
                                       (byte)s);
                m_utilBytesCount2_ ++;
                // Do the special handling for French secondaries
                // We need to get continuation elements and do intermediate
                // restore
                // abc1c2c3de with french secondaries need to be edc1c2c3ba
                // NOT edc3c2c1ba
                if (notIsContinuation) {
                    if (m_utilFrenchStart_ != -1) {
                        // reverse secondaries from frenchStartPtr up to
                        // frenchEndPtr
                        reverseBuffer(m_utilBytes2_);
                        m_utilFrenchStart_ = -1;
                    }
                }
                else {
                    // inside a continuation: remember the range so that
                    // only the non-continuation parts end up reversed
                    if (m_utilFrenchStart_ == -1) {
                        m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
                    }
                    m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
                }
            }
        }
    }
+
+ /**
+ * Reverse the argument buffer
+ * @param buffer to reverse
+ */
+ private void reverseBuffer(byte buffer[])
+ {
+ int start = m_utilFrenchStart_;
+ int end = m_utilFrenchEnd_;
+ while (start < end) {
+ byte b = buffer[start];
+ buffer[start ++] = buffer[end];
+ buffer[end --] = b;
+ }
+ }
+
+ /**
+ * Insert the case shifting byte if required
+ * @param caseshift value
+ * @return new caseshift value
+ */
+ private final int doCaseShift(int caseshift)
+ {
+ if (caseshift == 0) {
+ m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
+ SORT_CASE_BYTE_START_);
+ m_utilBytesCount0_ ++;
+ caseshift = SORT_CASE_SHIFT_START_;
+ }
+ return caseshift;
+ }
+
    /**
     * Performs the casing sort: packs case bits for the current CE into the
     * case-level byte buffer m_utilBytes0_.
     * @param tertiary byte in ints for easy comparison
     * @param notIsContinuation flag indicating if the current bytes belong to
     * a continuation ce
     * @param caseshift number of bits still free in the current case byte
     * @return the new value of case shift
     */
    private final int doCaseBytes(int tertiary, boolean notIsContinuation,
                                  int caseshift)
    {
        // make sure there is a case byte with at least one free bit
        caseshift = doCaseShift(caseshift);

        if (notIsContinuation && tertiary != 0) {
            // the top two bits of the tertiary weight carry the case bits
            byte casebits = (byte)(tertiary & 0xC0);
            if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
                if (casebits == 0) {
                    m_utilBytes0_[m_utilBytesCount0_ - 1]
                                                    |= (1 << (-- caseshift));
                }
                else {
                    // second bit
                    caseshift = doCaseShift(caseshift - 1);
                    m_utilBytes0_[m_utilBytesCount0_ - 1]
                                    |= ((casebits >> 6) & 1) << (-- caseshift);
                }
            }
            else {
                if (casebits != 0) {
                    m_utilBytes0_[m_utilBytesCount0_ - 1]
                                                        |= 1 << (-- caseshift);
                    // second bit
                    caseshift = doCaseShift(caseshift);
                    m_utilBytes0_[m_utilBytesCount0_ - 1]
                                    |= ((casebits >> 7) & 1) << (-- caseshift);
                }
                else {
                    caseshift --;
                }
            }
        }

        return caseshift;
    }
+
    /**
     * Gets the tertiary byte and adds it to the tertiary byte array
     * (m_utilBytes3_); runs of the common tertiary weight are counted in
     * m_utilCount3_ and written out run-length compressed.
     * @param tertiary byte in int for easy comparison
     * @param notIsContinuation flag indicating if the current bytes belong to
     * a continuation ce
     */
    private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)
    {
        if (tertiary != 0) {
            // This is compression code.
            // sequence size check is included in the if clause
            if (tertiary == m_common3_ && notIsContinuation) {
                m_utilCount3_ ++;
            }
            else {
                int common3 = m_common3_ & LAST_BYTE_MASK_;
                // shift non-common weights away from the common range; the
                // direction depends on the case-first setting (m_common3_)
                if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
                    tertiary += m_addition3_;
                }
                else if (tertiary <= common3
                         && m_common3_ == COMMON_UPPER_FIRST_3_) {
                    tertiary -= m_addition3_;
                }
                if (m_utilCount3_ > 0) {
                    // flush the pending run of common bytes, encoded from
                    // the top or the bottom depending on the next byte
                    if (tertiary > common3) {
                        while (m_utilCount3_ > m_topCount3_) {
                            m_utilBytes3_ = append(m_utilBytes3_,
                                        m_utilBytesCount3_,
                                        (byte)(m_top3_ - m_topCount3_));
                            m_utilBytesCount3_ ++;
                            m_utilCount3_ -= m_topCount3_;
                        }
                        m_utilBytes3_ = append(m_utilBytes3_,
                                               m_utilBytesCount3_,
                                               (byte)(m_top3_
                                                      - (m_utilCount3_ - 1)));
                        m_utilBytesCount3_ ++;
                    }
                    else {
                        while (m_utilCount3_ > m_bottomCount3_) {
                            m_utilBytes3_ = append(m_utilBytes3_,
                                        m_utilBytesCount3_,
                                        (byte)(m_bottom3_ + m_bottomCount3_));
                            m_utilBytesCount3_ ++;
                            m_utilCount3_ -= m_bottomCount3_;
                        }
                        m_utilBytes3_ = append(m_utilBytes3_,
                                               m_utilBytesCount3_,
                                               (byte)(m_bottom3_
                                                      + (m_utilCount3_ - 1)));
                        m_utilBytesCount3_ ++;
                    }
                    m_utilCount3_ = 0;
                }
                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                       (byte)tertiary);
                m_utilBytesCount3_ ++;
            }
        }
    }
+
    /**
     * Gets the Quaternary byte and adds it to the Quaternary byte array
     * (m_utilBytes4_). Non-Hiragana code points only extend the pending
     * run of common bytes; a Hiragana code point flushes the run and
     * emits the hiragana4 marker byte.
     * @param isCodePointHiragana flag indicator if the previous codepoint
     * we dealt with was Hiragana
     * @param commonBottom4 smallest common Quaternary byte
     * @param bottomCount4 smallest Quaternary byte
     * @param hiragana4 hiragana Quaternary byte
     */
    private final void doQuaternaryBytes(boolean isCodePointHiragana,
                                         int commonBottom4, int bottomCount4,
                                         byte hiragana4)
    {
        if (isCodePointHiragana) { // This was Hiragana, need to note it
            if (m_utilCount4_ > 0) { // Close this part
                while (m_utilCount4_ > bottomCount4) {
                    m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                           (byte)(commonBottom4
                                                  + bottomCount4));
                    m_utilBytesCount4_ ++;
                    m_utilCount4_ -= bottomCount4;
                }
                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                       (byte)(commonBottom4
                                              + (m_utilCount4_ - 1)));
                m_utilBytesCount4_ ++;
                m_utilCount4_ = 0;
            }
            m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                   hiragana4); // Add the Hiragana
            m_utilBytesCount4_ ++;
        }
        else { // This wasn't Hiragana, so we can continue adding stuff
            m_utilCount4_ ++;
        }
    }
+
    /**
     * Iterates through the argument string for all ces.
     * Split the ces into their relevant primaries, secondaries etc. and
     * collect them into the per-strength byte buffers (m_utilBytes0_..4_).
     * Temporarily switches decomposition off; restores it before returning.
     * @param source normalized string
     * @param doFrench flag indicator if special handling of French has to be
     * done
     * @param hiragana4 offset for Hiragana quaternary
     * @param commonBottom4 smallest common quaternary byte
     * @param bottomCount4 smallest quaternary byte
     */
    private final void getSortKeyBytes(String source, boolean doFrench,
                                       byte hiragana4, int commonBottom4,
                                       int bottomCount4)

    {
        if (m_srcUtilIter_ == null) {
            initUtility(true);
        }
        int backupDecomposition = getDecomposition();
        setDecomposition(NO_DECOMPOSITION); // have to revert to backup later
        m_srcUtilIter_.setText(source);
        m_srcUtilColEIter_.setText(m_srcUtilIter_);
        m_utilFrenchStart_ = -1;
        m_utilFrenchEnd_ = -1;

        // scriptorder not implemented yet
        // const uint8_t *scriptOrder = coll->scriptOrder;

        boolean doShift = false;
        boolean notIsContinuation = false;

        int leadPrimary = 0; // int for easier comparison
        int caseShift = 0;

        while (true) {
            int ce = m_srcUtilColEIter_.next();
            if (ce == CollationElementIterator.NULLORDER) {
                break;
            }

            if (ce == CollationElementIterator.IGNORABLE) {
                continue;
            }

            notIsContinuation = !isContinuation(ce);

            /*
             * if (notIsContinuation) {
                    if (scriptOrder != NULL) {
                        primary1 = scriptOrder[primary1];
                    }
                }*/
            boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
            // actually we can just check that the first byte is 0
            // generation stuffs the order left first
            boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)
                                               <= m_variableTopValue_;
            doShift = (m_isAlternateHandlingShifted_
                       && ((notIsContinuation && isSmallerThanVariableTop
                            && !isPrimaryByteIgnorable) // primary byte not 0
                           || (!notIsContinuation && doShift))
                       || (doShift && isPrimaryByteIgnorable));
            if (doShift && isPrimaryByteIgnorable) {
                // amendment to the UCA says that primary ignorables and other
                // ignorables should be removed if following a shifted code
                // point
                // if we were shifted and we got an ignorable code point
                // we should just completely ignore it
                continue;
            }
            leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,
                                         leadPrimary, commonBottom4,
                                         bottomCount4);
            if (doShift) {
                // shifted CEs contribute only to the quaternary level
                continue;
            }
            if (m_utilCompare2_) {
                doSecondaryBytes(ce, notIsContinuation, doFrench);
            }

            int t = ce & LAST_BYTE_MASK_;
            if (!notIsContinuation) {
                t = ce & CE_REMOVE_CONTINUATION_MASK_;
            }

            if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
                // do the case level if we need to do it. We don't want to calculate
                // case level for primary ignorables if we have only primary strength and case level
                // otherwise we would break well formedness of CEs
                caseShift = doCaseBytes(t, notIsContinuation, caseShift);
            }
            else if (notIsContinuation) {
                t ^= m_caseSwitch_;
            }

            t &= m_mask3_;

            if (m_utilCompare3_) {
                doTertiaryBytes(t, notIsContinuation);
            }

            if (m_utilCompare4_ && notIsContinuation) { // compare quad
                doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,
                                  commonBottom4, bottomCount4, hiragana4);
            }
        }
        setDecomposition(backupDecomposition); // reverts to original
        if (m_utilFrenchStart_ != -1) {
            // one last round of checks
            reverseBuffer(m_utilBytes2_);
        }
    }
+
    /**
     * From the individual strength byte results the final compact sortkey
     * will be calculated, appending each level (separated by level
     * terminators) to m_utilBytes1_ and finishing with a 0 byte.
     * Note the nesting: quaternary is only added when tertiary is, and
     * identical only when quaternary is, matching the strength ordering.
     * @param source text string
     * @param doFrench flag indicating that special handling of French has to
     * be done
     * @param commonBottom4 smallest common quaternary byte
     * @param bottomCount4 smallest quaternary byte
     * @param key output RawCollationKey to store results, key cannot be null
     */
    private final void getSortKey(String source, boolean doFrench,
                                  int commonBottom4,
                                  int bottomCount4,
                                  RawCollationKey key)
    {
        // we have done all the CE's, now let's put them together to form
        // a key
        if (m_utilCompare2_) {
            doSecondary(doFrench);
        }
        // adding case level should be independent of secondary level
        if (m_utilCompare0_) {
            doCase();
        }
        if (m_utilCompare3_) {
            doTertiary();
            if (m_utilCompare4_) {
                doQuaternary(commonBottom4, bottomCount4);
                if (m_utilCompare5_) {
                    doIdentical(source);
                }

            }
        }
        // terminating null byte
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);
        m_utilBytesCount1_ ++;

        key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
    }
+
    /**
     * Packs the French bytes: copies the secondary buffer m_utilBytes2_ into
     * the primary buffer m_utilBytes1_ in reverse order, applying the same
     * run-length compression of common secondary bytes as the forward path.
     */
    private final void doFrench()
    {
        for (int i = 0; i < m_utilBytesCount2_; i ++) {
            // iterate the secondary bytes back to front
            byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
            // This is compression code.
            if (s == COMMON_2_) {
                ++ m_utilCount2_;
            }
            else {
                if (m_utilCount2_ > 0) {
                    // getting the unsigned value
                    if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
                        // not necessary for 4th level.
                        while (m_utilCount2_ > TOP_COUNT_2_) {
                            m_utilBytes1_ = append(m_utilBytes1_,
                                           m_utilBytesCount1_,
                                           (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
                            m_utilBytesCount1_ ++;
                            m_utilCount2_ -= TOP_COUNT_2_;
                        }
                        m_utilBytes1_ = append(m_utilBytes1_,
                                               m_utilBytesCount1_,
                                               (byte)(COMMON_TOP_2_
                                                      - (m_utilCount2_ - 1)));
                        m_utilBytesCount1_ ++;
                    }
                    else {
                        while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                            m_utilBytes1_ = append(m_utilBytes1_,
                                           m_utilBytesCount1_,
                                           (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
                            m_utilBytesCount1_ ++;
                            m_utilCount2_ -= BOTTOM_COUNT_2_;
                        }
                        m_utilBytes1_ = append(m_utilBytes1_,
                                               m_utilBytesCount1_,
                                               (byte)(COMMON_BOTTOM_2_
                                                      + (m_utilCount2_ - 1)));
                        m_utilBytesCount1_ ++;
                    }
                    m_utilCount2_ = 0;
                }
                m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
                m_utilBytesCount1_ ++;
            }
        }
        // flush any run of common bytes left at the end
        if (m_utilCount2_ > 0) {
            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                                       (byte)(COMMON_BOTTOM_2_
                                              + BOTTOM_COUNT_2_));
                m_utilBytesCount1_ ++;
                m_utilCount2_ -= BOTTOM_COUNT_2_;
            }
            m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                                   (byte)(COMMON_BOTTOM_2_
                                          + (m_utilCount2_ - 1)));
            m_utilBytesCount1_ ++;
        }
    }
+
    /**
     * Compacts the secondary bytes and stores them into the primary array,
     * preceded by a level terminator. Flushes any pending run of common
     * secondary bytes first; for French, the bytes are copied reversed.
     * @param doFrench flag indicator that French has to be handled specially
     */
    private final void doSecondary(boolean doFrench)
    {
        if (m_utilCount2_ > 0) {
            // flush the trailing run of common secondary bytes
            while (m_utilCount2_ > BOTTOM_COUNT_2_) {
                m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
                                       (byte)(COMMON_BOTTOM_2_
                                              + BOTTOM_COUNT_2_));
                m_utilBytesCount2_ ++;
                m_utilCount2_ -= BOTTOM_COUNT_2_;
            }
            m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
                                   (byte)(COMMON_BOTTOM_2_ +
                                          (m_utilCount2_ - 1)));
            m_utilBytesCount2_ ++;
        }

        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;

        if (doFrench) { // do the reverse copy
            doFrench();
        }
        else {
            if (m_utilBytes1_.length <= m_utilBytesCount1_
                                        + m_utilBytesCount2_) {
                m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                         m_utilBytesCount2_);
            }
            System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
                             m_utilBytesCount1_, m_utilBytesCount2_);
            m_utilBytesCount1_ += m_utilBytesCount2_;
        }
    }
+
+ /**
+ * Increase buffer size
+ * @param buffer array of bytes
+ * @param size of the byte array
+ * @param incrementsize size to increase
+ * @return the new buffer
+ */
+ private static final byte[] increase(byte buffer[], int size,
+ int incrementsize)
+ {
+ byte result[] = new byte[buffer.length + incrementsize];
+ System.arraycopy(buffer, 0, result, 0, size);
+ return result;
+ }
+
+ /**
+ * Increase buffer size
+ * @param buffer array of ints
+ * @param size of the byte array
+ * @param incrementsize size to increase
+ * @return the new buffer
+ */
+ private static final int[] increase(int buffer[], int size,
+ int incrementsize)
+ {
+ int result[] = new int[buffer.length + incrementsize];
+ System.arraycopy(buffer, 0, result, 0, size);
+ return result;
+ }
+
+ /**
+ * Compacts the case bytes and stores them into the primary array
+ */
+ private final void doCase()
+ {
+ m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
+ SORT_LEVEL_TERMINATOR_);
+ m_utilBytesCount1_ ++;
+ if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
+ m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
+ m_utilBytesCount0_);
+ }
+ System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
+ m_utilBytesCount0_);
+ m_utilBytesCount1_ += m_utilBytesCount0_;
+ }
+
    /**
     * Compacts the tertiary bytes and stores them into the primary array,
     * preceded by a level terminator. Any pending run of common tertiary
     * bytes is flushed first; the encoding direction depends on the
     * case-first setting (m_common3_).
     */
    private final void doTertiary()
    {
        if (m_utilCount3_ > 0) {
            if (m_common3_ != COMMON_BOTTOM_3_) {
                while (m_utilCount3_ >= m_topCount3_) {
                    m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                           (byte)(m_top3_ - m_topCount3_));
                    m_utilBytesCount3_ ++;
                    m_utilCount3_ -= m_topCount3_;
                }
                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                       (byte)(m_top3_ - m_utilCount3_));
                m_utilBytesCount3_ ++;
            }
            else {
                while (m_utilCount3_ > m_bottomCount3_) {
                    m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                           (byte)(m_bottom3_
                                                  + m_bottomCount3_));
                    m_utilBytesCount3_ ++;
                    m_utilCount3_ -= m_bottomCount3_;
                }
                m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
                                       (byte)(m_bottom3_
                                              + (m_utilCount3_ - 1)));
                m_utilBytesCount3_ ++;
            }
        }
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     m_utilBytesCount3_);
        }
        System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
                         m_utilBytesCount3_);
        m_utilBytesCount1_ += m_utilBytesCount3_;
    }
+
    /**
     * Compacts the quaternary bytes and stores them into the primary array,
     * preceded by a level terminator. Any pending run of common quaternary
     * bytes is flushed first.
     * @param commonbottom4 smallest common quaternary byte
     * @param bottomcount4 smallest quaternary byte
     */
    private final void doQuaternary(int commonbottom4, int bottomcount4)
    {
        if (m_utilCount4_ > 0) {
            while (m_utilCount4_ > bottomcount4) {
                m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                       (byte)(commonbottom4 + bottomcount4));
                m_utilBytesCount4_ ++;
                m_utilCount4_ -= bottomcount4;
            }
            m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
                                   (byte)(commonbottom4
                                          + (m_utilCount4_ - 1)));
            m_utilBytesCount4_ ++;
        }
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     m_utilBytesCount4_);
        }
        System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
                         m_utilBytesCount4_);
        m_utilBytesCount1_ += m_utilBytesCount4_;
    }
+
    /**
     * Deals with the identical sort.
     * Appends the BOCSU version of the source string to the ends of the
     * byte buffer, preceded by a level terminator.
     * @param source text string
     */
    private final void doIdentical(String source)
    {
        int isize = BOCU.getCompressionLength(source);
        m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
                               SORT_LEVEL_TERMINATOR_);
        m_utilBytesCount1_ ++;
        if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
            // the extra 1 presumably leaves room for the final 0 byte
            // appended by getSortKey - TODO(review) confirm
            m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
                                     1 + isize);
        }
        m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
                                           m_utilBytesCount1_);
    }
+
    /**
     * Gets the offset of the first unmatched characters in source and target.
     * This method returns the offset of the start of a contraction or a
     * combining sequence, if the first difference is in the middle of such a
     * sequence.
     * @param source string
     * @param target string
     * @return offset of the first unmatched characters in source and target.
     */
    private final int getFirstUnmatchedOffset(String source, String target)
    {
        int result = 0;
        int slength = source.length();
        int tlength = target.length();
        int minlength = slength;
        if (minlength > tlength) {
            minlength = tlength;
        }
        // advance past the common prefix
        while (result < minlength
               && source.charAt(result) == target.charAt(result)) {
            result ++;
        }
        if (result > 0) {
            // There is an identical portion at the beginning of the two
            // strings. If the identical portion ends within a contraction or a
            // combining character sequence, back up to the start of that
            // sequence.
            char schar = 0;
            char tchar = 0;
            if (result < minlength) {
                schar = source.charAt(result); // first differing chars
                tchar = target.charAt(result);
            }
            else {
                // one string is a prefix of the other; inspect the last
                // common character and, if needed, the longer string's
                // first extra character
                schar = source.charAt(minlength - 1);
                if (isUnsafe(schar)) {
                    tchar = schar;
                }
                else if (slength == tlength) {
                    return result;
                }
                else if (slength < tlength) {
                    tchar = target.charAt(result);
                }
                else {
                    schar = source.charAt(result);
                }
            }
            if (isUnsafe(schar) || isUnsafe(tchar))
            {
                // We are stopped in the middle of a contraction or combining
                // sequence.
                // Look backwards for the part of the string for the start of
                // the sequence
                // It doesn't matter which string we scan, since they are the
                // same in this region.
                do {
                    result --;
                }
                while (result > 0 && isUnsafe(source.charAt(result)));
            }
        }
        return result;
    }
+
+ /**
+ * Appending an byte to an array of bytes and increases it if we run out of
+ * space
+ * @param array of byte arrays
+ * @param appendindex index in the byte array to append
+ * @param value to append
+ * @return array if array size can accomodate the new value, otherwise
+ * a bigger array will be created and returned
+ */
+ private static final byte[] append(byte array[], int appendindex,
+ byte value)
+ {
+ try {
+ array[appendindex] = value;
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
+ array[appendindex] = value;
+ }
+ return array;
+ }
+
    /**
     * This is a trick string compare function that goes in and uses sortkeys
     * to compare. It is used when compare gets in trouble and needs to bail
     * out. The source key is cached in m_utilRawCollationKey_ for reuse.
     * @param source text string
     * @param target text string
     * @return negative, zero or positive value corresponding to source
     *         being smaller than, equal to, or greater than target
     */
    private final int compareBySortKeys(String source, String target)

    {
        m_utilRawCollationKey_ = getRawCollationKey(source,
                                                    m_utilRawCollationKey_);
        // this method is very seldom called
        RawCollationKey targetkey = getRawCollationKey(target, null);
        return m_utilRawCollationKey_.compareTo(targetkey);
    }
+
    /**
     * Performs the primary comparisons, and fills up the CE buffer at the
     * same time.
     * The return value toggles between the comparison result and the hiragana
     * result. If either the source is greater than target or vice versa, the
     * return result is the comparison result, ie 1 or -1, furthermore the
     * cebuffers will be cleared when that happens. If the primary comparisons
     * are equal, we'll have to continue with secondary comparison. In this case
     * the cebuffer will not be cleared and the return result will be the
     * hiragana result.
     * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
     * observed
     * @param lowestpvalue the lowest primary value that will not be ignored if
     * alternate handling is shifted
     * @param source text string
     * @param target text string
     * @param textoffset offset in text to start the comparison
     * @return comparion result if a primary difference is found, otherwise
     * hiragana result
     */
    private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
                                       String source, String target,
                                       int textoffset)

    {
        // Preparing the context objects for iterating over strings
        m_srcUtilIter_.setText(source);
        m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
        m_tgtUtilIter_.setText(target);
        m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);

        // Non shifted primary processing is quite simple
        if (!m_isAlternateHandlingShifted_) {
            int hiraganaresult = 0;
            while (true) {
                int sorder = 0;
                // We fetch CEs until we hit a non ignorable primary or end.
                do {
                    sorder = m_srcUtilColEIter_.next();
                    m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
                                                m_srcUtilCEBufferSize_, sorder);
                    m_srcUtilCEBufferSize_ ++;
                    sorder &= CE_PRIMARY_MASK_;
                } while (sorder == CollationElementIterator.IGNORABLE);

                int torder = 0;
                do {
                    torder = m_tgtUtilColEIter_.next();
                    m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
                                                m_tgtUtilCEBufferSize_, torder);
                    m_tgtUtilCEBufferSize_ ++;
                    torder &= CE_PRIMARY_MASK_;
                } while (torder == CollationElementIterator.IGNORABLE);

                // if both primaries are the same
                if (sorder == torder) {
                    // and there are no more CEs, we advance to the next level
                    // see if we are at the end of either string
                    if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                        if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
                            != CollationElementIterator.NULLORDER) {
                            // source ended first: source < target
                            return -1;
                        }
                        break;
                    }
                    else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
                             == CollationElementIterator.NULLORDER) {
                        // target ended first: source > target
                        return 1;
                    }
                    // remember the first hiragana difference as a tiebreak
                    if (doHiragana4 && hiraganaresult == 0
                        && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
                        m_tgtUtilColEIter_.m_isCodePointHiragana_) {
                        if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
                            hiraganaresult = -1;
                        }
                        else {
                            hiraganaresult = 1;
                        }
                    }
                }
                else {
                    // if two primaries are different, we are done
                    return endPrimaryCompare(sorder, torder);
                }
            }
            // no primary difference... do the rest from the buffers
            return hiraganaresult;
        }
        else { // shifted - do a slightly more complicated processing :)
            while (true) {
                int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
                                                        lowestpvalue, true);
                int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
                                                        lowestpvalue, false);
                if (sorder == torder) {
                    if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
                        == CollationElementIterator.NULLORDER) {
                        break;
                    }
                    else {
                        continue;
                    }
                }
                else {
                    return endPrimaryCompare(sorder, torder);
                }
            } // no primary difference... do the rest from the buffers
        }
        return 0;
    }
+
+ /**
+ * This is used only for primary strength when we know that sorder is
+ * already different from torder.
+ * Compares sorder and torder, returns -1 if sorder is less than torder.
+ * Clears the cebuffer at the same time.
+ * @param sorder source strength order
+ * @param torder target strength order
+ * @return the comparison result of sorder and torder
+ */
+ private final int endPrimaryCompare(int sorder, int torder)
+ {
+ // if we reach here, the ce offset accessed is the last ce
+ // appended to the buffer
+ boolean isSourceNullOrder = (m_srcUtilCEBuffer_[
+ m_srcUtilCEBufferSize_ - 1]
+ == CollationElementIterator.NULLORDER);
+ boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[
+ m_tgtUtilCEBufferSize_ - 1]
+ == CollationElementIterator.NULLORDER);
+ m_srcUtilCEBufferSize_ = -1;
+ m_tgtUtilCEBufferSize_ = -1;
+ if (isSourceNullOrder) {
+ return -1;
+ }
+ if (isTargetNullOrder) {
+ return 1;
+ }
+ // getting rid of the sign
+ sorder >>>= CE_PRIMARY_SHIFT_;
+ torder >>>= CE_PRIMARY_SHIFT_;
+ if (sorder < torder) {
+ return -1;
+ }
+ return 1;
+ }
+
    /**
     * Calculates the next primary shifted value and fills up cebuffer with the
     * next non-ignorable ce.
     * @param coleiter collation element iterator to fetch CEs from
     * @param lowestpvalue lowest primary shifted value that will not be
     * ignored
     * @param isSrc true to fill the source CE buffer, false for the target
     * @return result next modified ce (primary weight only)
     */
    private final int getPrimaryShiftedCompareCE(
                                        CollationElementIterator coleiter,
                                        int lowestpvalue, boolean isSrc)

    {
        boolean shifted = false;
        int result = CollationElementIterator.IGNORABLE;
        // work on a local view of the right buffer; written back at the end
        int cebuffer[] = m_srcUtilCEBuffer_;
        int cebuffersize = m_srcUtilCEBufferSize_;
        if (!isSrc) {
            cebuffer = m_tgtUtilCEBuffer_;
            cebuffersize = m_tgtUtilCEBufferSize_;
        }
        while (true) {
            result = coleiter.next();
            if (result == CollationElementIterator.NULLORDER) {
                cebuffer = append(cebuffer, cebuffersize, result);
                cebuffersize ++;
                break;
            }
            else if (result == CollationElementIterator.IGNORABLE
                     || (shifted
                         && (result & CE_PRIMARY_MASK_)
                         == CollationElementIterator.IGNORABLE)) {
                // UCA amendment - ignore ignorables that follow shifted code
                // points
                continue;
            }
            else if (isContinuation(result)) {
                if ((result & CE_PRIMARY_MASK_)
                    != CollationElementIterator.IGNORABLE) {
                    // There is primary value
                    if (shifted) {
                        result = (result & CE_PRIMARY_MASK_)
                                 | CE_CONTINUATION_MARKER_;
                        // preserve interesting continuation
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        continue;
                    }
                    else {
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        break;
                    }
                }
                else { // Just lower level values
                    if (!shifted) {
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                    }
                }
            }
            else { // regular
                if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
                                            lowestpvalue) > 0) {
                    // above the variable top: a real primary; stop here
                    cebuffer = append(cebuffer, cebuffersize, result);
                    cebuffersize ++;
                    break;
                }
                else {
                    if ((result & CE_PRIMARY_MASK_) != 0) {
                        // variable CE: keep only its primary and mark
                        // subsequent ignorables as shifted
                        shifted = true;
                        result &= CE_PRIMARY_MASK_;
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        continue;
                    }
                    else {
                        cebuffer = append(cebuffer, cebuffersize, result);
                        cebuffersize ++;
                        shifted = false;
                        continue;
                    }
                }
            }
        }
        // write the possibly re-allocated buffer and its size back
        if (isSrc) {
            m_srcUtilCEBuffer_ = cebuffer;
            m_srcUtilCEBufferSize_ = cebuffersize;
        }
        else {
            m_tgtUtilCEBuffer_ = cebuffer;
            m_tgtUtilCEBufferSize_ = cebuffersize;
        }
        result &= CE_PRIMARY_MASK_;
        return result;
    }
+
+ /**
+ * Appends an int to an array of ints, growing the array if we run out of
+ * space.
+ * @param array int array to append to
+ * @param appendindex index at which value will be appended
+ * @param value value to append
+ * @return array if size is not increased, otherwise a new array will be
+ * returned
+ */
+ private static final int[] append(int array[], int appendindex, int value)
+ {
+ // note: also grows when appendindex is the last valid slot
+ // (appendindex == length - 1) — one element eager, but safe
+ if (appendindex + 1 >= array.length) {
+ array = increase(array, appendindex, CE_BUFFER_SIZE_);
+ }
+ array[appendindex] = value;
+ return array;
+ }
+
+ /**
+ * Does secondary strength comparison based on the collected ces.
+ * Assumes the utility CE buffers were filled by the primary pass and end
+ * with a NULLORDER sentinel.
+ * @param doFrench flag indicates if French ordering is to be done
+ * @return the secondary strength comparison result
+ */
+ private final int doSecondaryCompare(boolean doFrench)
+ {
+ // now, we're gonna reexamine collected CEs
+ if (!doFrench) { // normal
+ int soffset = 0;
+ int toffset = 0;
+ while (true) {
+ // skip loops terminate because NULLORDER masks to a
+ // non-ignorable secondary — TODO(review) confirm
+ // NULLORDER & CE_SECONDARY_MASK_ != 0
+ int sorder = CollationElementIterator.IGNORABLE;
+ while (sorder == CollationElementIterator.IGNORABLE) {
+ sorder = m_srcUtilCEBuffer_[soffset ++]
+ & CE_SECONDARY_MASK_;
+ }
+ int torder = CollationElementIterator.IGNORABLE;
+ while (torder == CollationElementIterator.IGNORABLE) {
+ torder = m_tgtUtilCEBuffer_[toffset ++]
+ & CE_SECONDARY_MASK_;
+ }
+
+ if (sorder == torder) {
+ // equal secondaries: check whether either side hit the
+ // end-of-buffer sentinel
+ if (m_srcUtilCEBuffer_[soffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1]
+ != CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ break;
+ }
+ else if (m_tgtUtilCEBuffer_[toffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ }
+ else {
+ if (m_srcUtilCEBuffer_[soffset - 1] ==
+ CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ if (m_tgtUtilCEBuffer_[toffset - 1] ==
+ CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ return (sorder < torder) ? -1 : 1;
+ }
+ }
+ }
+ else { // do the French
+ // iterate the CE buffers backwards, starting just before the
+ // trailing NULLORDER sentinel (hence size - 2)
+ m_srcUtilContOffset_ = 0;
+ m_tgtUtilContOffset_ = 0;
+ m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
+ m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
+ while (true) {
+ int sorder = getSecondaryFrenchCE(true);
+ int torder = getSecondaryFrenchCE(false);
+ if (sorder == torder) {
+ if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
+ || (m_srcUtilOffset_ >= 0
+ && m_srcUtilCEBuffer_[m_srcUtilOffset_]
+ == CollationElementIterator.NULLORDER)) {
+ break;
+ }
+ }
+ else {
+ return (sorder < torder) ? -1 : 1;
+ }
+ }
+ }
+ return 0;
+ }
+
+ /**
+ * Calculates the next secondary french CE.
+ * Walks the collected CE buffer backwards (French secondary order), but
+ * continuation sequences must still be read forwards, so the position
+ * before a continuation run is parked in the *ContOffset_ field while the
+ * run is traversed left-to-right.
+ * @param isSrc flag indicator if we are calculating the src ces
+ * @return result next modified ce
+ */
+ private final int getSecondaryFrenchCE(boolean isSrc)
+ {
+ int result = CollationElementIterator.IGNORABLE;
+ int offset = m_srcUtilOffset_;
+ int continuationoffset = m_srcUtilContOffset_;
+ int cebuffer[] = m_srcUtilCEBuffer_;
+ if (!isSrc) {
+ offset = m_tgtUtilOffset_;
+ continuationoffset = m_tgtUtilContOffset_;
+ cebuffer = m_tgtUtilCEBuffer_;
+ }
+
+ while (result == CollationElementIterator.IGNORABLE
+ && offset >= 0) {
+ if (continuationoffset == 0) {
+ result = cebuffer[offset];
+ // back up over any continuation run preceding this CE
+ while (isContinuation(cebuffer[offset --])){
+ }
+ // after this, sorder is at the start of continuation,
+ // and offset points before that
+ if (isContinuation(cebuffer[offset + 1])) {
+ // save offset for later
+ continuationoffset = offset;
+ offset += 2;
+ }
+ }
+ else {
+ // inside a continuation run: consume it forwards
+ result = cebuffer[offset ++];
+ if (!isContinuation(result)) {
+ // we have finished with this continuation
+ offset = continuationoffset;
+ // reset the pointer to before continuation
+ continuationoffset = 0;
+ continue;
+ }
+ }
+ result &= CE_SECONDARY_MASK_; // remove continuation bit
+ }
+ if (isSrc) {
+ m_srcUtilOffset_ = offset;
+ m_srcUtilContOffset_ = continuationoffset;
+ }
+ else {
+ m_tgtUtilOffset_ = offset;
+ m_tgtUtilContOffset_ = continuationoffset;
+ }
+ return result;
+ }
+
+ /**
+ * Does case strength comparison based on the collected ces.
+ * Assumes the utility CE buffers end with a NULLORDER sentinel.
+ * @return the case strength comparison result
+ */
+ private final int doCaseCompare()
+ {
+ int soffset = 0;
+ int toffset = 0;
+ while (true) {
+ int sorder = CollationElementIterator.IGNORABLE;
+ int torder = CollationElementIterator.IGNORABLE;
+ while ((sorder & CE_REMOVE_CASE_)
+ == CollationElementIterator.IGNORABLE) {
+ sorder = m_srcUtilCEBuffer_[soffset ++];
+ if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
+ // primary ignorables should not be considered on the case level when the strength is primary
+ // otherwise, the CEs stop being well-formed
+ sorder &= CE_CASE_MASK_3_;
+ sorder ^= m_caseSwitch_;
+ }
+ else {
+ sorder = CollationElementIterator.IGNORABLE;
+ }
+ }
+
+ while ((torder & CE_REMOVE_CASE_)
+ == CollationElementIterator.IGNORABLE) {
+ torder = m_tgtUtilCEBuffer_[toffset ++];
+ if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
+ // primary ignorables should not be considered on the case level when the strength is primary
+ // otherwise, the CEs stop being well-formed
+ torder &= CE_CASE_MASK_3_;
+ torder ^= m_caseSwitch_;
+ }
+ else {
+ torder = CollationElementIterator.IGNORABLE;
+ }
+ }
+
+ sorder &= CE_CASE_BIT_MASK_;
+ torder &= CE_CASE_BIT_MASK_;
+ if (sorder == torder) {
+ // checking end of strings
+ if (m_srcUtilCEBuffer_[soffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1]
+ != CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ break;
+ }
+ else if (m_tgtUtilCEBuffer_[toffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ }
+ else {
+ if (m_srcUtilCEBuffer_[soffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ // fix: index the target buffer with toffset, not soffset.
+ // The two buffers advance independently, so using the source
+ // offset here could test the wrong element or read out of
+ // bounds (every sibling do*Compare method uses toffset - 1).
+ if (m_tgtUtilCEBuffer_[toffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ return (sorder < torder) ? -1 : 1;
+ }
+ }
+ return 0;
+ }
+
+ /**
+ * Does tertiary strength comparison based on the collected ces.
+ * Assumes the utility CE buffers end with a NULLORDER sentinel.
+ * @return the tertiary strength comparison result
+ */
+ private final int doTertiaryCompare()
+ {
+ int soffset = 0;
+ int toffset = 0;
+ while (true) {
+ int sorder = CollationElementIterator.IGNORABLE;
+ int torder = CollationElementIterator.IGNORABLE;
+ // skip CEs whose tertiary (case removed) is ignorable
+ while ((sorder & CE_REMOVE_CASE_)
+ == CollationElementIterator.IGNORABLE) {
+ sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
+ if (!isContinuation(sorder)) {
+ // apply the upper/lower-first case switch only to
+ // non-continuation CEs
+ sorder ^= m_caseSwitch_;
+ }
+ else {
+ sorder &= CE_REMOVE_CASE_;
+ }
+ }
+
+ while ((torder & CE_REMOVE_CASE_)
+ == CollationElementIterator.IGNORABLE) {
+ torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
+ if (!isContinuation(torder)) {
+ torder ^= m_caseSwitch_;
+ }
+ else {
+ torder &= CE_REMOVE_CASE_;
+ }
+ }
+
+ if (sorder == torder) {
+ if (m_srcUtilCEBuffer_[soffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1]
+ != CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ break;
+ }
+ else if (m_tgtUtilCEBuffer_[toffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ }
+ else {
+ if (m_srcUtilCEBuffer_[soffset - 1] ==
+ CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ if (m_tgtUtilCEBuffer_[toffset - 1] ==
+ CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ return (sorder < torder) ? -1 : 1;
+ }
+ }
+ return 0;
+ }
+
+ /**
+ * Does quaternary strength comparison based on the collected ces.
+ * Shifted (variable) CEs compare by their primary weight on this level;
+ * all non-shifted CEs collapse to the maximum value CE_PRIMARY_MASK_.
+ * @param lowestpvalue the lowest primary value that will not be ignored if
+ * alternate handling is shifted
+ * @return the quaternary strength comparison result
+ */
+ private final int doQuaternaryCompare(int lowestpvalue)
+ {
+ boolean sShifted = true;
+ boolean tShifted = true;
+ int soffset = 0;
+ int toffset = 0;
+ while (true) {
+ int sorder = CollationElementIterator.IGNORABLE;
+ int torder = CollationElementIterator.IGNORABLE;
+ while (sorder == CollationElementIterator.IGNORABLE
+ || (isContinuation(sorder) && !sShifted)) {
+ sorder = m_srcUtilCEBuffer_[soffset ++];
+ if (isContinuation(sorder)) {
+ if (!sShifted) {
+ // continuation of a non-shifted CE: skip it
+ continue;
+ }
+ }
+ else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
+ || (sorder & CE_PRIMARY_MASK_)
+ == CollationElementIterator.IGNORABLE) {
+ // non continuation
+ sorder = CE_PRIMARY_MASK_;
+ sShifted = false;
+ }
+ else {
+ sShifted = true;
+ }
+ }
+ sorder >>>= CE_PRIMARY_SHIFT_;
+ while (torder == CollationElementIterator.IGNORABLE
+ || (isContinuation(torder) && !tShifted)) {
+ torder = m_tgtUtilCEBuffer_[toffset ++];
+ if (isContinuation(torder)) {
+ if (!tShifted) {
+ continue;
+ }
+ }
+ else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
+ || (torder & CE_PRIMARY_MASK_)
+ == CollationElementIterator.IGNORABLE) {
+ // non continuation
+ torder = CE_PRIMARY_MASK_;
+ tShifted = false;
+ }
+ else {
+ tShifted = true;
+ }
+ }
+ torder >>>= CE_PRIMARY_SHIFT_;
+
+ if (sorder == torder) {
+ // equal on this level: check the NULLORDER sentinels
+ if (m_srcUtilCEBuffer_[soffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ if (m_tgtUtilCEBuffer_[toffset - 1]
+ != CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ break;
+ }
+ else if (m_tgtUtilCEBuffer_[toffset - 1]
+ == CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ }
+ else {
+ if (m_srcUtilCEBuffer_[soffset - 1] ==
+ CollationElementIterator.NULLORDER) {
+ return -1;
+ }
+ if (m_tgtUtilCEBuffer_[toffset - 1] ==
+ CollationElementIterator.NULLORDER) {
+ return 1;
+ }
+ return (sorder < torder) ? -1 : 1;
+ }
+ }
+ return 0;
+ }
+
+ /**
+ * Internal function. Does byte level string compare. Used by strcoll if
+ * strength == identical and strings are otherwise equal. This is a rare
+ * case. Comparison must be done on NFD normalized strings. FCD is not good
+ * enough.
+ * @param source text
+ * @param target text
+ * @param offset of the first difference in the text strings
+ * @param normalize flag indicating if we are to normalize the text before
+ * comparison
+ * @return 1 if source is greater than target, -1 less than and 0 if equals
+ */
+ private static final int doIdenticalCompare(String source, String target,
+ int offset, boolean normalize)
+
+ {
+ if (normalize) {
+ // only decompose when the quick check cannot prove the string is
+ // already NFD
+ if (Normalizer.quickCheck(source, Normalizer.NFD,0)
+ != Normalizer.YES) {
+ source = Normalizer.decompose(source, false);
+ }
+
+ if (Normalizer.quickCheck(target, Normalizer.NFD,0)
+ != Normalizer.YES) {
+ target = Normalizer.decompose(target, false);
+ }
+ // decomposition may shift characters, so the precomputed
+ // first-difference offset is no longer valid
+ offset = 0;
+ }
+
+ return doStringCompare(source, target, offset);
+ }
+
+ /**
+ * Compares string for their codepoint order.
+ * This comparison handles surrogate characters and place them after the
+ * all non surrogate characters.
+ * @param source text
+ * @param target text
+ * @param offset start offset for comparison
+ * @return 1 if source is greater than target, -1 less than and 0 if equals
+ */
+ private static final int doStringCompare(String source,
+ String target,
+ int offset)
+ {
+ // compare identical prefixes - they do not need to be fixed up
+ char schar = 0;
+ char tchar = 0;
+ int slength = source.length();
+ int tlength = target.length();
+ int minlength = Math.min(slength, tlength);
+ while (offset < minlength) {
+ schar = source.charAt(offset);
+ // post-increment: after a mismatch, offset points past the
+ // differing index
+ tchar = target.charAt(offset ++);
+ if (schar != tchar) {
+ break;
+ }
+ }
+
+ if (schar == tchar && offset == minlength) {
+ // one string is a prefix of the other; the longer one is greater
+ if (slength > minlength) {
+ return 1;
+ }
+ if (tlength > minlength) {
+ return -1;
+ }
+ return 0;
+ }
+
+ // if both values are in or above the surrogate range, Fix them up.
+ if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
+ && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
+ schar = fixupUTF16(schar);
+ tchar = fixupUTF16(tchar);
+ }
+
+ // now c1 and c2 are in UTF-32-compatible order
+ return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
+ }
+
+ /**
+ * Rotate surrogates to the top to get code point order
+ */
+ private static final char fixupUTF16(char ch)
+ {
+ if (ch >= 0xe000) {
+ ch -= 0x800;
+ }
+ else {
+ ch += 0x2000;
+ }
+ return ch;
+ }
+
+ /**
+ * Resets the internal case data members and compression values.
+ * Also decides whether the latin-1 fast path (compareUseLatin1) can be
+ * used with the current attribute settings, rebuilding its table when
+ * needed.
+ */
+ private void updateInternalState()
+ {
+ if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
+ m_caseSwitch_ = CASE_SWITCH_;
+ }
+ else {
+ m_caseSwitch_ = NO_CASE_SWITCH_;
+ }
+
+ if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
+ m_mask3_ = CE_REMOVE_CASE_;
+ m_common3_ = COMMON_NORMAL_3_;
+ m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
+ m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
+ m_bottom3_ = COMMON_BOTTOM_3_;
+ }
+ else {
+ m_mask3_ = CE_KEEP_CASE_;
+ m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
+ if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
+ m_common3_ = COMMON_UPPER_FIRST_3_;
+ m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
+ m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
+ } else {
+ m_common3_ = COMMON_NORMAL_3_;
+ m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
+ m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
+ }
+ }
+
+ // Set the compression values
+ int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
+ // we multiply a double with an int, but need only the int part
+ m_topCount3_ = (int)(PROPORTION_3_ * total3);
+ m_bottomCount3_ = total3 - m_topCount3_;
+
+ if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
+ && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
+ m_isSimple3_ = true;
+ }
+ else {
+ m_isSimple3_ = false;
+ }
+ // latin-1 fast path is only valid without case level, numeric
+ // collation or shifted alternate handling, at strength <= tertiary
+ if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
+ && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
+ if(latinOneCEs_ == null || latinOneRegenTable_) {
+ if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
+ latinOneUse_ = true;
+ } else {
+ latinOneUse_ = false;
+ latinOneFailed_ = true;
+ }
+ latinOneRegenTable_ = false;
+ } else { // latin1Table exists and it doesn't need to be regenerated, just use it
+ latinOneUse_ = true;
+ }
+ } else {
+ latinOneUse_ = false;
+ }
+
+ }
+
+ /**
+ * Initializes the RuleBasedCollator: computes the unsafe-character and
+ * contraction-end heuristics, then applies the default attribute values.
+ */
+ private final void init()
+ {
+ for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;
+ m_minUnsafe_ ++) {
+ // Find the smallest unsafe char.
+ if (isUnsafe(m_minUnsafe_)) {
+ break;
+ }
+ }
+
+ for (m_minContractionEnd_ = 0;
+ m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;
+ m_minContractionEnd_ ++) {
+ // Find the smallest contraction-ending char.
+ if (isContractionEnd(m_minContractionEnd_)) {
+ break;
+ }
+ }
+ // temporarily disable the latin-1 fast path while the attribute
+ // setters below run; re-enabled just before updateInternalState()
+ // decides whether to (re)build the latin-1 table
+ latinOneFailed_ = true;
+ setStrength(m_defaultStrength_);
+ setDecomposition(m_defaultDecomposition_);
+ m_variableTopValue_ = m_defaultVariableTopValue_;
+ m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
+ m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
+ m_isCaseLevel_ = m_defaultIsCaseLevel_;
+ m_caseFirst_ = m_defaultCaseFirst_;
+ m_isHiragana4_ = m_defaultIsHiragana4_;
+ m_isNumericCollation_ = m_defaultIsNumericCollation_;
+ latinOneFailed_ = false;
+ updateInternalState();
+ }
+
+ /**
+ * Initializes utility iterators and byte buffer used by compare.
+ * @param allocate true to lazily allocate the utility objects (no-op if
+ * they already exist), false to release them all for GC
+ */
+ private final void initUtility(boolean allocate) {
+ if (allocate) {
+ // m_srcUtilIter_ acts as the guard for the whole group: all
+ // utility objects are created and released together
+ if (m_srcUtilIter_ == null) {
+ m_srcUtilIter_ = new StringUCharacterIterator();
+ m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this);
+ m_tgtUtilIter_ = new StringUCharacterIterator();
+ m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this);
+ m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
+ m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
+ m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
+ m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
+ m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
+ m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
+ m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
+ }
+ } else {
+ m_srcUtilIter_ = null;
+ m_srcUtilColEIter_ = null;
+ m_tgtUtilIter_ = null;
+ m_tgtUtilColEIter_ = null;
+ m_utilBytes0_ = null;
+ m_utilBytes1_ = null;
+ m_utilBytes2_ = null;
+ m_utilBytes3_ = null;
+ m_utilBytes4_ = null;
+ m_srcUtilCEBuffer_ = null;
+ m_tgtUtilCEBuffer_ = null;
+ }
+ }
+
+ // Consts for Latin-1 special processing
+ private static final int ENDOFLATINONERANGE_ = 0xFF;
+ // extra slots beyond 0xFF hold contraction/expansion spill-over entries
+ private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50);
+ // sentinel CE meaning "give up the fast path, use the regular compare"
+ private static final int BAIL_OUT_CE_ = 0xFF000000;
+
+ /**
+ * Generate latin-1 tables
+ */
+
+ // Mutable per-level byte shifts used while packing CE bytes into the
+ // latin-1 table entries (24 = next byte goes into the top position).
+ private class shiftValues {
+ int primShift = 24;
+ int secShift = 24;
+ int terShift = 24;
+ }
+
+ /**
+ * Packs one CE's primary/secondary/tertiary bytes into the three
+ * latin-1 table planes for character ch, advancing the per-level shifts
+ * in sh. On primary overflow the entry is replaced by BAIL_OUT_CE_.
+ */
+ private final void
+ addLatinOneEntry(char ch, int CE, shiftValues sh) {
+ int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
+ boolean reverseSecondary = false;
+ if(!isContinuation(CE)) {
+ tertiary = ((CE & m_mask3_));
+ tertiary ^= m_caseSwitch_;
+ reverseSecondary = true;
+ } else {
+ tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));
+ tertiary &= CE_REMOVE_CASE_;
+ reverseSecondary = false;
+ }
+
+ // peel the CE apart byte by byte (note: CE is destructively shifted)
+ secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
+ primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
+ primary1 = (CE >>> 8);
+
+ if(primary1 != 0) {
+ latinOneCEs_[ch] |= (primary1 << sh.primShift);
+ sh.primShift -= 8;
+ }
+ if(primary2 != 0) {
+ if(sh.primShift < 0) {
+ // more than four primary bytes: cannot fit, bail out on all
+ // three planes for this character
+ latinOneCEs_[ch] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
+ latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
+ return;
+ }
+ latinOneCEs_[ch] |= (primary2 << sh.primShift);
+ sh.primShift -= 8;
+ }
+ if(secondary != 0) {
+ if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary
+ latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary
+ latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);
+ } else { // normal case
+ latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);
+ }
+ sh.secShift -= 8;
+ }
+ if(tertiary != 0) {
+ latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);
+ sh.terShift -= 8;
+ }
+ }
+
+ /**
+ * Resizes the three-plane latin-1 CE table to newSize entries per plane,
+ * copying the existing primary/secondary/tertiary planes into place.
+ * Also used to shrink the table after the build (compaction).
+ */
+ private final void
+ resizeLatinOneTable(int newSize) {
+ int newTable[] = new int[3*newSize];
+ int sizeToCopy = ((newSize<latinOneTableLen_)?newSize:latinOneTableLen_);
+ //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
+ System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
+ System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
+ System.arraycopy(latinOneCEs_, 2*latinOneTableLen_, newTable, 2*newSize, sizeToCopy);
+ latinOneTableLen_ = newSize;
+ latinOneCEs_ = newTable;
+ }
+
+ /**
+ * Builds the three-plane latin-1 fast-path table from the collation trie.
+ * Expansions are unrolled via a CollationElementIterator; contractions
+ * are relocated into the spill-over area past ENDOFLATINONERANGE_.
+ * @return true on success, false (with latinOneFailed_ set) when the
+ * tailoring contains constructs the fast path cannot represent
+ */
+ private final boolean setUpLatinOne() {
+ if(latinOneCEs_ == null || m_reallocLatinOneCEs_) {
+ latinOneCEs_ = new int[3*LATINONETABLELEN_];
+ latinOneTableLen_ = LATINONETABLELEN_;
+ m_reallocLatinOneCEs_ = false;
+ } else {
+ Arrays.fill(latinOneCEs_, 0);
+ }
+ if(m_ContInfo_ == null) {
+ m_ContInfo_ = new ContractionInfo();
+ }
+ char ch = 0;
+ //StringBuffer sCh = new StringBuffer();
+ //CollationElementIterator it = getCollationElementIterator(sCh.toString());
+ CollationElementIterator it = getCollationElementIterator("");
+
+ shiftValues s = new shiftValues();
+ int CE = 0;
+ // next free slot in the contraction spill-over area
+ char contractionOffset = ENDOFLATINONERANGE_+1;
+
+ for(ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
+ s.primShift = 24; s.secShift = 24; s.terShift = 24;
+ if(ch < 0x100) {
+ CE = m_trie_.getLatin1LinearValue(ch);
+ } else {
+ CE = m_trie_.getLeadValue(ch);
+ if(CE == CollationElementIterator.CE_NOT_FOUND_) {
+ CE = UCA_.m_trie_.getLeadValue(ch);
+ }
+ }
+ if(!isSpecial(CE)) {
+ addLatinOneEntry(ch, CE, s);
+ } else {
+ switch (RuleBasedCollator.getTag(CE)) {
+ case CollationElementIterator.CE_EXPANSION_TAG_:
+ case CollationElementIterator.CE_DIGIT_TAG_:
+ //sCh.delete(0, sCh.length());
+ //sCh.append(ch);
+ //it.setText(sCh.toString());
+ // unroll the expansion by iterating the single character
+ it.setText(UCharacter.toString(ch));
+ while((CE = it.next()) != CollationElementIterator.NULLORDER) {
+ if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
+ latinOneCEs_[ch] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
+ latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
+ break;
+ }
+ addLatinOneEntry(ch, CE, s);
+ }
+ break;
+ case CollationElementIterator.CE_CONTRACTION_TAG_:
+ // here is the trick
+ // F2 is contraction. We do something very similar to contractions
+ // but have two indices, one in the real contraction table and the
+ // other to where we stuffed things. This hopes that we don't have
+ // many contractions (this should work for latin-1 tables).
+ {
+ if((CE & 0x00FFF000) != 0) {
+ latinOneFailed_ = true;
+ return false;
+ }
+
+ int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; //getContractionOffset(CE)]
+
+ CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
+
+ latinOneCEs_[ch] = CE;
+ latinOneCEs_[latinOneTableLen_+ch] = CE;
+ latinOneCEs_[2*latinOneTableLen_+ch] = CE;
+
+ // We're going to jump into contraction table, pick the elements
+ // and use them
+ do {
+ //CE = *(contractionCEs + (UCharOffset - contractionIndex));
+ CE = m_contractionCE_[UCharOffset];
+ if(isSpecial(CE)
+ && getTag(CE)
+ == CollationElementIterator.CE_EXPANSION_TAG_) {
+ int i; /* general counter */
+ //uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to expansion table */
+ int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE);
+ int size = CE & 0xF; // getExpansionCount(CE);
+ //CE = *CEOffset++;
+ if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
+ for(i = 0; i<size; i++) {
+ if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
+ latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
+ break;
+ }
+ addLatinOneEntry(contractionOffset, m_expansion_[offset+i], s);
+ }
+ } else { /* else, we do */
+ while(m_expansion_[offset] != 0) {
+ if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
+ latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
+ break;
+ }
+ addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
+ }
+ }
+ contractionOffset++;
+ } else if(!isSpecial(CE)) {
+ addLatinOneEntry(contractionOffset++, CE, s);
+ } else {
+ latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
+ latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
+ contractionOffset++;
+ }
+ UCharOffset++;
+ s.primShift = 24; s.secShift = 24; s.terShift = 24;
+ if(contractionOffset == latinOneTableLen_) { // we need to reallocate
+ resizeLatinOneTable(2*latinOneTableLen_);
+ }
+ } while(m_contractionIndex_[UCharOffset] != 0xFFFF);
+ }
+ break;
+ case CollationElementIterator.CE_SPEC_PROC_TAG_:
+ {
+ // 0xB7 is a precontext character defined in UCA5.1, a special
+ // handle is implemeted in order to save LatinOne table for
+ // most locales.
+ if (ch == 0xb7) {
+ addLatinOneEntry(ch, CE, s);
+ }
+ else {
+ latinOneFailed_ = true;
+ return false;
+ }
+ }
+ break;
+ default:
+ latinOneFailed_ = true;
+ return false;
+ }
+ }
+ }
+ // compact table
+ if(contractionOffset < latinOneTableLen_) {
+ resizeLatinOneTable(contractionOffset);
+ }
+ return true;
+ }
+
+ // Mutable holder for the current string index, used to pass the index
+ // in and out of getLatinOneContraction (mimics the C pass-by-pointer).
+ private class ContractionInfo {
+ int index;
+ }
+
+ // Shared scratch instance; allocated lazily in setUpLatinOne().
+ ContractionInfo m_ContInfo_;
+
+ /**
+ * Resolves a latin-1 contraction CE against the following characters of
+ * s, starting at m_ContInfo_.index (which is advanced on a match).
+ * @param strength plane selector: 0 primary, 1 secondary, 2 tertiary
+ * @param CE the contraction CE (low 12 bits: real contraction table
+ * offset; bits 12..23: offset into the latin-1 spill-over area)
+ * @param s the string being iterated
+ * @return the resolved CE, or BAIL_OUT_CE_ if s leaves the latin-1 range
+ */
+ private int
+ getLatinOneContraction(int strength, int CE, String s) {
+ //int strength, int CE, String s, Integer ind) {
+ int len = s.length();
+ //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
+ int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
+ int offset = 1;
+ int latinOneOffset = (CE & 0x00FFF000) >>> 12;
+ char schar = 0, tchar = 0;
+
+ for(;;) {
+ /*
+ if(len == -1) {
+ if(s[*index] == 0) { // end of string
+ return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
+ } else {
+ schar = s[*index];
+ }
+ } else {
+ */
+ if(m_ContInfo_.index == len) {
+ // end of string: no continuation, use the base entry
+ return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
+ } else {
+ schar = s.charAt(m_ContInfo_.index);
+ }
+ //}
+
+ while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
+ offset++;
+ }
+
+ if (schar == tchar) {
+ // matched a contraction continuation: consume the character
+ m_ContInfo_.index++;
+ return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);
+ }
+ else
+ {
+ if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {
+ return BAIL_OUT_CE_;
+ }
+ // skip completely ignorables
+ int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
+ if(isZeroCE == 0) { // we have to ignore completely ignorables
+ m_ContInfo_.index++;
+ continue;
+ }
+
+ return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
+ }
+ }
+ }
+
+
+ /**
+ * This is a fast strcoll, geared towards text in Latin-1.
+ * It supports contractions of size two, French secondaries
+ * and case switching. You can use it with strengths primary
+ * to tertiary. It does not support shifted and case level.
+ * It relies on the table build by setupLatin1Table. If it
+ * doesn't understand something, it will go to the regular
+ * strcoll.
+ * @param source source string
+ * @param target target string
+ * @param startOffset offset at which the comparison starts
+ * @return -1, 0 or 1 with the usual strcoll meaning
+ */
+ private final int
+ compareUseLatin1(String source, String target, int startOffset)
+ {
+ int sLen = source.length();
+ int tLen = target.length();
+
+ int strength = getStrength();
+
+ int sIndex = startOffset, tIndex = startOffset;
+ char sChar = 0, tChar = 0;
+ int sOrder=0, tOrder=0;
+
+ boolean endOfSource = false;
+
+ //uint32_t *elements = coll->latinOneCEs;
+
+ boolean haveContractions = false; // if we have contractions in our string
+ // we cannot do French secondary
+
+ // base of the secondary plane in the three-plane latin-1 table
+ int offset = latinOneTableLen_;
+
+ // Do the primary level
+ primLoop:
+ for(;;) {
+ while(sOrder==0) { // this loop skips primary ignorables
+ // sOrder=getNextlatinOneCE(source);
+ if(sIndex==sLen) {
+ endOfSource = true;
+ break;
+ }
+ sChar=source.charAt(sIndex++); //[sIndex++];
+ //}
+ if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
+ //fprintf(stderr, "R");
+ return compareRegular(source, target, startOffset);
+ }
+ sOrder = latinOneCEs_[sChar];
+ if(isSpecial(sOrder)) { // if we got a special
+ // specials can basically be either contractions or bail-out signs. If we get anything
+ // else, we'll bail out anywasy
+ if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
+ m_ContInfo_.index = sIndex;
+ sOrder = getLatinOneContraction(0, sOrder, source);
+ sIndex = m_ContInfo_.index;
+ haveContractions = true; // if there are contractions, we cannot do French secondary
+ // However, if there are contractions in the table, but we always use just one char,
+ // we might be able to do French. This should be checked out.
+ }
+ if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {
+ //fprintf(stderr, "S");
+ return compareRegular(source, target, startOffset);
+ }
+ }
+ }
+
+ while(tOrder==0) { // this loop skips primary ignorables
+ // tOrder=getNextlatinOneCE(target);
+ if(tIndex==tLen) {
+ if(endOfSource) {
+ break primLoop;
+ } else {
+ return 1;
+ }
+ }
+ tChar=target.charAt(tIndex++); //[tIndex++];
+ if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
+ //fprintf(stderr, "R");
+ return compareRegular(source, target, startOffset);
+ }
+ tOrder = latinOneCEs_[tChar];
+ if(isSpecial(tOrder)) {
+ // Handling specials, see the comments for source
+ if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
+ m_ContInfo_.index = tIndex;
+ tOrder = getLatinOneContraction(0, tOrder, target);
+ tIndex = m_ContInfo_.index;
+ haveContractions = true;
+ }
+ if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {
+ //fprintf(stderr, "S");
+ return compareRegular(source, target, startOffset);
+ }
+ }
+ }
+ if(endOfSource) { // source is finished, but target is not, say the result.
+ return -1;
+ }
+
+ if(sOrder == tOrder) { // if we have same CEs, we continue the loop
+ sOrder = 0; tOrder = 0;
+ continue;
+ } else {
+ // compare current top bytes
+ if(((sOrder^tOrder)&0xFF000000)!=0) {
+ // top bytes differ, return difference
+ if(sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
+ // since we must return enum value
+ }
+
+ // top bytes match, continue with following bytes
+ sOrder<<=8;
+ tOrder<<=8;
+ }
+ }
+
+ // after primary loop, we definitely know the sizes of strings,
+ // so we set it and use simpler loop for secondaries and tertiaries
+ //sLen = sIndex; tLen = tIndex;
+ if(strength >= SECONDARY) {
+ // adjust the table beggining
+ //latinOneCEs_ += coll->latinOneTableLen;
+ endOfSource = false;
+
+ if(!m_isFrenchCollation_) { // non French
+ // This loop is a simplified copy of primary loop
+ // at this point we know that whole strings are latin-1, so we don't
+ // check for that. We also know that we only have contractions as
+ // specials.
+ //sIndex = 0; tIndex = 0;
+ sIndex = startOffset; tIndex = startOffset;
+ secLoop:
+ for(;;) {
+ while(sOrder==0) {
+ if(sIndex==sLen) {
+ endOfSource = true;
+ break;
+ }
+ sChar=source.charAt(sIndex++); //[sIndex++];
+ sOrder = latinOneCEs_[offset+sChar];
+ if(isSpecial(sOrder)) {
+ m_ContInfo_.index = sIndex;
+ sOrder = getLatinOneContraction(1, sOrder, source);
+ sIndex = m_ContInfo_.index;
+ }
+ }
+
+ while(tOrder==0) {
+ if(tIndex==tLen) {
+ if(endOfSource) {
+ break secLoop;
+ } else {
+ return 1;
+ }
+ }
+ tChar=target.charAt(tIndex++); //[tIndex++];
+ tOrder = latinOneCEs_[offset+tChar];
+ if(isSpecial(tOrder)) {
+ m_ContInfo_.index = tIndex;
+ tOrder = getLatinOneContraction(1, tOrder, target);
+ tIndex = m_ContInfo_.index;
+ }
+ }
+ if(endOfSource) {
+ return -1;
+ }
+
+ if(sOrder == tOrder) {
+ sOrder = 0; tOrder = 0;
+ continue;
+ } else {
+ // see primary loop for comments on this
+ if(((sOrder^tOrder)&0xFF000000)!=0) {
+ if(sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ sOrder<<=8;
+ tOrder<<=8;
+ }
+ }
+ } else { // French
+ if(haveContractions) { // if we have contractions, we have to bail out
+ // since we don't really know how to handle them here
+ return compareRegular(source, target, startOffset);
+ }
+ // For French, we go backwards
+ sIndex = sLen; tIndex = tLen;
+ secFLoop:
+ for(;;) {
+ while(sOrder==0) {
+ if(sIndex==startOffset) {
+ endOfSource = true;
+ break;
+ }
+ sChar=source.charAt(--sIndex); //[--sIndex];
+ sOrder = latinOneCEs_[offset+sChar];
+ // don't even look for contractions
+ }
+
+ while(tOrder==0) {
+ if(tIndex==startOffset) {
+ if(endOfSource) {
+ break secFLoop;
+ } else {
+ return 1;
+ }
+ }
+ tChar=target.charAt(--tIndex); //[--tIndex];
+ tOrder = latinOneCEs_[offset+tChar];
+ // don't even look for contractions
+ }
+ if(endOfSource) {
+ return -1;
+ }
+
+ if(sOrder == tOrder) {
+ sOrder = 0; tOrder = 0;
+ continue;
+ } else {
+ // see the primary loop for comments
+ if(((sOrder^tOrder)&0xFF000000)!=0) {
+ if(sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ sOrder<<=8;
+ tOrder<<=8;
+ }
+ }
+ }
+ }
+
+ if(strength >= TERTIARY) {
+ // tertiary loop is the same as secondary (except no French)
+ offset += latinOneTableLen_;
+ //sIndex = 0; tIndex = 0;
+ sIndex = startOffset; tIndex = startOffset;
+ endOfSource = false;
+ for(;;) {
+ while(sOrder==0) {
+ if(sIndex==sLen) {
+ endOfSource = true;
+ break;
+ }
+ sChar=source.charAt(sIndex++); //[sIndex++];
+ sOrder = latinOneCEs_[offset+sChar];
+ if(isSpecial(sOrder)) {
+ m_ContInfo_.index = sIndex;
+ sOrder = getLatinOneContraction(2, sOrder, source);
+ sIndex = m_ContInfo_.index;
+ }
+ }
+ while(tOrder==0) {
+ if(tIndex==tLen) {
+ if(endOfSource) {
+ return 0; // if both strings are at the end, they are equal
+ } else {
+ return 1;
+ }
+ }
+ tChar=target.charAt(tIndex++); //[tIndex++];
+ tOrder = latinOneCEs_[offset+tChar];
+ if(isSpecial(tOrder)) {
+ m_ContInfo_.index = tIndex;
+ tOrder = getLatinOneContraction(2, tOrder, target);
+ tIndex = m_ContInfo_.index;
+ }
+ }
+ if(endOfSource) {
+ return -1;
+ }
+ if(sOrder == tOrder) {
+ sOrder = 0; tOrder = 0;
+ continue;
+ } else {
+ if(((sOrder^tOrder)&0xff000000)!=0) {
+ if(sOrder >>> 8 < tOrder >>> 8) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ sOrder<<=8;
+ tOrder<<=8;
+ }
+ }
+ }
+ return 0;
+ }
+ /**
+ * Returns the version of this collator object.
+ * @return the version object associated with this collator
+ * @stable ICU 2.8
+ */
+ public VersionInfo getVersion() {
+ /* Pack the runtime and builder versions into one 16-bit field:
+ * bits 11..15 runtime major, bits 6..10 builder major,
+ * bits 0..5 charset converter version. The charset version is
+ * not yet obtainable (makeconv would have to write it into the
+ * cnv files and ucnv.h expose it), so it stays 0 for now. */
+ final int runtimeMajor = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
+ final int builderMajor = m_version_.getMajor();
+ final int charsetVersion = 0;
+ final int packed = ((runtimeMajor << 11) | (builderMajor << 6) | charsetVersion) & 0xFFFF;
+
+ /* Result layout: high byte of packed field, low byte of packed
+ * field, tailoring-rules minor version, UCA major version. */
+ return VersionInfo.getInstance(packed >> 8,
+ packed & 0xFF,
+ m_version_.getMinor(),
+ UCA_.m_UCA_version_.getMajor());
+ }
+
+ /**
+ * Get the UCA (Unicode Collation Algorithm) version of this collator object.
+ * @return the UCA version object associated with this collator
+ * @stable ICU 2.8
+ */
+ public VersionInfo getUCAVersion() {
+ return UCA_.m_UCA_version_;
+ }
+
+ // NOTE(review): presumably flags that the latinOneCEs_ table must be
+ // reallocated/rebuilt; not referenced in this chunk -- confirm at the
+ // sites that set and read it before relying on this description.
+ private transient boolean m_reallocLatinOneCEs_;
+}