2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.io.DataInputStream;
10 import java.io.IOException;
11 import java.nio.ByteBuffer;
12 import java.text.CharacterIterator;
13 import java.text.ParseException;
14 import java.util.Arrays;
15 import java.util.HashMap;
16 import java.util.HashSet;
18 import java.util.MissingResourceException;
20 import java.util.concurrent.locks.Lock;
21 import java.util.concurrent.locks.ReentrantLock;
23 import com.ibm.icu.impl.BOCU;
24 import com.ibm.icu.impl.ICUDebug;
25 import com.ibm.icu.impl.ICUResourceBundle;
26 import com.ibm.icu.impl.ImplicitCEGenerator;
27 import com.ibm.icu.impl.IntTrie;
28 import com.ibm.icu.impl.StringUCharacterIterator;
29 import com.ibm.icu.impl.Trie;
30 import com.ibm.icu.impl.TrieIterator;
31 import com.ibm.icu.impl.Utility;
32 import com.ibm.icu.lang.UCharacter;
33 import com.ibm.icu.lang.UScript;
34 import com.ibm.icu.util.Output;
35 import com.ibm.icu.util.RangeValueIterator;
36 import com.ibm.icu.util.ULocale;
37 import com.ibm.icu.util.UResourceBundle;
38 import com.ibm.icu.util.VersionInfo;
42 * RuleBasedCollator is a concrete subclass of Collator. It allows customization of the Collator via user-specified rule
43 * sets. RuleBasedCollator is designed to be fully compliant to the <a
44 * href="http://www.unicode.org/unicode/reports/tr10/">Unicode Collation Algorithm (UCA)</a> and conforms to ISO 14651.
48 * Users are strongly encouraged to read <a href="http://www.icu-project.org/userguide/Collate_Intro.html"> the users
49 * guide</a> for more information about the collation service before using this class.
53 * Create a RuleBasedCollator from a locale by calling the getInstance(Locale) factory method in the base class
54 * Collator. Collator.getInstance(Locale) creates a RuleBasedCollator object based on the collation rules defined by the
55 * argument locale. If a customized collation ordering or attributes are required, use the RuleBasedCollator(String)
56 * constructor with the appropriate rules. The customized RuleBasedCollator will base its ordering on UCA, while
57 * re-adjusting the attributes and orders of the characters in the specified rule accordingly.
61 * RuleBasedCollator provides correct collation orders for most locales supported in ICU. If specific data for a locale
62 * is not available, the order eventually falls back to the <a href="http://www.unicode.org/unicode/reports/tr10/">UCA
63 * collation order </a>.
67 * For information about the collation rule syntax and details about customization, please refer to the <a
68 * href="http://www.icu-project.org/userguide/Collate_Customization.html"> Collation customization</a> section of the
73 * <strong>Note</strong> that there are some differences between the Collation rule syntax used in Java and ICU4J:
76 * <li>According to the JDK documentation: <i>
78 * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule is in force when a Thai vowel of the range
79 * \U0E40-\U0E44 precedes a Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the range
80 * \U0EC0-\U0EC4 precedes a Lao consonant of the range \U0E81-\U0EAE then the vowel is placed after the
81 * consonant for collation purposes.
84 * If a rule is without the modifier '!', the Thai/Lao vowel-consonant swapping is not turned on.
88 * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao vowel-consonant swapping, since the UCA clearly
89 * states that it has to be supported to ensure a correct sorting order. If a '!' is encountered, it is ignored.
91 * <li>As mentioned in the documentation of the base class Collator, compatibility decomposition mode is not supported.
94 * <strong>Examples</strong>
97 * Creating Customized RuleBasedCollators: <blockquote>
100 * String simple = "& a < b < c < d";
101 * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
103 * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
104 * + "< f , F < g , G < h , H < i , I < j , "
105 * + "J < k , K < l , L < m , M < n , N < "
106 * + "o , O < p , P < q , Q < r , R < s , S < "
107 * + "t , T < u , U < v , V < w , W < x , X "
108 * + "< y , Y < z , Z < \u00E5 = a\u030A "
109 * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
110 * + ", \u00C6 < \u00F8 , \u00D8";
111 * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
116 * Concatenating rules to combine <code>Collator</code>s: <blockquote>
119 * // Create an en_US Collator object
120 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
121 * Collator.getInstance(new Locale("en", "US", ""));
122 * // Create a da_DK Collator object
123 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
124 * Collator.getInstance(new Locale("da", "DK", ""));
126 * // First, get the collation rules from en_USCollator
127 * String en_USRules = en_USCollator.getRules();
128 * // Second, get the collation rules from da_DKCollator
129 * String da_DKRules = da_DKCollator.getRules();
130 * RuleBasedCollator newCollator =
131 * new RuleBasedCollator(en_USRules + da_DKRules);
132 * // newCollator has the combined rules
137 * Making changes to an existing RuleBasedCollator to create a new <code>Collator</code> object, by appending changes to
138 * the existing rule: <blockquote>
141 * // Create a new Collator object with additional rules
142 * String addRules = "& C < ch, cH, Ch, CH";
143 * RuleBasedCollator myCollator =
144 * new RuleBasedCollator(en_USCollator.getRules() + addRules);
145 * // myCollator contains the new rules
150 * How to change the order of non-spacing accents: <blockquote>
153 * // old rule with main accents
154 * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
155 * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
156 * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
157 * + "; \u030B ; \u030C ; \u030D ; \u030E "
158 * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
159 * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
160 * + "< b , B < c, C < e, E & C < d , D";
161 * // change the order of accent characters
162 * String addOn = "& \u0300 ; \u0308 ; \u0302";
163 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
168 * Putting in a new primary ordering before the default setting, e.g. sort English characters before or after Japanese
169 * characters in the Japanese <code>Collator</code>: <blockquote>
172 * // get en_US Collator rules
173 * RuleBasedCollator en_USCollator
174 * = (RuleBasedCollator)Collator.getInstance(Locale.US);
175 * // add a few Japanese characters to sort before English characters
176 * // suppose the last character before the first base letter 'a' in
177 * // the English collation rule is \u2212
178 * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
180 * RuleBasedCollator myJapaneseCollator
181 * = new RuleBasedCollator(en_USCollator.getRules() + jaString);
187 * This class is not subclassable
190 * @author Syn Wee Quek
193 public final class RuleBasedCollator extends Collator {
194 // public constructors ---------------------------------------------------
198 * Constructor that takes the argument rules for customization. The collator will be based on UCA, with the
199 * attributes and re-ordering of the characters specified in the argument rules.
202 * See the user guide's section on <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
203 * Collation Customization</a> for details on the rule syntax.
207 * the collation rules to build the collation table from.
208 * @exception ParseException
209 * and IOException thrown. ParseException thrown when argument rules have an invalid syntax.
210 * IOException thrown when an error occurred while reading internal data.
// Public constructor that builds a collator from a tailoring-rule string.
// NOTE(review): only the signature and the throw are visible in this listing; the condition
// guarding the throw (presumably a null check on rules, judging by the message) and the rest
// of the body are not shown — confirm against the full source.
213 public RuleBasedCollator(String rules) throws Exception {
// Rejects the argument with an IllegalArgumentException.
216 throw new IllegalArgumentException("Collation rules can not be null");
221 // public methods --------------------------------------------------------
224 * Clones the RuleBasedCollator
226 * @return a new instance of this RuleBasedCollator object
// Public clone entry point: delegates to clone(boolean) with the current frozen state, so the
// copy is frozen exactly when this collator is.
229 public Object clone() throws CloneNotSupportedException {
230 return clone(isFrozen());
234 * Clones the RuleBasedCollator
236 * @param frozen should the clone be frozen or not
237 * @return a new instance of this RuleBasedCollator object
// Copies this collator via super.clone() and then resets the per-instance fields that the
// visible code chooses not to share with the copy.
// NOTE(review): the return statement and closing braces are not visible in this listing.
239 private Object clone(boolean frozen) throws CloneNotSupportedException {
240 //TODO: once buffer and threading issue is resolved have frozen clone just return itself
241 RuleBasedCollator result = (RuleBasedCollator) super.clone();
// Only when this collator has a latinOneCEs_ table: set the realloc flag on the copy and give
// it its own ContractionInfo (presumably so the copy rebuilds its own table — confirm).
242 if (latinOneCEs_ != null) {
243 result.m_reallocLatinOneCEs_ = true;
244 result.m_ContInfo_ = new ContractionInfo();
247 // since all collation data in the RuleBasedCollator do not change
248 // we can safely assign the result.fields to this collator
249 // except in cases where we can't
// The copy does not inherit this collator's collation buffer.
250 result.collationBuffer = null;
// A non-null lock is what isFrozen() tests for, so this decides whether the copy is frozen.
251 result.frozenLock = frozen ? new ReentrantLock() : null;
256 * Return a CollationElementIterator for the given String.
258 * @see CollationElementIterator
// Creates a new CollationElementIterator over the given string, bound to this collator.
261 public CollationElementIterator getCollationElementIterator(String source) {
262 return new CollationElementIterator(source, this);
266 * Return a CollationElementIterator for the given CharacterIterator. The source iterator's integrity will be
267 * preserved since a new copy will be created for use.
269 * @see CollationElementIterator
// Creates a new CollationElementIterator over a clone of the given CharacterIterator.
272 public CollationElementIterator getCollationElementIterator(CharacterIterator source) {
// Work on a clone so the caller's iterator object is not the one handed to the new iterator.
273 CharacterIterator newsource = (CharacterIterator) source.clone();
274 return new CollationElementIterator(newsource, this);
278 * Return a CollationElementIterator for the given UCharacterIterator. The source iterator's integrity will be
279 * preserved since a new copy will be created for use.
281 * @see CollationElementIterator
// Creates a new CollationElementIterator for the given UCharacterIterator.
// NOTE(review): unlike the CharacterIterator overload, no clone is made in this method; any
// copying promised by the Javadoc would have to happen inside the CollationElementIterator
// constructor, which is not visible here — confirm.
284 public CollationElementIterator getCollationElementIterator(UCharacterIterator source) {
285 return new CollationElementIterator(source, this);
288 // Freezable interface implementation -------------------------------------------------
291 * Determines whether the object has been frozen or not.
// Frozen state is represented by the presence of frozenLock: non-null means frozen.
294 public boolean isFrozen() {
295 return frozenLock != null;
299 * Freezes the collator.
300 * @return the collator itself.
// Marks this collator as frozen by installing a ReentrantLock in frozenLock (isFrozen() checks
// that field for non-null).
// NOTE(review): a line between the signature and the assignment, and the return statement,
// are not visible in this listing — confirm whether the assignment is conditional.
303 public Collator freeze() {
305 frozenLock = new ReentrantLock();
311 * Provides for the clone operation. Any clone is initially unfrozen.
// Produces an unfrozen copy of this collator.
// NOTE(review): the "try {" line, the catch body's remainder and the return are not visible
// in this listing.
314 public RuleBasedCollator cloneAsThawed() {
315 RuleBasedCollator clone = null;
// clone(false) sets the copy's frozenLock to null, i.e. the copy is not frozen.
317 clone = (RuleBasedCollator) clone(false);
318 } catch (CloneNotSupportedException e) {
319 // Clone is implemented
324 // public setters --------------------------------------------------------
327 * Sets the Hiragana Quaternary mode to be on or off. When the Hiragana Quaternary mode is turned on, the collator
328 * positions Hiragana characters before all non-ignorable characters in QUATERNARY strength. This is to produce a
329 * correct JIS collation order, distinguishing between Katakana and Hiragana characters.
331 * This attribute is an implementation detail of the CLDR Japanese tailoring.
332 * The implementation might change to use a different mechanism
333 * to achieve the same Japanese sort order.
334 * Since ICU 50, this attribute is not settable any more via API functions.
337 * true if Hiragana Quaternary mode is to be on, false otherwise
338 * @see #setHiraganaQuaternaryDefault
339 * @see #isHiraganaQuaternary
340 * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation.
// Deprecated setter (see Javadoc: not settable via API since ICU 50).
// NOTE(review): only the throw is visible; its guarding condition (presumably isFrozen()) and
// anything after it are not shown in this listing — confirm.
342 public void setHiraganaQuaternary(boolean flag) {
344 throw new UnsupportedOperationException("Attempt to modify frozen object");
349 * Sets the Hiragana Quaternary mode to the initial mode set during construction of the RuleBasedCollator. See
350 * setHiraganaQuaternary(boolean) for more details.
352 * This attribute is an implementation detail of the CLDR Japanese tailoring.
353 * The implementation might change to use a different mechanism
354 * to achieve the same Japanese sort order.
355 * Since ICU 50, this attribute is not settable any more via API functions.
357 * @see #setHiraganaQuaternary(boolean)
358 * @see #isHiraganaQuaternary
359 * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation.
// Deprecated setter (see Javadoc: not settable via API since ICU 50).
// NOTE(review): only the throw is visible; its guarding condition (presumably isFrozen()) and
// anything after it are not shown in this listing — confirm.
361 public void setHiraganaQuaternaryDefault() {
363 throw new UnsupportedOperationException("Attempt to modify frozen object");
368 * Sets whether uppercase characters sort before lowercase characters or vice versa, in strength TERTIARY. The
369 * default mode is false, and so lowercase characters sort before uppercase characters. If true, sort upper case
373 * true to sort uppercase characters before lowercase characters, false to sort lowercase characters
374 * before uppercase characters
375 * @see #isLowerCaseFirst
376 * @see #isUpperCaseFirst
377 * @see #setLowerCaseFirst
378 * @see #setCaseFirstDefault
// Switches the case-first attribute between UPPER_FIRST_ and OFF_.
// NOTE(review): the condition guarding the throw (presumably isFrozen()) and the if/else on
// upperfirst that selects between the two groups below are not visible in this listing;
// presumably true -> UPPER_FIRST_ and false -> OFF_, per the Javadoc — confirm.
381 public void setUpperCaseFirst(boolean upperfirst) {
383 throw new UnsupportedOperationException("Attempt to modify frozen object");
// latinOneRegenTable_ is raised only when the stored value actually changes.
387 if (m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
388 latinOneRegenTable_ = true;
390 m_caseFirst_ = AttributeValue.UPPER_FIRST_;
// Same change-detection for the OFF_ case.
392 if (m_caseFirst_ != AttributeValue.OFF_) {
393 latinOneRegenTable_ = true;
395 m_caseFirst_ = AttributeValue.OFF_;
// Runs after either assignment.
397 updateInternalState();
401 * Sets the orders of lower cased characters to sort before upper cased characters, in strength TERTIARY. The
402 * default mode is false. If true is set, the RuleBasedCollator will sort lower cased characters before the upper
403 * cased ones. Otherwise, if false is set, the RuleBasedCollator will ignore case preferences.
406 * true for sorting lower cased characters before upper cased characters, false to ignore case
408 * @see #isLowerCaseFirst
409 * @see #isUpperCaseFirst
410 * @see #setUpperCaseFirst
411 * @see #setCaseFirstDefault
// Switches the case-first attribute between LOWER_FIRST_ and OFF_.
// NOTE(review): the condition guarding the throw (presumably isFrozen()) and the if/else on
// lowerfirst that selects between the two groups below are not visible in this listing;
// presumably true -> LOWER_FIRST_ and false -> OFF_, per the Javadoc — confirm.
414 public void setLowerCaseFirst(boolean lowerfirst) {
416 throw new UnsupportedOperationException("Attempt to modify frozen object");
// latinOneRegenTable_ is raised only when the stored value actually changes.
420 if (m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
421 latinOneRegenTable_ = true;
423 m_caseFirst_ = AttributeValue.LOWER_FIRST_;
// Same change-detection for the OFF_ case.
425 if (m_caseFirst_ != AttributeValue.OFF_) {
426 latinOneRegenTable_ = true;
428 m_caseFirst_ = AttributeValue.OFF_;
// Runs after either assignment.
430 updateInternalState();
434 * Sets the case first mode to the initial mode set during construction of the RuleBasedCollator. See
435 * setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more details.
437 * @see #isLowerCaseFirst
438 * @see #isUpperCaseFirst
439 * @see #setLowerCaseFirst(boolean)
440 * @see #setUpperCaseFirst(boolean)
// Restores the case-first attribute to m_defaultCaseFirst_.
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
443 public final void setCaseFirstDefault() {
445 throw new UnsupportedOperationException("Attempt to modify frozen object");
// Raise latinOneRegenTable_ only if the value is about to change.
448 if (m_caseFirst_ != m_defaultCaseFirst_) {
449 latinOneRegenTable_ = true;
451 m_caseFirst_ = m_defaultCaseFirst_;
452 updateInternalState();
456 * Sets the alternate handling mode to the initial mode set during construction of the RuleBasedCollator. See
457 * setAlternateHandlingShifted(boolean) for more details.
459 * @see #setAlternateHandlingShifted(boolean)
460 * @see #isAlternateHandlingShifted()
// Restores the alternate-handling flag to m_defaultIsAlternateHandlingShifted_ and then calls
// updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
463 public void setAlternateHandlingDefault() {
465 throw new UnsupportedOperationException("Attempt to modify frozen object");
468 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
469 updateInternalState();
473 * Sets the case level mode to the initial mode set during construction of the RuleBasedCollator. See
474 * setCaseLevel(boolean) for more details.
476 * @see #setCaseLevel(boolean)
// Restores the case-level flag to m_defaultIsCaseLevel_ and then calls updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
480 public void setCaseLevelDefault() {
482 throw new UnsupportedOperationException("Attempt to modify frozen object");
485 m_isCaseLevel_ = m_defaultIsCaseLevel_;
486 updateInternalState();
490 * Sets the decomposition mode to the initial mode set during construction of the RuleBasedCollator. See
491 * setDecomposition(int) for more details.
493 * @see #getDecomposition
494 * @see #setDecomposition(int)
// Restores the decomposition mode by passing m_defaultDecomposition_ to setDecomposition(int),
// then calls updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
497 public void setDecompositionDefault() {
499 throw new UnsupportedOperationException("Attempt to modify frozen object");
502 setDecomposition(m_defaultDecomposition_);
503 updateInternalState();
507 * Sets the French collation mode to the initial mode set during construction of the RuleBasedCollator. See
508 * setFrenchCollation(boolean) for more details.
510 * @see #isFrenchCollation
511 * @see #setFrenchCollation(boolean)
// Restores the French-collation flag to m_defaultIsFrenchCollation_.
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
514 public void setFrenchCollationDefault() {
516 throw new UnsupportedOperationException("Attempt to modify frozen object");
// Raise latinOneRegenTable_ only if the value is about to change.
519 if (m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
520 latinOneRegenTable_ = true;
522 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
523 updateInternalState();
527 * Sets the collation strength to the initial mode set during the construction of the RuleBasedCollator. See
528 * setStrength(int) for more details.
530 * @see #setStrength(int)
// Restores the strength by passing m_defaultStrength_ to setStrength(int).
// Unlike the sibling *Default setters, no frozen-object throw appears in this method itself;
// NOTE(review): whether super.setStrength enforces the frozen check is not visible here.
534 public void setStrengthDefault() {
535 setStrength(m_defaultStrength_);
// setStrength(int) in this class already calls updateInternalState(), so this is a second call.
536 updateInternalState();
540 * Method to set numeric collation to its default value. When numeric collation is turned on, this Collator
541 * generates a collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER
544 * @see #getNumericCollation
545 * @see #setNumericCollation
// Restores numeric collation by passing m_defaultIsNumericCollation_ to
// setNumericCollation(boolean).
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
548 public void setNumericCollationDefault() {
550 throw new UnsupportedOperationException("Attempt to modify frozen object");
553 setNumericCollation(m_defaultIsNumericCollation_);
// setNumericCollation(boolean) already calls updateInternalState(), so this is a second call.
554 updateInternalState();
558 * Sets the mode for the direction of SECONDARY weights to be used in French collation. The default value is false,
559 * which treats SECONDARY weights in the order they appear. If set to true, the SECONDARY weights will be sorted
560 * backwards. See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
561 * French collation</a> for more information.
564 * true to set the French collation on, false to set it off
566 * @see #isFrenchCollation
567 * @see #setFrenchCollationDefault
// Stores the French-collation flag and then calls updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
569 public void setFrenchCollation(boolean flag) {
571 throw new UnsupportedOperationException("Attempt to modify frozen object");
// Raise latinOneRegenTable_ only if the value is about to change.
574 if (m_isFrenchCollation_ != flag) {
575 latinOneRegenTable_ = true;
577 m_isFrenchCollation_ = flag;
578 updateInternalState();
582 * Sets the alternate handling for QUATERNARY strength to be either shifted or non-ignorable. See the UCA definition
583 * on <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting"> Alternate Weighting</a>. This
584 * attribute will only be effective when QUATERNARY strength is set. The default value for this mode is false,
585 * corresponding to the NON_IGNORABLE mode in UCA. In the NON_IGNORABLE mode, the RuleBasedCollator will treat all
586 * the codepoints with non-ignorable primary weights in the same way. If the mode is set to true, the behaviour
587 * corresponds to SHIFTED defined in UCA, this causes codepoints with PRIMARY orders that are equal or below the
588 * variable top value to be ignored in PRIMARY order and moved to the QUATERNARY order.
591 * true if SHIFTED behaviour for alternate handling is desired, false for the NON_IGNORABLE behaviour.
592 * @see #isAlternateHandlingShifted
593 * @see #setAlternateHandlingDefault
// Stores the alternate-handling flag and then calls updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
596 public void setAlternateHandlingShifted(boolean shifted) {
598 throw new UnsupportedOperationException("Attempt to modify frozen object");
601 m_isAlternateHandlingShifted_ = shifted;
602 updateInternalState();
607 * When case level is set to true, an additional weight is formed between the SECONDARY and TERTIARY weight, known
608 * as the case level. The case level is used to distinguish large and small Japanese Kana characters. Case level
609 * could also be used in other situations. For example to distinguish certain Pinyin characters. The default value
610 * is false, which means the case level is not generated. The contents of the case level are affected by the case
611 * first mode. A simple way to ignore accent differences in a string is to set the strength to PRIMARY and enable
615 * See the section on <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html"> case
616 * level</a> for more information.
620 * true if case level sorting is required, false otherwise
622 * @see #setCaseLevelDefault
// Stores the case-level flag and then calls updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
625 public void setCaseLevel(boolean flag) {
627 throw new UnsupportedOperationException("Attempt to modify frozen object");
630 m_isCaseLevel_ = flag;
631 updateInternalState();
636 * Sets this Collator's strength property. The strength property determines the minimum level of difference
637 * considered significant during comparison.
640 * See the Collator class description for an example of use.
644 * the new strength value.
646 * @see #setStrengthDefault
652 * @exception IllegalArgumentException
653 * If the new strength value is not one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
// Delegates storage (and, per the Javadoc, validation) of the strength to the superclass, then
// calls updateInternalState() so this class's derived state follows the new strength.
656 public void setStrength(int newStrength) {
657 super.setStrength(newStrength);
658 updateInternalState();
663 * Variable top is a two byte primary value which causes all the codepoints with primary values that are less than or
664 * equal to the variable top to be shifted when alternate handling is set to SHIFTED.
667 * Sets the variable top to a collation element value of a string supplied.
671 * one or more (if contraction) characters to which the variable top should be set
672 * @return an int value containing the value of the variable top in upper 16 bits. Lower 16 bits are undefined.
673 * @exception IllegalArgumentException
674 * is thrown if varTop argument is not a valid variable top element. A variable top element is
677 * <li>it is a contraction that does not exist in the Collation order
678 * <li>when the PRIMARY strength collation element for the variable top has more than two bytes
679 * <li>when the varTop argument is null or zero in length.
681 * @see #getVariableTop
682 * @see RuleBasedCollator#setAlternateHandlingShifted
// Public entry point: validates the argument, borrows a CollationBuffer and delegates to the
// private two-argument overload.
// NOTE(review): the condition guarding the first throw (presumably isFrozen()) and the
// try/finally lines around the buffer use are not visible in this listing — confirm that
// releaseCollationBuffer sits in a finally block.
685 public int setVariableTop(String varTop) {
687 throw new UnsupportedOperationException("Attempt to modify frozen object");
// Null and empty strings are rejected up front.
690 if (varTop == null || varTop.length() == 0) {
691 throw new IllegalArgumentException("Variable top argument string can not be null or zero in length.");
694 CollationBuffer buffer = null;
696 buffer = getCollationBuffer();
697 return setVariableTop(varTop, buffer);
// Hands the borrowed buffer back.
699 releaseCollationBuffer(buffer);
// Computes and stores the variable top from the first collation element of varTop, using the
// buffer's source iterator as scratch. Returns the element's masked primary bits.
// Throws IllegalArgumentException when varTop does not map to exactly one usable element.
704 private int setVariableTop(String varTop, CollationBuffer buffer) {
705 buffer.m_srcUtilColEIter_.setText(varTop);
// First collation element produced for the whole string.
706 int ce = buffer.m_srcUtilColEIter_.next();
708 // here we check if we have consumed all characters
709 // you can put in either one character or a contraction
710 // you shouldn't put more...
// Fails if the first element did not consume the entire string, or no element was produced.
711 if (buffer.m_srcUtilColEIter_.getOffset() != varTop.length() || ce == CollationElementIterator.NULLORDER) {
712 throw new IllegalArgumentException("Variable top argument string is a contraction that does not exist "
713 + "in the Collation order");
// Look at what follows the first element.
716 int nextCE = buffer.m_srcUtilColEIter_.next();
// A following element is tolerated only if it is a continuation whose primary bits are zero;
// anything else means more than one element or a primary that is too long.
718 if ((nextCE != CollationElementIterator.NULLORDER)
719 && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
720 throw new IllegalArgumentException("Variable top argument string can only have a single collation "
721 + "element that has less than or equal to two PRIMARY strength " + "bytes");
// Stored form: the masked primary bits shifted down by 16.
724 m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
// Returned form: the masked primary bits left in place (unshifted).
726 return ce & CE_PRIMARY_MASK_;
730 * Sets the variable top to a collation element value supplied. Variable top is set to the upper 16 bits. Lower 16
734 * Collation element value, as returned by setVariableTop or getVariableTop
735 * @see #getVariableTop
736 * @see #setVariableTop(String)
// Stores the variable top from a raw collation-element value: masks with CE_PRIMARY_MASK_ and
// shifts down by 16, the same stored form used by the String overload's helper.
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
739 public void setVariableTop(int varTop) {
741 throw new UnsupportedOperationException("Attempt to modify frozen object");
744 m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
748 * When numeric collation is turned on, this Collator generates a collation key for the numeric value of substrings
749 * of digits. This is a way to get '100' to sort AFTER '2'
752 * true to turn numeric collation on and false to turn it off
753 * @see #getNumericCollation
754 * @see #setNumericCollationDefault
// Stores the numeric-collation flag and then calls updateInternalState().
// NOTE(review): the condition guarding the throw (presumably isFrozen()) is not visible here.
757 public void setNumericCollation(boolean flag) {
759 throw new UnsupportedOperationException("Attempt to modify frozen object");
762 // sort substrings of digits as numbers
763 m_isNumericCollation_ = flag;
764 updateInternalState();
768 * Sets the reordering codes for this collator.
769 * Collation reordering allows scripts and some other defined blocks of characters
770 * to be moved relative to each other as a block. This reordering is done on top of
771 * the DUCET/CLDR standard collation order. Reordering can specify groups to be placed
772 * at the start and/or the end of the collation order.
773 * <p>By default, reordering codes specified for the start of the order are placed in the
774 * order given after a group of “special” non-script blocks. These special groups of characters
775 * are space, punctuation, symbol, currency, and digit. These special groups are represented with
776 * {@link Collator.ReorderCodes}. Script groups can be intermingled with
777 * these special non-script blocks if those special blocks are explicitly specified in the reordering.
778 * <p>The special code {@link Collator.ReorderCodes#OTHERS OTHERS} stands for any script that is not explicitly
779 * mentioned in the list of reordering codes given. Anything that is after {@link Collator.ReorderCodes#OTHERS OTHERS}
780 * will go at the very end of the reordering in the order given.
781 * <p>The special reorder code {@link Collator.ReorderCodes#DEFAULT DEFAULT} will reset the reordering for this collator
782 * to the default for this collator. The default reordering may be the DUCET/CLDR order or may be a reordering that
783 * was specified when this collator was created from resource data or from rules. The
784 * {@link Collator.ReorderCodes#DEFAULT DEFAULT} code <b>must</b> be the sole code supplied when it is used. If not
785 * that will result in an {@link IllegalArgumentException} being thrown.
786 * <p>The special reorder code {@link Collator.ReorderCodes#NONE NONE} will remove any reordering for this collator.
787 * The result of setting no reordering will be to have the DUCET/CLDR reordering used. The
788 * {@link Collator.ReorderCodes#NONE NONE} code <b>must</b> be the sole code supplied when it is used.
789 * @param order the reordering codes to apply to this collator; if this is null or an empty array
790 * then this clears any existing reordering
791 * @throws IllegalArgumentException if the reordering codes are malformed in any way (e.g. duplicates, multiple reset codes, overlapping equivalent scripts)
792 * @see #getReorderCodes
793 * @see #getEquivalentReorderCodes
// Stores the requested reorder codes and rebuilds the permutation table.
// NOTE(review): the condition guarding the throw (presumably isFrozen()) and the else line
// between the two assignments are not visible in this listing.
796 public void setReorderCodes(int... order) {
798 throw new UnsupportedOperationException("Attempt to modify frozen object");
// A non-empty array is copied, so later changes to the caller's array do not reach this field.
801 if (order != null && order.length > 0) {
802 m_reorderCodes_ = order.clone();
// Null or empty input clears any stored reordering.
804 m_reorderCodes_ = null;
806 buildPermutationTable();
809 // public getters --------------------------------------------------------
812 * Gets the collation tailoring rules for this RuleBasedCollator.
813 * Equivalent to String getRules(false).
815 * @return the collation tailoring rules
816 * @see #getRules(boolean)
// NOTE(review): the body of this method is not visible in this listing; per its Javadoc it is
// equivalent to getRules(false) — confirm against the full source.
819 public String getRules() {
824 * Returns current rules. The argument defines whether the full rules (UCA + tailored) are returned or just the
827 * <p>The "UCA rules" are an <i>approximation</i> of the root collator's sort order.
828 * They are almost never used or useful at runtime and can be removed from the data.
829 * See <a href="http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales">User Guide:
830 * Collation Customization, Building on Existing Locales</a>
832 * <p>{@link #getRules()} should normally be used instead.
834 * true if the rules that define the full set of collation order are required, otherwise false for
835 * returning only the tailored rules
836 * @return the current rules that define this Collator.
// Returns the rule string; the visible path concatenates the UCA rules with this collator's
// own rules.
// NOTE(review): the branch taken when fullrules is false is not visible in this listing
// (presumably it returns m_rules_ alone — confirm).
840 public String getRules(boolean fullrules) {
844 // take the UCA rules and append real rules at the end
845 return UCA_.m_rules_.concat(m_rules_);
849 * Gets a UnicodeSet that contains all the characters and sequences tailored in this collator.
851 * @return a UnicodeSet object containing all the code points and sequences that may sort differently
// Re-parses this collator's tailoring rules (getRules()) and returns the parser's tailored set.
// NOTE(review): the "try {" line is not visible in this listing.
855 public UnicodeSet getTailoredSet() {
857 CollationRuleParser src = new CollationRuleParser(getRules());
858 return src.getTailoredSet();
// Any failure is converted to IllegalStateException.
// NOTE(review): the caught exception is not passed along as the cause, so the original
// failure detail is lost to callers.
859 } catch (Exception e) {
860 throw new IllegalStateException("A tailoring rule should not " + "have errors. Something is quite wrong!");
// Parameter bundle passed to addSpecial/processSpecials: the collator whose tables are being
// scanned, the output sets to fill (either may be null), and the set of contractions to skip.
// NOTE(review): the addPrefixes field declaration and the coll assignment in the constructor
// are not visible in this listing.
864 private static class contContext {
865 RuleBasedCollator coll;
866 UnicodeSet contractions;
867 UnicodeSet expansions;
868 UnicodeSet removedContractions;
// Plain field-by-field initialisation; no validation is visible.
871 contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
872 UnicodeSet removedContractions, boolean addPrefixes) {
874 this.contractions = contractions;
875 this.expansions = expansions;
876 this.removedContractions = removedContractions;
877 this.addPrefixes = addPrefixes;
// Recursively walks the contraction-table entries reachable from CE, building candidate strings
// from `buffer` and adding them to c.contractions / c.expansions when those sets are non-null.
// NOTE(review): several lines of this method are not visible in this listing (e.g. the offset
// increments inside the loops, the code that seeds `b` from `buffer`, and the first half of the
// conditions ending at original lines 910 and 930) — read the comments below as describing
// only the visible statements.
881 private void addSpecial(contContext c, StringBuilder buffer, int CE) {
882 StringBuilder b = new StringBuilder();
// The low 24 bits of CE, rebased by m_contractionOffset_, index the contraction tables.
883 int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
884 int newCE = c.coll.m_contractionCE_[offset];
885 // we might have a contraction that ends from previous level
886 if (newCE != CollationElementIterator.CE_NOT_FOUND_) {
// Contraction CE whose first entry is a SPEC_PROC (prefix) CE: recurse when prefixes are wanted.
887 if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_ && isSpecial(newCE)
888 && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
889 addSpecial(c, buffer, newCE);
// Only strings longer than one char are recorded here.
891 if (buffer.length() > 1) {
892 if (c.contractions != null) {
893 c.contractions.add(buffer.toString());
895 if (c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
896 c.expansions.add(buffer.toString());
902 // check whether we're doing contraction or prefix
// Prefix case: table characters are inserted at the FRONT of b.
903 if (getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
// The loop stops at the 0xFFFF entry in m_contractionIndex_.
904 while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
// Reset the scratch builder for this entry.
905 b.delete(0, b.length());
907 newCE = c.coll.m_contractionCE_[offset];
908 b.insert(0, c.coll.m_contractionIndex_[offset]);
// Nested contraction/prefix CE: recurse with the extended string.
910 && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
911 addSpecial(c, b, newCE);
913 if (c.contractions != null) {
914 c.contractions.add(b.toString());
916 if (c.expansions != null && isSpecial(newCE)
917 && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
918 c.expansions.add(b.toString());
// Contraction case: same walk, but table characters are APPENDED to b.
923 } else if (getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
924 while (c.coll.m_contractionIndex_[offset] != 0xFFFF) {
925 b.delete(0, b.length());
927 newCE = c.coll.m_contractionCE_[offset];
928 b.append(c.coll.m_contractionIndex_[offset]);
930 && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
931 addSpecial(c, b, newCE);
933 if (c.contractions != null) {
934 c.contractions.add(b.toString());
936 if (c.expansions != null && isSpecial(newCE)
937 && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
938 c.expansions.add(b.toString());
// Iterates every range in the collator's trie and, depending on the tag of the range's CE,
// either expands contractions/prefixes via addSpecial or adds the code points to c.expansions.
// NOTE(review): some lines are not visible in this listing (e.g. an outer condition around the
// tag checks, the body of the removedContractions branch, and the start++ in the first loop).
946 private void processSpecials(contContext c) {
// Initial capacity for the per-range scratch builder.
947 int internalBufferSize = 512;
948 TrieIterator trieiterator = new TrieIterator(c.coll.m_trie_);
949 RangeValueIterator.Element element = new RangeValueIterator.Element();
950 while (trieiterator.next(element)) {
// The loops below treat [start, limit) as a half-open range sharing one CE value.
951 int start = element.start;
952 int limit = element.limit;
953 int CE = element.value;
954 StringBuilder contraction = new StringBuilder(internalBufferSize);
// Contraction CEs always qualify; SPEC_PROC (prefix) CEs only when c.addPrefixes is set.
957 if (((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
958 while (start < limit) {
959 // if there are suppressed contractions, we don't
961 if (c.removedContractions != null && c.removedContractions.contains(start)) {
965 // we start our contraction from middle, since we don't know if it
966 // will grow toward right or left
// NOTE(review): the cast keeps only the low 16 bits of start; if start can be a
// supplementary code point this would truncate it — confirm.
967 contraction.append((char) start);
968 addSpecial(c, contraction, CE);
// Expansion CEs: every code point in the range goes into the expansions set, if requested.
971 } else if (c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
972 while (start < limit) {
973 c.expansions.add(start++);
981 * Gets unicode sets containing contractions and/or expansions of a collator
983 * @param contractions
984 * if not null, set to contain contractions
986 * if not null, set to contain expansions
988 * add the prefix contextual elements to contractions
990 * Throws an exception if any error occurs.
993 public void getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions, boolean addPrefixes)
995 if (contractions != null) {
996 contractions.clear();
998 if (expansions != null) {
1001 String rules = getRules();
1003 CollationRuleParser src = new CollationRuleParser(rules);
1004 contContext c = new contContext(RuleBasedCollator.UCA_, contractions, expansions, src.m_removeSet_,
1007 // Add the UCA contractions
1009 // This is collator specific. Add contractions from a collator
1011 c.removedContractions = null;
1013 } catch (Exception e) {
1020 * Get a Collation key for the argument String source from this RuleBasedCollator.
1023 * General recommendation: <br>
1024 * If comparison are to be done to the same String multiple times, it would be more efficient to generate
1025 * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If the each
1026 * Strings are compared to only once, using the method RuleBasedCollator.compare(String, String) will have a better
1030 * See the class documentation for an explanation about CollationKeys.
1034 * the text String to be transformed into a collation key.
1035 * @return the CollationKey for the given String based on this RuleBasedCollator's collation rules. If the source
1036 * String is null, a null CollationKey is returned.
1038 * @see #compare(String, String)
1039 * @see #getRawCollationKey
1042 public CollationKey getCollationKey(String source) {
1043 if (source == null) {
1046 CollationBuffer buffer = null;
1048 buffer = getCollationBuffer();
1049 return getCollationKey(source, buffer);
1051 releaseCollationBuffer(buffer);
1055 private CollationKey getCollationKey(String source, CollationBuffer buffer) {
1056 buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_, buffer);
1057 return new CollationKey(source, buffer.m_utilRawCollationKey_);
1061 * Gets the simpler form of a CollationKey for the String source following the rules of this Collator and stores the
1062 * result into the user provided argument key. If key has an internal byte array of length that's too small for the
1063 * result, the internal byte array will be grown to the exact required size.
1065 * @param source the text String to be transformed into a RawCollationKey
1066 * @param key output RawCollationKey to store results
1067 * @return If key is null, a new instance of RawCollationKey will be created and returned, otherwise the user
1068 * provided key will be returned.
1069 * @see #getCollationKey
1070 * @see #compare(String, String)
1071 * @see RawCollationKey
1074 public RawCollationKey getRawCollationKey(String source, RawCollationKey key) {
1075 if (source == null) {
1078 CollationBuffer buffer = null;
1080 buffer = getCollationBuffer();
1081 return getRawCollationKey(source, key, buffer);
1083 releaseCollationBuffer(buffer);
1087 private RawCollationKey getRawCollationKey(String source, RawCollationKey key, CollationBuffer buffer) {
1088 int strength = getStrength();
1089 buffer.m_utilCompare0_ = m_isCaseLevel_;
1090 // m_utilCompare1_ = true;
1091 buffer.m_utilCompare2_ = strength >= SECONDARY;
1092 buffer.m_utilCompare3_ = strength >= TERTIARY;
1093 buffer.m_utilCompare4_ = strength >= QUATERNARY;
1094 buffer.m_utilCompare5_ = strength == IDENTICAL;
1096 boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
1097 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
1098 // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
1100 int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
1102 if (m_isHiragana4_ && buffer.m_utilCompare4_) {
1103 // allocate one more space for hiragana, value for hiragana
1104 hiragana4 = (byte) commonBottom4;
1108 int bottomCount4 = 0xFF - commonBottom4;
1109 // If we need to normalize, we'll do it all at once at the beginning!
1110 if (buffer.m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
1111 // if it is identical strength, we have to normalize the string to
1112 // NFD so that it will be appended correctly to the end of the sort
1114 source = Normalizer.decompose(source, false);
1115 } else if (getDecomposition() != NO_DECOMPOSITION
1116 && Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.YES) {
1117 // for the rest of the strength, if decomposition is on, FCD is
1118 // enough for us to work on.
1119 source = Normalizer.normalize(source, Normalizer.FCD);
1121 getSortKeyBytes(source, doFrench, hiragana4, commonBottom4, bottomCount4, buffer);
1123 key = new RawCollationKey();
1125 getSortKey(source, doFrench, commonBottom4, bottomCount4, key, buffer);
1130 * Return true if an uppercase character is sorted before the corresponding lowercase character. See
1131 * setCaseFirst(boolean) for details.
1133 * @see #setUpperCaseFirst
1134 * @see #setLowerCaseFirst
1135 * @see #isLowerCaseFirst
1136 * @see #setCaseFirstDefault
1137 * @return true if upper cased characters are sorted before lower cased characters, false otherwise
1140 public boolean isUpperCaseFirst() {
1141 return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
1145 * Return true if a lowercase character is sorted before the corresponding uppercase character. See
1146 * setCaseFirst(boolean) for details.
1148 * @see #setUpperCaseFirst
1149 * @see #setLowerCaseFirst
1150 * @see #isUpperCaseFirst
1151 * @see #setCaseFirstDefault
1152 * @return true lower cased characters are sorted before upper cased characters, false otherwise
1155 public boolean isLowerCaseFirst() {
1156 return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
1160 * Checks if the alternate handling behaviour is the UCA defined SHIFTED or NON_IGNORABLE. If return value is true,
1161 * then the alternate handling attribute for the Collator is SHIFTED. Otherwise if return value is false, then the
1162 * alternate handling attribute for the Collator is NON_IGNORABLE See setAlternateHandlingShifted(boolean) for more
1165 * @return true or false
1166 * @see #setAlternateHandlingShifted(boolean)
1167 * @see #setAlternateHandlingDefault
1170 public boolean isAlternateHandlingShifted() {
1171 return m_isAlternateHandlingShifted_;
1175 * Checks if case level is set to true. See setCaseLevel(boolean) for details.
1177 * @return the case level mode
1178 * @see #setCaseLevelDefault
1180 * @see #setCaseLevel(boolean)
1183 public boolean isCaseLevel() {
1184 return m_isCaseLevel_;
1188 * Checks if French Collation is set to true. See setFrenchCollation(boolean) for details.
1190 * @return true if French Collation is set to true, false otherwise
1191 * @see #setFrenchCollation(boolean)
1192 * @see #setFrenchCollationDefault
1195 public boolean isFrenchCollation() {
1196 return m_isFrenchCollation_;
1200 * Checks if the Hiragana Quaternary mode is set on. See setHiraganaQuaternary(boolean) for more details.
1202 * This attribute is an implementation detail of the CLDR Japanese tailoring.
1203 * The implementation might change to use a different mechanism
1204 * to achieve the same Japanese sort order.
1205 * Since ICU 50, this attribute is not settable any more via API functions.
1207 * @return flag true if Hiragana Quaternary mode is on, false otherwise
1208 * @see #setHiraganaQuaternaryDefault
1209 * @see #setHiraganaQuaternary(boolean)
1210 * @deprecated ICU 50 Implementation detail, cannot be set via API, might be removed from implementation.
1212 public boolean isHiraganaQuaternary() {
1213 return m_isHiragana4_;
1217 * Gets the variable top value of a Collator. Lower 16 bits are undefined and should be ignored.
1219 * @return the variable top value of a Collator.
1220 * @see #setVariableTop
1223 public int getVariableTop() {
1224 return m_variableTopValue_ << 16;
1228 * Method to retrieve the numeric collation value. When numeric collation is turned on, this Collator generates a
1229 * collation key for the numeric value of substrings of digits. This is a way to get '100' to sort AFTER '2'
1231 * @see #setNumericCollation
1232 * @see #setNumericCollationDefault
1233 * @return true if numeric collation is turned on, false otherwise
1236 public boolean getNumericCollation() {
1237 return m_isNumericCollation_;
1241 * Retrieves the reordering codes for this collator.
1242 * These reordering codes are a combination of UScript codes and ReorderCodes.
1243 * @return a copy of the reordering codes for this collator;
1244 * if none are set then returns an empty array
1245 * @see #setReorderCodes
1246 * @see #getEquivalentReorderCodes
1249 public int[] getReorderCodes() {
1250 if (m_reorderCodes_ != null) {
1251 return m_reorderCodes_.clone();
1253 return LeadByteConstants.EMPTY_INT_ARRAY;
1258 * Retrieves all the reorder codes that are grouped with the given reorder code. Some reorder
1259 * codes are grouped and must reorder together.
1261 * @param reorderCode code for which equivalents to be retrieved
1262 * @return the set of all reorder codes in the same group as the given reorder code.
1263 * @see #setReorderCodes
1264 * @see #getReorderCodes
1267 public static int[] getEquivalentReorderCodes(int reorderCode) {
1268 Set<Integer> equivalentCodesSet = new HashSet<Integer>();
1269 int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(reorderCode);
1270 for (int leadByte : leadBytes) {
1271 int[] codes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getReorderCodesForLeadByte(leadByte);
1272 for (int code : codes) {
1273 equivalentCodesSet.add(code);
1276 int[] equivalentCodes = new int[equivalentCodesSet.size()];
1278 for (int code : equivalentCodesSet) {
1279 equivalentCodes[i++] = code;
1281 return equivalentCodes;
1284 // public other methods -------------------------------------------------
1287 * Compares the equality of two RuleBasedCollator objects. RuleBasedCollator objects are equal if they have the same
1288 * collation rules and the same attributes.
1291 * the RuleBasedCollator to be compared to.
1292 * @return true if this RuleBasedCollator has exactly the same collation behaviour as obj, false otherwise.
1295 public boolean equals(Object obj) {
1297 return false; // super does class check
1302 if (getClass() != obj.getClass()) {
1305 RuleBasedCollator other = (RuleBasedCollator) obj;
1306 // all other non-transient information is also contained in rules.
1307 if (getStrength() != other.getStrength() || getDecomposition() != other.getDecomposition()
1308 || other.m_caseFirst_ != m_caseFirst_ || other.m_caseSwitch_ != m_caseSwitch_
1309 || other.m_isAlternateHandlingShifted_ != m_isAlternateHandlingShifted_
1310 || other.m_isCaseLevel_ != m_isCaseLevel_ || other.m_isFrenchCollation_ != m_isFrenchCollation_
1311 || other.m_isHiragana4_ != m_isHiragana4_) {
1314 if (m_reorderCodes_ != null ^ other.m_reorderCodes_ != null) {
1317 if (m_reorderCodes_ != null) {
1318 if (m_reorderCodes_.length != other.m_reorderCodes_.length) {
1321 for (int i = 0; i < m_reorderCodes_.length; i++) {
1322 if (m_reorderCodes_[i] != other.m_reorderCodes_[i]) {
1327 boolean rules = m_rules_ == other.m_rules_;
1328 if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
1329 rules = m_rules_.equals(other.m_rules_);
1331 if (!rules || !ICUDebug.enabled("collation")) {
1334 if (m_addition3_ != other.m_addition3_ || m_bottom3_ != other.m_bottom3_
1335 || m_bottomCount3_ != other.m_bottomCount3_ || m_common3_ != other.m_common3_
1336 || m_isSimple3_ != other.m_isSimple3_ || m_mask3_ != other.m_mask3_
1337 || m_minContractionEnd_ != other.m_minContractionEnd_ || m_minUnsafe_ != other.m_minUnsafe_
1338 || m_top3_ != other.m_top3_ || m_topCount3_ != other.m_topCount3_
1339 || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
1342 if (!m_trie_.equals(other.m_trie_)) {
1343 // we should use the trie iterator here, but then this part is
1344 // only used in the test.
1345 for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i--) {
1346 int v = m_trie_.getCodePointValue(i);
1347 int otherv = other.m_trie_.getCodePointValue(i);
1349 int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
1350 if (mask == (otherv & 0xff000000)) {
1353 if (mask == 0xf1000000) {
1354 v -= (m_expansionOffset_ << 4);
1355 otherv -= (other.m_expansionOffset_ << 4);
1356 } else if (mask == 0xf2000000) {
1357 v -= m_contractionOffset_;
1358 otherv -= other.m_contractionOffset_;
1368 if (!Arrays.equals(m_contractionCE_, other.m_contractionCE_)
1369 || !Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
1370 || !Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
1371 || !Arrays.equals(m_expansion_, other.m_expansion_)
1372 || !Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
1375 // not comparing paddings
1376 for (int i = 0; i < m_expansionEndCE_.length; i++) {
1377 if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) {
1385 * Generates a unique hash code for this RuleBasedCollator.
1387 * @return the unique hash code for this Collator
1390 public int hashCode() {
1391 String rules = getRules();
1392 if (rules == null) {
1395 return rules.hashCode();
1399 * Compares the source text String to the target text String according to the collation rules, strength and
1400 * decomposition mode for this RuleBasedCollator. Returns an integer less than, equal to or greater than zero
1401 * depending on whether the source String is less than, equal to or greater than the target String. See the Collator
1402 * class description for an example of use. </p>
1404 * General recommendation: <br>
1405 * If comparison are to be done to the same String multiple times, it would be more efficient to generate
1406 * CollationKeys for the Strings and use CollationKey.compareTo(CollationKey) for the comparisons. If speed
1407 * performance is critical and object instantiation is to be reduced, further optimization may be achieved by
1408 * generating a simpler key of the form RawCollationKey and reusing this RawCollationKey object with the method
1409 * RuleBasedCollator.getRawCollationKey. Internal byte representation can be directly accessed via RawCollationKey
1410 * and stored for future use. Like CollationKey, RawCollationKey provides a method RawCollationKey.compareTo for key
1411 * comparisons. If the each Strings are compared to only once, using the method RuleBasedCollator.compare(String,
1412 * String) will have a better performance.
1416 * the source text String.
1418 * the target text String.
1419 * @return Returns an integer value. Value is less than zero if source is less than target, value is zero if source
1420 * and target are equal, value is greater than zero if source is greater than target.
1422 * @see #getCollationKey
1425 public int compare(String source, String target) {
1426 if (source.equals(target)) {
1429 CollationBuffer buffer = null;
1431 buffer = getCollationBuffer();
1432 return compare(source, target, buffer);
1434 releaseCollationBuffer(buffer);
1438 private int compare(String source, String target, CollationBuffer buffer) {
1439 // Find the length of any leading portion that is equal
1440 int offset = getFirstUnmatchedOffset(source, target);
1441 // return compareRegular(source, target, offset);
1443 if ((offset < source.length() && source.charAt(offset) > ENDOFLATINONERANGE_)
1444 || (offset < target.length() && target.charAt(offset) > ENDOFLATINONERANGE_)) {
1445 // source or target start with non-latin-1
1446 return compareRegular(source, target, offset, buffer);
1448 return compareUseLatin1(source, target, offset, buffer);
1451 return compareRegular(source, target, offset, buffer);
1455 // package private inner interfaces --------------------------------------
1458 * Attribute values to be used when setting the Collator options
1460 static interface AttributeValue {
1462 * Indicates that the default attribute value will be used. See individual attribute for details on its default
1465 static final int DEFAULT_ = -1;
1467 * Primary collation strength
1469 static final int PRIMARY_ = Collator.PRIMARY;
1471 * Secondary collation strength
1473 static final int SECONDARY_ = Collator.SECONDARY;
1475 * Tertiary collation strength
1477 static final int TERTIARY_ = Collator.TERTIARY;
1479 * Default collation strength
1481 static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
1483 * Internal use for strength checks in Collation elements
1485 static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
1487 * Quaternary collation strength
1489 static final int QUATERNARY_ = 3;
1491 * Identical collation strength
1493 static final int IDENTICAL_ = Collator.IDENTICAL;
1495 * Internal use for strength checks
1497 static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
1499 * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and
1500 * DECOMPOSITION_MODE
1502 static final int OFF_ = 16;
1504 * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1506 static final int ON_ = 17;
1508 * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
1510 static final int SHIFTED_ = 20;
1512 * Valid for ALTERNATE_HANDLING. Alternate handling will be non ignorable
1514 static final int NON_IGNORABLE_ = 21;
1516 * Valid for CASE_FIRST - lower case sorts before upper case
1518 static final int LOWER_FIRST_ = 24;
1520 * Upper case sorts before lower case
1522 static final int UPPER_FIRST_ = 25;
1524 * Number of attribute values
1526 static final int LIMIT_ = 29;
/**
 * Attributes that the collation service understands. All the attributes can take the DEFAULT value, as well as
 * the values specific to each one.
 */
interface Attribute {
    /**
     * Direction of secondary weights - used in French. Acceptable values are ON, which results in secondary
     * weights being considered backwards, and OFF, which treats secondary weights in the order they appear.
     */
    int FRENCH_COLLATION_ = 0;

    /**
     * Handling of variable elements. Acceptable values are NON_IGNORABLE (default), which treats all the
     * codepoints with non-ignorable primary weights in the same way, and SHIFTED, which causes codepoints with
     * primary weights that are equal or below the variable top value to be ignored on primary level and moved to
     * the quaternary level.
     */
    int ALTERNATE_HANDLING_ = 1;

    /**
     * Ordering of upper and lower case letters. Acceptable values are OFF (default), which orders upper and lower
     * case letters in accordance to their tertiary weights, UPPER_FIRST, which forces upper case letters to sort
     * before lower case letters, and LOWER_FIRST, which does the opposite.
     */
    int CASE_FIRST_ = 2;

    /**
     * Whether an extra case level (positioned before the third level) is generated. Acceptable values are OFF
     * (default), when the case level is not generated, and ON, which causes the case level to be generated.
     * Contents of the case level are affected by the value of the CASE_FIRST attribute. A simple way to ignore
     * accent differences in a string is to set the strength to PRIMARY and enable the case level.
     */
    int CASE_LEVEL_ = 3;

    /**
     * Whether the normalization check and necessary normalizations are performed. When set to OFF (default) no
     * normalization check is performed. The correctness of the result is guaranteed only if the input data is in
     * so-called FCD form (see users manual for more info). When set to ON, an incremental check is performed to
     * see whether the input data is in the FCD form. If the data is not in the FCD form, incremental NFD
     * normalization is performed.
     */
    int NORMALIZATION_MODE_ = 4;

    /**
     * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. The usual
     * strength for most locales (except Japanese) is tertiary. Quaternary strength is useful when combined with
     * the shifted setting for the alternate handling attribute and for JIS x 4061 collation, when it is used to
     * distinguish between Katakana and Hiragana (this is achieved by setting the HIRAGANA_QUATERNARY mode to on).
     * Otherwise, the quaternary level is affected only by the number of non ignorable code points in the string.
     * Identical strength is rarely useful, as it amounts to codepoints of the NFD form of the string.
     */
    int STRENGTH_ = 5;

    /**
     * When turned on, this attribute positions Hiragana before all non-ignorables on quaternary level. This is a
     * sneaky way to produce JIS sort order.
     */
    int HIRAGANA_QUATERNARY_MODE_ = 6;

    /** Number of attributes declared above. */
    int LIMIT_ = 7;
}
1588 * DataManipulate singleton
1590 static class DataManipulate implements Trie.DataManipulate {
1591 // public methods ----------------------------------------------------
1594 * Internal method called to parse a lead surrogate's ce for the offset to the next trail surrogate data.
1597 * collation element of the lead surrogate
1598 * @return data offset or 0 for the next trail surrogate
1601 public final int getFoldingOffset(int ce) {
1602 if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
1603 return (ce & 0xFFFFFF);
1609 * Get singleton object
1611 public static final DataManipulate getInstance() {
1612 if (m_instance_ == null) {
1613 m_instance_ = new DataManipulate();
1618 // private data member ----------------------------------------------
1621 * Singleton instance
1623 private static DataManipulate m_instance_;
1625 // private constructor ----------------------------------------------
1628 * private to prevent initialization
1630 private DataManipulate() {
1637 static final class UCAConstants {
1638 int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1639 int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1640 int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
1641 int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
1642 int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
1643 int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
1644 int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
1645 int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
1646 int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
1647 int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
1648 int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
1649 int FIRST_IMPLICIT_[] = new int[2];
1650 int LAST_IMPLICIT_[] = new int[2];
1651 int FIRST_TRAILING_[] = new int[2];
1652 int LAST_TRAILING_[] = new int[2];
1653 int PRIMARY_TOP_MIN_;
1654 int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
1655 int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
1656 int PRIMARY_TRAILING_MIN_; // 0xE8000000
1657 int PRIMARY_TRAILING_MAX_; // 0xF0000000
1658 int PRIMARY_SPECIAL_MIN_; // 0xE8000000
1659 int PRIMARY_SPECIAL_MAX_; // 0xF0000000
1663 * Script to Lead Byte and Lead Byte to Script Data
1666 static final class LeadByteConstants {
1667 private static final int DATA_MASK_FOR_INDEX = 0x8000;
1668 private static final int[] EMPTY_INT_ARRAY = new int[0];
1670 private int serializedSize = 0;
1672 private Map<Integer, Integer> SCRIPT_TO_LEAD_BYTES_INDEX;
1673 private byte[] SCRIPT_TO_LEAD_BYTES_DATA;
1675 private int[] LEAD_BYTE_TO_SCRIPTS_INDEX;
1676 private byte[] LEAD_BYTE_TO_SCRIPTS_DATA;
1678 LeadByteConstants() {
1681 void read(DataInputStream dis) throws IOException {
1686 // script to lead bytes
1687 indexCount = dis.readShort();
1689 dataSize = dis.readShort();
1691 this.SCRIPT_TO_LEAD_BYTES_INDEX = new HashMap<Integer, Integer>();
1692 //System.out.println("Script to Lead Bytes Index - Count = " + indexCount);
1693 for (int index = 0; index < indexCount; index++) {
1694 int reorderCode = dis.readShort(); // reorder code
1696 int dataOffset = 0xffff & dis.readShort(); // data offset
1698 // System.out.println("\t-------------");
1699 // System.out.println("\toffset = " + Integer.toHexString(readcount - 4));
1700 // System.out.println("\treorderCode = " + Integer.toHexString(reorderCode));
1701 // System.out.println("\tdataOffset = " + Integer.toHexString(dataOffset));
1702 this.SCRIPT_TO_LEAD_BYTES_INDEX.put(reorderCode, dataOffset);
1705 this.SCRIPT_TO_LEAD_BYTES_DATA = new byte[dataSize * 2];
1706 dis.readFully(this.SCRIPT_TO_LEAD_BYTES_DATA, 0, this.SCRIPT_TO_LEAD_BYTES_DATA.length);
1707 readcount += this.SCRIPT_TO_LEAD_BYTES_DATA.length;
1709 // lead byte to scripts
1710 indexCount = dis.readShort();
1712 dataSize = dis.readShort();
1714 this.LEAD_BYTE_TO_SCRIPTS_INDEX = new int[indexCount];
1715 //System.out.println("Lead Byte to Scripts Index - Count = " + indexCount);
1716 for (int index = 0; index < indexCount; index++) {
1717 this.LEAD_BYTE_TO_SCRIPTS_INDEX[index] = 0xffff & dis.readShort();
1719 // System.out.println("\t-------------");
1720 // System.out.println("\toffset = " + Integer.toHexString(readcount - 2));
1721 // System.out.println("\tindex = " + Integer.toHexString(index));
1722 // System.out.println("\tdataOffset = " + Integer.toHexString(this.LEAD_BYTE_TO_SCRIPTS_INDEX[index]));
1725 this.LEAD_BYTE_TO_SCRIPTS_DATA = new byte[dataSize * 2];
1726 dis.readFully(this.LEAD_BYTE_TO_SCRIPTS_DATA, 0, this.LEAD_BYTE_TO_SCRIPTS_DATA.length);
1727 readcount += this.LEAD_BYTE_TO_SCRIPTS_DATA.length;
1729 this.serializedSize = readcount;
1732 int getSerializedDataSize() {
1733 return this.serializedSize;
1736 int[] getReorderCodesForLeadByte(int leadByte) {
1737 if (leadByte >= this.LEAD_BYTE_TO_SCRIPTS_INDEX.length) {
1738 return EMPTY_INT_ARRAY;
1740 int offset = this.LEAD_BYTE_TO_SCRIPTS_INDEX[leadByte];
1742 return EMPTY_INT_ARRAY;
1745 if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
1746 reorderCodes = new int[1];
1747 reorderCodes[0] = offset & ~DATA_MASK_FOR_INDEX;
1749 int length = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset);
1752 reorderCodes = new int[length];
1753 for (int code = 0; code < length; code++, offset++) {
1754 reorderCodes[code] = readShort(this.LEAD_BYTE_TO_SCRIPTS_DATA, offset);
1757 return reorderCodes;
1760 int[] getLeadBytesForReorderCode(int reorderCode) {
1761 if (!this.SCRIPT_TO_LEAD_BYTES_INDEX.containsKey(reorderCode)) {
1762 return EMPTY_INT_ARRAY;
1764 int offset = this.SCRIPT_TO_LEAD_BYTES_INDEX.get(reorderCode);
1767 return EMPTY_INT_ARRAY;
1771 if ((offset & DATA_MASK_FOR_INDEX) == DATA_MASK_FOR_INDEX) {
1772 leadBytes = new int[1];
1773 leadBytes[0] = offset & ~DATA_MASK_FOR_INDEX;
1775 int length = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset);
1778 leadBytes = new int[length];
1779 for (int leadByte = 0; leadByte < length; leadByte++, offset++) {
1780 leadBytes[leadByte] = readShort(this.SCRIPT_TO_LEAD_BYTES_DATA, offset);
1786 private static int readShort(byte[] data, int offset) {
1787 return (0xff & data[offset * 2]) << 8 | (data[offset * 2 + 1] & 0xff);
1791 // package private data member -------------------------------------------
1793 static final byte BYTE_FIRST_TAILORED_ = (byte) 0x04;
1794 static final byte BYTE_COMMON_ = (byte) 0x05;
1795 static final int COMMON_TOP_2_ = 0x86; // int for unsigness
1796 static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
1797 static final int COMMON_BOTTOM_3 = 0x05;
1799 * Case strength mask
1801 static final int CE_CASE_BIT_MASK_ = 0xC0;
1802 static final int CE_TAG_SHIFT_ = 24;
1803 static final int CE_TAG_MASK_ = 0x0F000000;
1805 static final int CE_SPECIAL_FLAG_ = 0xF0000000;
1807 * Lead surrogate that is tailored and doesn't start a contraction
1809 static final int CE_SURROGATE_TAG_ = 5;
1811 * Mask to get the primary strength of the collation element
1813 static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
1815 * Mask to get the secondary strength of the collation element
1817 static final int CE_SECONDARY_MASK_ = 0xFF00;
1819 * Mask to get the tertiary strength of the collation element
1821 static final int CE_TERTIARY_MASK_ = 0xFF;
1823 * Primary strength shift
1825 static final int CE_PRIMARY_SHIFT_ = 16;
1827 * Secondary strength shift
1829 static final int CE_SECONDARY_SHIFT_ = 8;
1831 * Continuation marker
1833 static final int CE_CONTINUATION_MARKER_ = 0xC0;
1836 * Size of collator raw data headers and options before the expansion data. This is used when expansion ces are to
1837 * be retrieved. ICU4C uses the expansion offset starting from UCollator.UColHeader, hence ICU4J will have to minus
1838 * that off to get the right expansion ce offset. In number of ints.
1840 int m_expansionOffset_;
1842 * Size of collator raw data headers, options and expansions before contraction data. This is used when contraction
1843 * ces are to be retrieved. ICU4C uses contraction offset starting from UCollator.UColHeader, hence ICU4J will have
1844 * to minus that off to get the right contraction ce offset. In number of chars.
1846 int m_contractionOffset_;
1848 * Flag indicator if Jamo is special
1850 boolean m_isJamoSpecial_;
1852 // Collator options ------------------------------------------------------
1854 int m_defaultVariableTopValue_;
1855 boolean m_defaultIsFrenchCollation_;
1856 boolean m_defaultIsAlternateHandlingShifted_;
1857 int m_defaultCaseFirst_;
1858 boolean m_defaultIsCaseLevel_;
1859 int m_defaultDecomposition_;
1860 int m_defaultStrength_;
1861 boolean m_defaultIsHiragana4_;
1862 boolean m_defaultIsNumericCollation_;
// Per-collator option values, contraction/expansion tables, version information and the
// class-wide UCA statics.
// NOTE(review): the embedded numbering has gaps (e.g. 1877 is followed directly by 1881), so
// some short declarations and the comment delimiters appear to be absent from this listing -
// confirm against the complete file before relying on it.
1864 * Default script order - the one created at initial rule parse time
1866 int[] m_defaultReorderCodes_;
1869 * Value of the variable top
1871 int m_variableTopValue_;
1873 * Attribute for special Hiragana
1875 boolean m_isHiragana4_;
1877 * Case sorting customization
1881 * Numeric collation option
1883 boolean m_isNumericCollation_;
1887 int[] m_reorderCodes_;
1889 // end Collator options --------------------------------------------------
1896 * Contraction index table
1898 char m_contractionIndex_[];
1900 * Contraction CE table
1902 int m_contractionCE_[];
1908 * Table to store all collation elements that are the last element of an expansion. This is for use in StringSearch.
1910 int m_expansionEndCE_[];
1912 * Table to store the maximum size of any expansions that end with the corresponding collation element in
1913 * m_expansionEndCE_. For use in StringSearch too
1915 byte m_expansionEndCEMaxSize_[];
1917 * Heuristic table to store information on whether a char character is considered "unsafe". "Unsafe" character are
1918 * combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is the
1919 * only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one above,
1920 * then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1924 * Table to store information on whether a codepoint can occur as the last character in a contraction
// Bit table read by isContractionEnd(): byte index is ch >> HEURISTIC_SHIFT_, bit index is ch & HEURISTIC_MASK_.
1926 byte m_contractionEnd_[];
1928 * Original collation rules
1932 * The smallest "unsafe" codepoint
1936 * The smallest codepoint that could be the end of a contraction
1938 char m_minContractionEnd_;
1940 * General version of the collator
1942 VersionInfo m_version_;
// The UCA and UCD versions are compared against those of UCA_ in RuleBasedCollator(ULocale).
1946 VersionInfo m_UCA_version_;
1950 VersionInfo m_UCD_version_;
1952 * Lead byte and script data
1954 int m_leadByteToScripts;
1955 int m_scriptToLeadBytes;
1957 * UnicodeData.txt property object
// The statics declared without an initializer are assigned by the static initialization code
// (UCA_CONSTANTS_, LEADBYTE_CONSTANTS_, UCA_CONTRACTIONS_, MAX_UCA_CONTRACTION_LENGTH, impCEGen_
// are visibly assigned there, followed by UCA_INIT_COMPLETE = true).
1959 static final RuleBasedCollator UCA_;
1963 static final UCAConstants UCA_CONSTANTS_;
1965 * Lead Byte Constants
1967 static LeadByteConstants LEADBYTE_CONSTANTS_;
1969 * Table for UCA and builder use
1971 static final char UCA_CONTRACTIONS_[];
1972 static final int MAX_UCA_CONTRACTION_LENGTH;
// Read by checkUCA(): a null UCA_ is only reported once this flag is true.
1974 private static boolean UCA_INIT_COMPLETE;
1977 * Implicit generator
1979 static final ImplicitCEGenerator impCEGen_;
1981 static final byte SORT_LEVEL_TERMINATOR_ = 1;
1983 // These are values from UCA required for
1984 // implicit generation and suppressing sort key compression
1985 // they should regularly be in the UCA, but if one
1986 // is running without UCA, it could be a problem
// maxRegularPrimary is the upper bound tested by isCompressible(); the two implicit bounds are
// the arguments passed to the ImplicitCEGenerator constructor in the static initialization code.
1987 static final int maxRegularPrimary = 0x7A;
1988 static final int minImplicitPrimary = 0xE0;
1989 static final int maxImplicitPrimary = 0xE4;
// Static initialization of the UCA data. Everything is first built into local variables; the
// locals are copied into the static fields afterwards and UCA_INIT_COMPLETE is set last.
// NOTE(review): the lines opening the static block and the try statement, and the bodies of
// the catch clauses, are not visible in this listing (numbering gaps at 1992, 1996, 2023, 2027-2029).
1991 // block to initialise character property database
1993 // take pains to let static class init succeed, otherwise the class itself won't exist and
1994 // clients will get a NoClassDefFoundError. Instead, make the constructors fail if
1995 // we can't load the UCA data.
1997 RuleBasedCollator iUCA_ = null;
1998 UCAConstants iUCA_CONSTANTS_ = null;
1999 LeadByteConstants iLEADBYTE_CONSTANTS = null;
2000 char iUCA_CONTRACTIONS_[] = null;
2001 Output<Integer> maxUCAContractionLength = new Output<Integer>();
2002 ImplicitCEGenerator iimpCEGen_ = null;
2004 // !!! note what's going on here...
2005 // even though the static init of the class is not yet complete, we
2006 // instantiate an instance of the class. So we'd better be sure that
2007 // instantiation doesn't rely on the static initialization that's
2008 // not complete yet!
2009 iUCA_ = new RuleBasedCollator();
2010 iUCA_CONSTANTS_ = new UCAConstants();
2011 iLEADBYTE_CONSTANTS = new LeadByteConstants();
// The returned char array becomes UCA_CONTRACTIONS_; presumably read() also populates the
// objects passed in, including maxUCAContractionLength - TODO confirm in CollatorReader.
2012 iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_, iLEADBYTE_CONSTANTS, maxUCAContractionLength);
2014 // called before doing canonical closure for the UCA.
2015 iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
2016 // iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_,
2017 // iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
// The "UCARules" string of the ENGLISH collation bundle is stored as the rules of the UCA collator.
2019 ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
2020 ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
2021 iUCA_.m_rules_ = (String) rb.getObject("UCARules");
2022 } catch (MissingResourceException ex) {
2024 } catch (IOException e) {
2025 // e.printStackTrace();
2026 // throw new MissingResourceException(e.getMessage(),"","");
// Copy the locals into the static fields.
2030 UCA_CONSTANTS_ = iUCA_CONSTANTS_;
2031 LEADBYTE_CONSTANTS_ = iLEADBYTE_CONSTANTS;
2032 UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
// NOTE(review): this unboxes an Integer into an int. If loading failed before the value was
// set, the unboxing would presumably throw a NullPointerException during class initialization,
// which is what the comment at the top tries to avoid - confirm how Output initializes value.
2033 MAX_UCA_CONTRACTION_LENGTH = maxUCAContractionLength.value;
2034 impCEGen_ = iimpCEGen_;
// From here on checkUCA() reports a missing UCA_.
2036 UCA_INIT_COMPLETE = true;
2039 private static void checkUCA() throws MissingResourceException {
2040 if (UCA_INIT_COMPLETE && UCA_ == null) {
2041 throw new MissingResourceException("Collator UCA data unavailable", "", "");
2045 // package private constructors ------------------------------------------
2049 * Private contructor for use by subclasses. Public access to creating Collators is handled by the API
2050 * Collator.getInstance() or RuleBasedCollator(String rules).
2053 * This constructor constructs the UCA collator internally
// NOTE(review): only the signature is visible in this listing; the constructor body is absent.
// The static initialization code invokes this constructor before the class statics have been
// assigned (see the warning comment there), so the body must not depend on them.
2056 RuleBasedCollator() {
// Builds a collator for a locale from the ICU collation resource bundle. Steps visible here:
//   1. if the locale carries a "collation" keyword, look up "collations/<keyword>" with fallback;
//   2. if no elements were obtained that way, read the name stored under "collations/default"
//      and look up "collations/<name>";
//   3. use the bundle's ULocale for both arguments of setLocale;
//   4. read the "Sequence" string into m_rules_ and hand the "%%CollationBin" binary to
//      CollatorReader.initRBC;
//   5. compare m_UCA_version_ and m_UCD_version_ with those of UCA_;
//   6. if a "%%ReorderCodes" resource is returned, apply it with setReorderCodes and keep a
//      clone as m_defaultReorderCodes_.
// NOTE(review): the try lines, the body of the version-mismatch branch and the bodies of all
// catch clauses are not visible in this listing, so the failure behaviour (including the
// documented use of UCA when no bundle exists) cannot be confirmed from here.
2061 * Constructs a RuleBasedCollator from the argument locale.
2062 * If no resource bundle is associated with the locale, UCA is used instead.
2066 RuleBasedCollator(ULocale locale) {
2069 ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
2070 ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
2072 ICUResourceBundle elements = null;
2074 // Use keywords, if supplied for lookup
2075 String collkey = locale.getKeywordValue("collation");
2076 if (collkey != null) {
2078 elements = rb.getWithFallback("collations/" + collkey);
2079 } catch (MissingResourceException e) {
2083 if (elements == null) {
2084 // either collation keyword was not supplied or
2085 // the keyword was invalid - use default collation for the locale
2087 // collations/default should always give a string back
2088 // keyword for the real collation data
2089 collkey = rb.getStringWithFallback("collations/default");
2090 elements = rb.getWithFallback("collations/" + collkey);
2093 // TODO: Determine actual & valid locale correctly
2094 ULocale uloc = rb.getULocale();
2095 setLocale(uloc, uloc);
2097 m_rules_ = elements.getString("Sequence");
2098 ByteBuffer buf = elements.get("%%CollationBin").getBinary();
2101 // m_rules_ = (String)rules[1][1];
2102 CollatorReader.initRBC(this, buf);
2104 * BufferedInputStream input = new BufferedInputStream( new ByteArrayInputStream(map)); /*
2105 * CollatorReader reader = new CollatorReader(input, false); if (map.length >
2106 * MIN_BINARY_DATA_SIZE_) { reader.read(this, null); } else { reader.readHeader(this);
2107 * reader.readOptions(this); // duplicating UCA_'s data setWithUCATables(); }
2109 // at this point, we have read in the collator
2110 // now we need to check whether the binary image has
2111 // the right UCA and other versions
2112 if (!m_UCA_version_.equals(UCA_.m_UCA_version_) || !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
2118 UResourceBundle reorderRes = elements.get("%%ReorderCodes");
2119 if (reorderRes != null) {
2120 int[] reorderCodes = reorderRes.getIntVector();
2121 setReorderCodes(reorderCodes);
2122 m_defaultReorderCodes_ = reorderCodes.clone();
2124 } catch (MissingResourceException e) {
2133 } catch (Exception e) {
2139 // package private methods -----------------------------------------------
2142 * Sets this collator to use the tables in UCA. Note options not taken care of here.
2144 final void setWithUCATables() {
2145 m_contractionOffset_ = UCA_.m_contractionOffset_;
2146 m_expansionOffset_ = UCA_.m_expansionOffset_;
2147 m_expansion_ = UCA_.m_expansion_;
2148 m_contractionIndex_ = UCA_.m_contractionIndex_;
2149 m_contractionCE_ = UCA_.m_contractionCE_;
2150 m_trie_ = UCA_.m_trie_;
2151 m_expansionEndCE_ = UCA_.m_expansionEndCE_;
2152 m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
2153 m_unsafe_ = UCA_.m_unsafe_;
2154 m_contractionEnd_ = UCA_.m_contractionEnd_;
2155 m_minUnsafe_ = UCA_.m_minUnsafe_;
2156 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
2160 * Sets this collator to use the all options and tables in UCA.
2162 final void setWithUCAData() {
2163 latinOneFailed_ = true;
2165 m_addition3_ = UCA_.m_addition3_;
2166 m_bottom3_ = UCA_.m_bottom3_;
2167 m_bottomCount3_ = UCA_.m_bottomCount3_;
2168 m_caseFirst_ = UCA_.m_caseFirst_;
2169 m_caseSwitch_ = UCA_.m_caseSwitch_;
2170 m_common3_ = UCA_.m_common3_;
2171 m_contractionOffset_ = UCA_.m_contractionOffset_;
2172 setDecomposition(UCA_.getDecomposition());
2173 m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
2174 m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
2175 m_defaultIsAlternateHandlingShifted_ = UCA_.m_defaultIsAlternateHandlingShifted_;
2176 m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
2177 m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
2178 m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
2179 m_defaultStrength_ = UCA_.m_defaultStrength_;
2180 m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
2181 m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
2182 m_expansionOffset_ = UCA_.m_expansionOffset_;
2183 m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
2184 m_isCaseLevel_ = UCA_.m_isCaseLevel_;
2185 m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
2186 m_isHiragana4_ = UCA_.m_isHiragana4_;
2187 m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
2188 m_isSimple3_ = UCA_.m_isSimple3_;
2189 m_mask3_ = UCA_.m_mask3_;
2190 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
2191 m_minUnsafe_ = UCA_.m_minUnsafe_;
2192 m_rules_ = UCA_.m_rules_;
2193 setStrength(UCA_.getStrength());
2194 m_top3_ = UCA_.m_top3_;
2195 m_topCount3_ = UCA_.m_topCount3_;
2196 m_variableTopValue_ = UCA_.m_variableTopValue_;
2197 m_isNumericCollation_ = UCA_.m_isNumericCollation_;
2199 latinOneFailed_ = false;
2203 * Test whether a char character is potentially "unsafe" for use as a collation starting point. "Unsafe" characters
2204 * are combining marks or those belonging to some contraction sequence from the offset 1 onwards. E.g. if "ABC" is
2205 * the only contraction, then 'B' and 'C' are considered unsafe. If we have another contraction "ZA" with the one
2206 * above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
2209 * character to determin
2210 * @return true if ch is unsafe, false otherwise
2212 final boolean isUnsafe(char ch) {
2213 if (ch < m_minUnsafe_) {
2217 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
2218 if (UTF16.isLeadSurrogate(ch) || UTF16.isTrailSurrogate(ch)) {
2219 // Trail surrogate are always considered unsafe.
2222 ch &= HEURISTIC_OVERFLOW_MASK_;
2223 ch += HEURISTIC_OVERFLOW_OFFSET_;
2225 int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
2226 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
2230 * Approximate determination if a char character is at a contraction end. Guaranteed to be true if a character is at
2231 * the end of a contraction, otherwise it is not deterministic.
2234 * character to be determined
2236 final boolean isContractionEnd(char ch) {
2237 if (UTF16.isTrailSurrogate(ch)) {
2241 if (ch < m_minContractionEnd_) {
2245 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
2246 ch &= HEURISTIC_OVERFLOW_MASK_;
2247 ch += HEURISTIC_OVERFLOW_OFFSET_;
2249 int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
2250 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
2254 * Retrieve the tag of a special ce
2260 static int getTag(int ce) {
2261 return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
2265 * Checking if ce is special
2269 * @return true if ce is special
2271 static boolean isSpecial(int ce) {
2272 return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
2276 * Checks if the argument ce is a continuation
2279 * collation element to test
2280 * @return true if ce is a continuation
2282 static final boolean isContinuation(int ce) {
2283 return ce != CollationElementIterator.NULLORDER && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
2286 // private inner classes ------------------------------------------------
2288 // private variables -----------------------------------------------------
// Constants describing the layout of the bit tables read by isUnsafe() and isContractionEnd().
// With HEURISTIC_SHIFT_ = 3 (eight bits per table byte) the direct-lookup limit used by both
// methods is HEURISTIC_SIZE_ << HEURISTIC_SHIFT_ = 1056 * 8 = 8448. Characters at or above that
// limit are folded: (ch & 0x1fff) lies in 0..8191, and adding HEURISTIC_OVERFLOW_OFFSET_ (256)
// gives 256..8447, which is still inside a table of HEURISTIC_SIZE_ bytes.
2291 * The smallest natural unsafe or contraction end char character before tailoring. This is a combining mark.
2293 private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
2295 * Heuristic table table size. Size is 32 bytes, 1 bit for each latin 1 char, and some power of two for hashing the
2296 * rest of the chars. Size in bytes.
// 1056 = 32 + 1024.
2298 private static final char HEURISTIC_SIZE_ = 1056;
2300 * Mask value down to "some power of two" - 1, number of bits, not num of bytes.
2302 private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
2304 * Unsafe character shift
2306 private static final int HEURISTIC_SHIFT_ = 3;
2308 * Unsafe character addition for character too large, it has to be folded then incremented.
2310 private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
2312 * Mask value to get offset in heuristic table.
// Selects the bit inside the table byte (ch & 7).
2314 private static final char HEURISTIC_MASK_ = 7;
// Tertiary-compression state of this collator plus case-related constants.
// The *3_ fields are read by doTertiaryBytes() and copied from UCA_ by setWithUCAData().
2316 private int m_caseSwitch_;
2317 private int m_common3_;
2318 private int m_mask3_;
2320 * When switching case, we need to add or subtract different values.
2322 private int m_addition3_;
2324 * Upper range when compressing
2326 private int m_top3_;
2328 * Upper range when compressing
// NOTE(review): the description above repeats the one of m_top3_. In doTertiaryBytes() this
// field is the base that run counts are added to (m_bottom3_ + count), while m_top3_ is the
// base they are subtracted from, so "lower range" looks like the intended wording - confirm.
2330 private int m_bottom3_;
2331 private int m_topCount3_;
2332 private int m_bottomCount3_;
2334 * Script reordering table
// When non-null, doPrimaryBytes() maps the lead primary byte of non-continuation CEs through it.
2336 private byte[] m_leadBytePermutationTable_;
2338 * Case first constants
2340 private static final int CASE_SWITCH_ = 0xC0;
2341 private static final int NO_CASE_SWITCH_ = 0;
2343 * Case level constants
2345 private static final int CE_REMOVE_CASE_ = 0x3F;
2346 private static final int CE_KEEP_CASE_ = 0xFF;
2348 * Case strength mask
2350 private static final int CE_CASE_MASK_3_ = 0xFF;
2352 * Sortkey size factor. Values can be changed.
// PROPORTION_2_ determines TOP_COUNT_2_ as (int) (PROPORTION_2_ * TOTAL_2_).
2354 private static final double PROPORTION_2_ = 0.5;
2355 private static final double PROPORTION_3_ = 0.667;
// Byte-level constants used while writing sort keys.
// NOTE(review): COMMON_TOP_2_ and COMMON_BOTTOM_2_ are referenced here but declared outside
// this listing.
2357 // These values come from the UCA ----------------------------------------
2360 * This is an enum that lists magic special byte values from the fractional UCA
2362 // private static final byte BYTE_ZERO_ = 0x0;
2363 // private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
2364 // private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
2365 private static final byte BYTE_SHIFT_PREFIX_ = (byte) 0x03;
// BYTE_UNSHIFTED_MIN_ and BYTE_UNSHIFTED_MAX_ are the two bytes doPrimaryBytes() writes when
// the lead primary byte changes, chosen by comparing the new lead byte with the previous one.
2366 /* private */static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
2367 // private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
2368 // TODO: Make the following values dynamic since they change with almost every UCA version.
2369 static final byte CODAN_PLACEHOLDER = 0x12;
// Lower bound of the range accepted by isCompressible().
2370 private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte) 0x5B;
2372 private static final byte BYTE_UNSHIFTED_MAX_ = (byte) 0xFF;
// Number of byte values strictly between COMMON_BOTTOM_2_ and COMMON_TOP_2_.
2373 private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
2374 private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
2375 private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
2376 private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
2377 private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
2378 private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
2379 private static final int COMMON_BOTTOM_3_ = 0x05;
2380 private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
2381 private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = COMMON_BOTTOM_3_;
// TOTAL_2_ is split into a top part and a bottom part; doSecondaryBytes() uses the two counts
// as the longest run of common secondaries that one byte can represent in each direction.
2382 private static final int TOP_COUNT_2_ = (int) (PROPORTION_2_ * TOTAL_2_);
2383 private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
2384 private static final int COMMON_2_ = COMMON_BOTTOM_2_;
// doTertiaryBytes() compares m_common3_ against the next two constants to decide whether
// m_addition3_ is added to or subtracted from a tertiary weight.
2385 private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
2386 private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
2387 // private static final int COMMON_4_ = (byte)0xFF;
2390 * Minimum size required for the binary collation data in bytes. Size of UCA header + size of options to 4 bytes
2392 // private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
// Remaining per-collator option flags, initial buffer sizes, CE masks and Latin-1 state.
2395 * If this collator is to generate only simple tertiaries for fast path
2397 private boolean m_isSimple3_;
2400 * French collation sorting flag
2402 private boolean m_isFrenchCollation_;
2404 * Flag indicating if shifted is requested for Quaternary alternate handling. If this is not true, the default for
2405 * alternate handling will be non-ignorable.
2407 private boolean m_isAlternateHandlingShifted_;
2409 * Extra case level for sorting
2411 private boolean m_isCaseLevel_;
2413 * Frozen state of the collator.
2415 private Lock frozenLock;
// Initial sizes of the per-level byte arrays allocated in CollationBuffer.initBuffers():
// primary 128 << 3 = 1024, case 128 >> 2 = 32, all other levels 128.
2418 private static final int SORT_BUFFER_INIT_SIZE_ = 128;
2419 private static final int SORT_BUFFER_INIT_SIZE_1_ = SORT_BUFFER_INIT_SIZE_ << 3;
2420 private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
2421 private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
2422 private static final int SORT_BUFFER_INIT_SIZE_CASE_ = SORT_BUFFER_INIT_SIZE_ >> 2;
2423 private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
// isContinuation() reports a CE as a continuation when both bits of 0xC0 are set.
2425 private static final int CE_CONTINUATION_TAG_ = 0xC0;
// All bits except the two continuation-tag bits.
2426 private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
2428 private static final int LAST_BYTE_MASK_ = 0xFF;
2430 // private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
2431 // private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
// Values doCaseShift() uses when it starts a new case byte: the byte begins as 0x80 and the
// shift counter restarts at 7.
2433 private static final byte SORT_CASE_BYTE_START_ = (byte) 0x80;
2434 private static final byte SORT_CASE_SHIFT_START_ = (byte) 7;
// Length of the two int arrays that CollationBuffer allocates for collation elements.
2439 private static final int CE_BUFFER_SIZE_ = 512;
2441 // variables for Latin-1 processing
2442 boolean latinOneUse_ = false;
2443 boolean latinOneRegenTable_ = false;
// Set to true at the start of setWithUCAData() and back to false at its end.
2444 boolean latinOneFailed_ = false;
2446 int latinOneTableLen_ = 0;
2447 int latinOneCEs_[] = null;
// Scratch state handed to the compare and sort-key routines of the enclosing collator:
// two string iterators with their collation element iterators, one growable byte array per
// sort-key level with its fill count, run counters used for compression, and two int arrays
// that hold the collation elements of source and target.
// This is an inner (non-static) class: initBuffers() passes RuleBasedCollator.this to the
// CollationElementIterator constructors.
// NOTE(review): the constructor body and some statements of initBuffers()/resetBuffers() are
// not visible in this listing (numbering gaps at 2512-2514, 2519, 2547-2551).
2449 private final class CollationBuffer {
2451 * Bunch of utility iterators
2453 protected StringUCharacterIterator m_srcUtilIter_;
2454 protected CollationElementIterator m_srcUtilColEIter_;
2455 protected StringUCharacterIterator m_tgtUtilIter_;
2456 protected CollationElementIterator m_tgtUtilColEIter_;
2459 * Utility comparison flags
// Set by compareRegular(): 0 = case level, 2..4 = strength at least secondary/tertiary/
// quaternary, 5 = strength is identical.
2461 protected boolean m_utilCompare0_;
2462 // private boolean m_utilCompare1_;
2463 protected boolean m_utilCompare2_;
2464 protected boolean m_utilCompare3_;
2465 protected boolean m_utilCompare4_;
2466 protected boolean m_utilCompare5_;
2469 * Utility byte buffer
// Per-level byte arrays, see the trailing comments in initBuffers() for the level of each index.
2471 protected byte m_utilBytes0_[];
2472 protected byte m_utilBytes1_[];
2473 protected byte m_utilBytes2_[];
2474 protected byte m_utilBytes3_[];
2475 protected byte m_utilBytes4_[];
2476 // private byte m_utilBytes5_[];
2478 protected RawCollationKey m_utilRawCollationKey_;
// Number of bytes written so far to the byte array with the same index.
2480 protected int m_utilBytesCount0_;
2481 protected int m_utilBytesCount1_;
2482 protected int m_utilBytesCount2_;
2483 protected int m_utilBytesCount3_;
2484 protected int m_utilBytesCount4_;
2485 // private int m_utilBytesCount5_;
2487 // private int m_utilCount0_;
2488 // private int m_utilCount1_;
// Pending run lengths of common weights on the secondary, tertiary and quaternary level; the
// do*Bytes() methods increment them and write them out as compressed bytes.
2489 protected int m_utilCount2_;
2490 protected int m_utilCount3_;
2491 protected int m_utilCount4_;
2492 // private int m_utilCount5_;
// Range inside m_utilBytes2_ that doSecondaryBytes() reverses for French secondaries.
2494 protected int m_utilFrenchStart_;
2495 protected int m_utilFrenchEnd_;
2498 * Preparing the CE buffers. will be filled during the primary phase
2500 protected int m_srcUtilCEBuffer_[];
2501 protected int m_tgtUtilCEBuffer_[];
2502 protected int m_srcUtilCEBufferSize_;
2503 protected int m_tgtUtilCEBufferSize_;
2505 protected int m_srcUtilContOffset_;
2506 protected int m_tgtUtilContOffset_;
2508 protected int m_srcUtilOffset_;
2509 protected int m_tgtUtilOffset_;
// NOTE(review): constructor body not visible; presumably it calls initBuffers() - confirm.
2511 private CollationBuffer() {
2516 * Initializes utility iterators and byte buffer used by compare
2518 protected final void initBuffers() {
2520 m_srcUtilIter_ = new StringUCharacterIterator();
2521 m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, RuleBasedCollator.this);
2522 m_tgtUtilIter_ = new StringUCharacterIterator();
2523 m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, RuleBasedCollator.this);
2524 m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
2525 m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
2526 m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
2527 m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
2528 m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
2529 m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
2530 m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
// Clears flags, byte counts, French range and offsets; the arrays themselves are kept.
// Called at the start of compareRegular().
2533 protected final void resetBuffers() {
2534 m_utilCompare0_ = false;
2535 // private boolean m_utilCompare1_;
2536 m_utilCompare2_ = false;
2537 m_utilCompare3_ = false;
2538 m_utilCompare4_ = false;
2539 m_utilCompare5_ = false;
2541 m_utilBytesCount0_ = 0;
2542 m_utilBytesCount1_ = 0;
2543 m_utilBytesCount2_ = 0;
2544 m_utilBytesCount3_ = 0;
2545 m_utilBytesCount4_ = 0;
2546 // private int m_utilBytesCount5_;
// NOTE(review): no reset of m_utilCount2_/3_/4_ is visible here; the numbering gap 2547-2551
// suggests those statements exist in the complete file - confirm.
2552 m_utilFrenchStart_ = 0;
2553 m_utilFrenchEnd_ = 0;
2555 m_srcUtilContOffset_ = 0;
2556 m_tgtUtilContOffset_ = 0;
2558 m_srcUtilOffset_ = 0;
2559 m_tgtUtilOffset_ = 0;
2563 // private methods -------------------------------------------------------
// Initializes this collator from a rule string: a CollationParsedRuleBuilder is created for the
// rules and applied to this collator with setRules(this), then buildPermutationTable() is called.
// NOTE(review): statements at 2566 and 2569-2570 are not visible in this listing, so further
// initialization steps may exist - confirm against the complete file.
2565 private void init(String rules) throws Exception {
2567 CollationParsedRuleBuilder builder = new CollationParsedRuleBuilder(rules);
2568 builder.setRules(this);
2571 buildPermutationTable();
// Compares source and target from the given offset, level by level, using buffer as scratch
// state. Flow visible here:
//   - the buffer is reset and its m_utilCompareN_ flags are derived from the collator strength
//     and the case-level option;
//   - if both Hiragana-quaternary and shifted-quaternary handling apply, the substrings from
//     offset are compared through compareBySortKeys and that result is returned;
//   - otherwise doPrimaryCompare runs first, then (as enabled by the flags) the secondary,
//     case, tertiary and quaternary comparisons, and for IDENTICAL strength doIdenticalCompare.
// NOTE(review): the statements that return after each level (numbering gaps such as 2604-2606,
// 2611-2614, 2618-2622) and the final return are not visible in this listing; the exact
// early-exit conditions must be confirmed against the complete file.
2574 private final int compareRegular(String source, String target, int offset, CollationBuffer buffer) {
2575 buffer.resetBuffers();
2577 int strength = getStrength();
2578 // setting up the collator parameters
2579 buffer.m_utilCompare0_ = m_isCaseLevel_;
2580 // m_utilCompare1_ = true;
2581 buffer.m_utilCompare2_ = strength >= SECONDARY;
2582 buffer.m_utilCompare3_ = strength >= TERTIARY;
2583 buffer.m_utilCompare4_ = strength >= QUATERNARY;
2584 buffer.m_utilCompare5_ = strength == IDENTICAL;
// Each option only takes effect when the level it belongs to is actually compared.
2585 boolean doFrench = m_isFrenchCollation_ && buffer.m_utilCompare2_;
2586 boolean doShift4 = m_isAlternateHandlingShifted_ && buffer.m_utilCompare4_;
2587 boolean doHiragana4 = m_isHiragana4_ && buffer.m_utilCompare4_;
2589 if (doHiragana4 && doShift4) {
2590 String sourcesub = source.substring(offset);
2591 String targetsub = target.substring(offset);
2592 return compareBySortKeys(sourcesub, targetsub, buffer);
2595 // This is the lowest primary value that will not be ignored if shifted
2596 int lowestpvalue = m_isAlternateHandlingShifted_ ? m_variableTopValue_ << 16 : 0;
2597 buffer.m_srcUtilCEBufferSize_ = 0;
2598 buffer.m_tgtUtilCEBufferSize_ = 0;
2599 int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, target, offset, buffer);
// Both sizes being -1 is the signal that the primary comparison already decided the outcome.
2600 if (buffer.m_srcUtilCEBufferSize_ == -1 && buffer.m_tgtUtilCEBufferSize_ == -1) {
2601 // since the cebuffer is cleared when we have determined that
2602 // either source is greater than target or vice versa, the return
2603 // result is the comparison result and not the hiragana result
// Otherwise the primary result is kept as the Hiragana result for the quaternary step.
2607 int hiraganaresult = result;
2609 if (buffer.m_utilCompare2_) {
2610 result = doSecondaryCompare(doFrench, buffer);
2615 // doing the case bit
2616 if (buffer.m_utilCompare0_) {
2617 result = doCaseCompare(buffer);
2623 if (buffer.m_utilCompare3_) {
2624 result = doTertiaryCompare(buffer);
2630 if (doShift4) { // checkQuad
2631 result = doQuaternaryCompare(lowestpvalue, buffer);
2635 } else if (doHiragana4 && hiraganaresult != 0) {
2636 // If we're fine on quaternaries, we might be different
2637 // on Hiragana. This, however, might fail us in shifted.
2638 return hiraganaresult;
2641 // For IDENTICAL comparisons, we use a bitwise character comparison
2642 // as a tiebreaker if all else is equal.
2643 // Getting here should be quite rare - strings are not identical -
2644 // that is checked first, but compared == through all other checks.
2645 if (buffer.m_utilCompare5_) {
2646 return doIdenticalCompare(source, target, offset, true);
2651 // Is this primary weight compressible?
2652 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
2653 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
2654 static boolean isCompressible(int primary1) {
2655 return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary;
// Splits the primary weight of ce into its two bytes and appends them either to the
// quaternary buffer (m_utilBytes4_) or to the primary buffer (m_utilBytes1_).
// Visible behaviour:
//   - for non-continuation CEs the lead byte is mapped through m_leadBytePermutationTable_
//     when that table is present; isCompressible() is still asked about the unmapped byte;
//   - before bytes are written to the quaternary buffer, a pending run in m_utilCount4_ is
//     written out in chunks of at most bottomCount4;
//   - in the primary buffer a lead byte equal to leadPrimary is not repeated (only p2 is
//     written); when the lead byte changes and leadPrimary is non-zero, BYTE_UNSHIFTED_MAX_ or
//     BYTE_UNSHIFTED_MIN_ is written first, depending on whether the new lead byte is larger.
// NOTE(review): the else lines, the closing braces, the updates of leadPrimary and the return
// statement are not visible in this listing (numbering gaps such as 2684-2687, 2732-2733,
// 2754-2759), so which statements belong to which branch must be confirmed in the complete file.
2659 * Gets the 2 bytes of primary order and adds it to the primary byte array
2663 * @param notIsContinuation
2664 * flag indicating if the current bytes belong to a continuation ce
2666 * flag indicating if ce is to be shifted
2667 * @param leadPrimary
2668 * lead primary used for compression
2669 * @param commonBottom4
2670 * common byte value for Quaternary
2671 * @param bottomCount4
2672 * smallest byte value for Quaternary
2673 * @return the new lead primary for compression
2675 private final int doPrimaryBytes(int ce, boolean notIsContinuation, boolean doShift, int leadPrimary,
2676 int commonBottom4, int bottomCount4, CollationBuffer buffer) {
// ce is shifted right by 16 in place: p2 is the low byte of the former upper half, p1 the
// byte above it.
2678 int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
2679 int p1 = ce >>> 8; // comparison
2680 int originalP1 = p1;
2681 if (notIsContinuation) {
2682 if (m_leadBytePermutationTable_ != null) {
2683 p1 = 0xff & m_leadBytePermutationTable_[p1];
// Write out the pending quaternary run; each byte encodes at most bottomCount4 repetitions.
2688 if (buffer.m_utilCount4_ > 0) {
2689 while (buffer.m_utilCount4_ > bottomCount4) {
2690 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
2691 buffer.m_utilBytesCount4_++;
2692 buffer.m_utilCount4_ -= bottomCount4;
2694 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
2695 buffer.m_utilBytesCount4_++;
2696 buffer.m_utilCount4_ = 0;
2698 // dealing with a variable and we're treating them as shifted
2699 // This is a shifted ignorable
2701 // we need to check this since we could be in continuation
2702 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p1);
2703 buffer.m_utilBytesCount4_++;
2706 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) p2);
2707 buffer.m_utilBytesCount4_++;
2710 // Note: This code assumes that the table is well built
2711 // i.e. not having 0 bytes where they are not supposed to be.
2712 // Usually, we'll have non-zero primary1 & primary2, except
2713 // in cases of LatinOne and friends, when primary2 will be
2714 // regular and simple sortkey calc
2715 if (p1 != CollationElementIterator.IGNORABLE) {
2716 if (notIsContinuation) {
// Same lead byte as the previous CE: only the second byte is written.
2717 if (leadPrimary == p1) {
2718 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2719 buffer.m_utilBytesCount1_++;
// Lead byte changed: write a marker byte first if a lead byte was pending.
2721 if (leadPrimary != 0) {
2722 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
2723 ((p1 > leadPrimary) ? BYTE_UNSHIFTED_MAX_ : BYTE_UNSHIFTED_MIN_));
2724 buffer.m_utilBytesCount1_++;
2726 if (p2 == CollationElementIterator.IGNORABLE) {
2727 // one byter, not compressed
2728 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2729 buffer.m_utilBytesCount1_++;
2731 } else if (isCompressible(originalP1)) {
2734 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2735 buffer.m_utilBytesCount1_++;
2736 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2737 buffer.m_utilBytesCount1_++;
2740 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2741 buffer.m_utilBytesCount1_++;
2742 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2743 buffer.m_utilBytesCount1_++;
2747 // continuation, add primary to the key, no compression
2748 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p1);
2749 buffer.m_utilBytesCount1_++;
2750 if (p2 != CollationElementIterator.IGNORABLE) {
2751 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) p2);
2753 buffer.m_utilBytesCount1_++;
// Appends the secondary weight of ce (bits 8-15) to buffer.m_utilBytes2_.
// Visible behaviour:
//   - a non-continuation CE whose secondary equals COMMON_2_ is only counted in m_utilCount2_;
//   - a pending count is written out before another secondary is stored: one loop counts down
//     from COMMON_TOP_2_ in chunks of TOP_COUNT_2_, the other counts up from COMMON_BOTTOM_2_
//     in chunks of BOTTOM_COUNT_2_; the first is guarded by s > COMMON_2_;
//   - the French part records a start/end range in m_utilFrenchStart_/m_utilFrenchEnd_ and
//     reverses that range of m_utilBytes2_ with reverseBuffer.
// NOTE(review): the else lines and closing braces are not visible in this listing (numbering
// gaps such as 2771-2772, 2776, 2788, 2803-2804, 2818-2819), so the branch each statement
// belongs to - in particular which part runs only when doFrench is set - must be confirmed.
2762 * Gets the secondary byte and adds it to the secondary byte array
2764 * @param ce current ce
2765 * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
2766 * @param doFrench flag indicator if french sort is to be performed
2767 * @param buffer collation buffer temporary state
2769 private final void doSecondaryBytes(int ce, boolean notIsContinuation, boolean doFrench, CollationBuffer buffer) {
2770 int s = (ce >> 8) & LAST_BYTE_MASK_; // int for comparison
2773 // This is compression code.
2774 if (s == COMMON_2_ && notIsContinuation) {
2775 buffer.m_utilCount2_++;
2777 if (buffer.m_utilCount2_ > 0) {
2778 if (s > COMMON_2_) { // not necessary for 4th level.
2779 while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
2780 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2781 (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
2782 buffer.m_utilBytesCount2_++;
2783 buffer.m_utilCount2_ -= TOP_COUNT_2_;
2785 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2786 (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
2787 buffer.m_utilBytesCount2_++;
2789 while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
2790 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2791 (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2792 buffer.m_utilBytesCount2_++;
2793 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
2795 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_,
2796 (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
2797 buffer.m_utilBytesCount2_++;
2799 buffer.m_utilCount2_ = 0;
2801 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
2802 buffer.m_utilBytesCount2_++;
2805 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) s);
2806 buffer.m_utilBytesCount2_++;
2807 // Do the special handling for French secondaries
2808 // We need to get continuation elements and do intermediate
2810 // abc1c2c3de with french secondaries need to be edc1c2c3ba
2812 if (notIsContinuation) {
// -1 means that no range is currently being collected.
2813 if (buffer.m_utilFrenchStart_ != -1) {
2814 // reverse secondaries from frenchStartPtr up to
2816 reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
2817 buffer.m_utilFrenchStart_ = -1;
// A new range starts at the byte written just before the current one.
2820 if (buffer.m_utilFrenchStart_ == -1) {
2821 buffer.m_utilFrenchStart_ = buffer.m_utilBytesCount2_ - 2;
2823 buffer.m_utilFrenchEnd_ = buffer.m_utilBytesCount2_ - 1;
2830 * Reverse the argument buffer
2832 * @param buffer to reverse
2833 * @param start index in buffer to start from
2834 * @param end index in buffer to end at
2836 private static void reverseBuffer(byte buffer[], int start, int end) {
2837 while (start < end) {
2838 byte b = buffer[start];
2839 buffer[start++] = buffer[end];
2845 * Insert the case shifting byte if required
2847 * @param caseshift value
2848 * @return new caseshift value
2850 private final int doCaseShift(int caseshift, CollationBuffer buffer) {
2851 if (caseshift == 0) {
2852 buffer.m_utilBytes0_ = append(buffer.m_utilBytes0_, buffer.m_utilBytesCount0_, SORT_CASE_BYTE_START_);
2853 buffer.m_utilBytesCount0_++;
2854 caseshift = SORT_CASE_SHIFT_START_;
// Packs the case bits of a tertiary weight into the case buffer (buffer.m_utilBytes0_).
// Visible behaviour:
//   - doCaseShift() first starts a new case byte if the current one is full;
//   - only non-continuation CEs with a non-zero tertiary contribute;
//   - the case bits are the top two bits of the tertiary (tertiary & 0xC0);
//   - bits are OR-ed into the last byte of the case buffer at position --caseshift; the
//     handling differs depending on whether m_caseFirst_ is AttributeValue.UPPER_FIRST_.
// NOTE(review): the else lines, closing braces and the return statement are not visible in
// this listing (numbering gaps at 2876-2877, 2880-2881, 2884, 2887-2894), so the branch each
// statement belongs to must be confirmed against the complete file.
2860 * Performs the casing sort
2862 * @param tertiary byte in ints for easy comparison
2863 * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
2865 * @param buffer collation buffer temporary state
2866 * @return the new value of case shift
2868 private final int doCaseBytes(int tertiary, boolean notIsContinuation, int caseshift, CollationBuffer buffer) {
2869 caseshift = doCaseShift(caseshift, buffer);
2871 if (notIsContinuation && tertiary != 0) {
2872 byte casebits = (byte) (tertiary & 0xC0);
2873 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
2874 if (casebits == 0) {
2875 buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= (1 << (--caseshift));
2878 caseshift = doCaseShift(caseshift - 1, buffer);
2879 buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 6) & 1) << (--caseshift);
2882 if (casebits != 0) {
2883 buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= 1 << (--caseshift);
2885 caseshift = doCaseShift(caseshift, buffer);
2886 buffer.m_utilBytes0_[buffer.m_utilBytesCount0_ - 1] |= ((casebits >> 7) & 1) << (--caseshift);
// Appends a tertiary weight to buffer.m_utilBytes3_, compressing runs of the common weight.
// Visible behaviour:
//   - a zero tertiary is skipped entirely;
//   - a non-continuation tertiary equal to m_common3_ is only counted in m_utilCount3_;
//   - other tertiaries are adjusted by m_addition3_ (added above the common value when
//     m_common3_ is COMMON_NORMAL_3_, subtracted at or below it when m_common3_ is
//     COMMON_UPPER_FIRST_3_);
//   - a pending count is written out before the weight itself: one loop counts down from
//     m_top3_ in chunks of m_topCount3_, the other counts up from m_bottom3_ in chunks of
//     m_bottomCount3_; the first is guarded by tertiary > common3.
// NOTE(review): the else lines and closing braces are not visible in this listing (numbering
// gaps at 2909, 2915, 2922, 2926, 2932, 2936, 2938, 2941-2943), so the branch each statement
// belongs to must be confirmed against the complete file.
2897 * Gets the tertiary byte and adds it to the tertiary byte array
2899 * @param tertiary byte in int for easy comparison
2900 * @param notIsContinuation flag indicating if the current bytes belong to a continuation ce
2901 * @param buffer collation buffer temporary state
2903 private final void doTertiaryBytes(int tertiary, boolean notIsContinuation, CollationBuffer buffer) {
2904 if (tertiary != 0) {
2905 // This is compression code.
2906 // sequence size check is included in the if clause
2907 if (tertiary == m_common3_ && notIsContinuation) {
2908 buffer.m_utilCount3_++;
2910 int common3 = m_common3_ & LAST_BYTE_MASK_;
2911 if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
2912 tertiary += m_addition3_;
2913 } else if (tertiary <= common3 && m_common3_ == COMMON_UPPER_FIRST_3_) {
2914 tertiary -= m_addition3_;
2916 if (buffer.m_utilCount3_ > 0) {
2917 if (tertiary > common3) {
2918 while (buffer.m_utilCount3_ > m_topCount3_) {
2919 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
2920 buffer.m_utilBytesCount3_++;
2921 buffer.m_utilCount3_ -= m_topCount3_;
2923 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
2924 (byte) (m_top3_ - (buffer.m_utilCount3_ - 1)));
2925 buffer.m_utilBytesCount3_++;
2927 while (buffer.m_utilCount3_ > m_bottomCount3_) {
2928 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
2929 (byte) (m_bottom3_ + m_bottomCount3_));
2930 buffer.m_utilBytesCount3_++;
2931 buffer.m_utilCount3_ -= m_bottomCount3_;
2933 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_,
2934 (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
2935 buffer.m_utilBytesCount3_++;
2937 buffer.m_utilCount3_ = 0;
2939 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) tertiary);
2940 buffer.m_utilBytesCount3_++;
2946 * Gets the Quaternary byte and adds it to the Quaternary byte array
2948 * @param isCodePointHiragana flag indicator if the previous codepoint we dealt with was Hiragana
2949 * @param commonBottom4 smallest common Quaternary byte
2950 * @param bottomCount4 smallest Quaternary byte
2951 * @param hiragana4 hiragana Quaternary byte
2952 * @param buffer collation buffer temporary state
2954 private final void doQuaternaryBytes(boolean isCodePointHiragana, int commonBottom4, int bottomCount4,
2955 byte hiragana4, CollationBuffer buffer) {
2956 if (isCodePointHiragana) { // This was Hiragana, need to note it
2957 if (buffer.m_utilCount4_ > 0) { // Close this part
2958 while (buffer.m_utilCount4_ > bottomCount4) {
2959 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + bottomCount4));
2960 buffer.m_utilBytesCount4_++;
2961 buffer.m_utilCount4_ -= bottomCount4;
2963 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonBottom4 + (buffer.m_utilCount4_ - 1)));
2964 buffer.m_utilBytesCount4_++;
2965 buffer.m_utilCount4_ = 0;
2967 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, hiragana4); // Add the Hiragana
2968 buffer.m_utilBytesCount4_++;
2969 } else { // This wasn't Hiragana, so we can continue adding stuff
2970 buffer.m_utilCount4_++;
2975 * Iterates through the argument string for all ces. Split the ces into their relevant primaries, secondaries etc.
2977 * @param source normalized string
2978 * @param doFrench flag indicator if special handling of French has to be done
2979 * @param hiragana4 offset for Hiragana quaternary
2980 * @param commonBottom4 smallest common quaternary byte
2981 * @param bottomCount4 smallest quaternary byte
2982 * @param buffer collation buffer temporary state
2984 private final void getSortKeyBytes(String source, boolean doFrench, byte hiragana4, int commonBottom4,
2985 int bottomCount4, CollationBuffer buffer)
2988 int backupDecomposition = getDecomposition();
2989 // TODO- hack fix around frozen state - stop self-modification
2990 internalSetDecomposition(NO_DECOMPOSITION); // have to revert to backup later
2991 buffer.m_srcUtilIter_.setText(source);
2992 buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_);
2993 buffer.m_utilFrenchStart_ = -1;
2994 buffer.m_utilFrenchEnd_ = -1;
2996 boolean doShift = false;
2997 boolean notIsContinuation = false;
2999 int leadPrimary = 0; // int for easier comparison
3003 int ce = buffer.m_srcUtilColEIter_.next();
3004 if (ce == CollationElementIterator.NULLORDER) {
3008 if (ce == CollationElementIterator.IGNORABLE) {
3012 notIsContinuation = !isContinuation(ce);
3014 boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
3015 // actually we can just check that the first byte is 0
3016 // generation stuffs the order left first
3017 boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_) <= m_variableTopValue_;
3018 doShift = (m_isAlternateHandlingShifted_
3019 && ((notIsContinuation && isSmallerThanVariableTop && !isPrimaryByteIgnorable) // primary byte not 0
3020 || (!notIsContinuation && doShift)) || (doShift && isPrimaryByteIgnorable));
3021 if (doShift && isPrimaryByteIgnorable) {
3022 // amendment to the UCA says that primary ignorables and other
3023 // ignorables should be removed if following a shifted code
3025 // if we were shifted and we got an ignorable code point
3026 // we should just completely ignore it
3029 leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift, leadPrimary, commonBottom4, bottomCount4, buffer);
3034 if (buffer.m_utilCompare2_) {
3035 doSecondaryBytes(ce, notIsContinuation, doFrench, buffer);
3038 int t = ce & LAST_BYTE_MASK_;
3039 if (!notIsContinuation) {
3040 t = ce & CE_REMOVE_CONTINUATION_MASK_;
3043 if (buffer.m_utilCompare0_ && (!isPrimaryByteIgnorable || buffer.m_utilCompare2_)) {
3044 // do the case level if we need to do it. We don't want to calculate
3045 // case level for primary ignorables if we have only primary strength and case level
3046 // otherwise we would break well formedness of CEs
3047 caseShift = doCaseBytes(t, notIsContinuation, caseShift, buffer);
3048 } else if (notIsContinuation) {
3054 if (buffer.m_utilCompare3_) {
3055 doTertiaryBytes(t, notIsContinuation, buffer);
3058 if (buffer.m_utilCompare4_ && notIsContinuation) { // compare quad
3059 doQuaternaryBytes(buffer.m_srcUtilColEIter_.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4, buffer);
3062 // TODO - hack fix around frozen state - stop self-modification
3063 internalSetDecomposition(backupDecomposition); // reverts to original
3064 if (buffer.m_utilFrenchStart_ != -1) {
3065 // one last round of checks
3066 reverseBuffer(buffer.m_utilBytes2_, buffer.m_utilFrenchStart_, buffer.m_utilFrenchEnd_);
3071 * From the individual strength byte results the final compact sortkey will be calculated.
3073 * @param source text string
3074 * @param doFrench flag indicating that special handling of French has to be done
3075 * @param commonBottom4 smallest common quaternary byte
3076 * @param bottomCount4 smallest quaternary byte
3077 * @param key output RawCollationKey to store results, key cannot be null
3078 * @param buffer collation buffer temporary state
3080 private final void getSortKey(String source, boolean doFrench, int commonBottom4, int bottomCount4,
3081 RawCollationKey key, CollationBuffer buffer) {
3082 // we have done all the CE's, now let's put them together to form
3084 if (buffer.m_utilCompare2_) {
3085 doSecondary(doFrench, buffer);
3087 // adding case level should be independent of secondary level
3088 if (buffer.m_utilCompare0_) {
3091 if (buffer.m_utilCompare3_) {
3093 if (buffer.m_utilCompare4_) {
3094 doQuaternary(commonBottom4, bottomCount4, buffer);
3095 if (buffer.m_utilCompare5_) {
3096 doIdentical(source, buffer);
3101 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) 0);
3102 buffer.m_utilBytesCount1_++;
3104 key.set(buffer.m_utilBytes1_, 0, buffer.m_utilBytesCount1_);
3108 * Packs the French bytes
3109 * @param buffer collation buffer temporary state
3111 private static final void doFrench(CollationBuffer buffer) {
3112 for (int i = 0; i < buffer.m_utilBytesCount2_; i++) {
3113 byte s = buffer.m_utilBytes2_[buffer.m_utilBytesCount2_ - i - 1];
3114 // This is compression code.
3115 if (s == COMMON_2_) {
3116 ++buffer.m_utilCount2_;
3118 if (buffer.m_utilCount2_ > 0) {
3119 // getting the unsigned value
3120 if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
3121 // not necessary for 4th level.
3122 while (buffer.m_utilCount2_ > TOP_COUNT_2_) {
3123 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3124 (byte) (COMMON_TOP_2_ - TOP_COUNT_2_));
3125 buffer.m_utilBytesCount1_++;
3126 buffer.m_utilCount2_ -= TOP_COUNT_2_;
3128 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3129 (byte) (COMMON_TOP_2_ - (buffer.m_utilCount2_ - 1)));
3130 buffer.m_utilBytesCount1_++;
3132 while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
3133 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3134 (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
3135 buffer.m_utilBytesCount1_++;
3136 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
3138 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_,
3139 (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
3140 buffer.m_utilBytesCount1_++;
3142 buffer.m_utilCount2_ = 0;
3144 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, s);
3145 buffer.m_utilBytesCount1_++;
3148 if (buffer.m_utilCount2_ > 0) {
3149 while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
3150 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
3151 buffer.m_utilBytesCount1_++;
3152 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
3154 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
3155 buffer.m_utilBytesCount1_++;
3160 * Compacts the secondary bytes and stores them into the primary array
3162 * @param doFrench flag indicator that French has to be handled specially
3163 * @param buffer collation buffer temporary state
3165 private static final void doSecondary(boolean doFrench, CollationBuffer buffer) {
3166 if (buffer.m_utilCount2_ > 0) {
3167 while (buffer.m_utilCount2_ > BOTTOM_COUNT_2_) {
3168 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
3169 buffer.m_utilBytesCount2_++;
3170 buffer.m_utilCount2_ -= BOTTOM_COUNT_2_;
3172 buffer.m_utilBytes2_ = append(buffer.m_utilBytes2_, buffer.m_utilBytesCount2_, (byte) (COMMON_BOTTOM_2_ + (buffer.m_utilCount2_ - 1)));
3173 buffer.m_utilBytesCount2_++;
3176 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3177 buffer.m_utilBytesCount1_++;
3179 if (doFrench) { // do the reverse copy
3182 if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount2_) {
3183 buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_);
3185 System.arraycopy(buffer.m_utilBytes2_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount2_);
3186 buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount2_;
3191 * Increase buffer size
3193 * @param buffer array of bytes
3194 * @param size of the byte array
3195 * @param incrementsize size to increase
3196 * @return the new buffer
3198 private static final byte[] increase(byte buffer[], int size, int incrementsize) {
3199 byte result[] = new byte[buffer.length + incrementsize];
3200 System.arraycopy(buffer, 0, result, 0, size);
3205 * Increase buffer size
3207 * @param buffer array of ints
3208 * @param size of the byte array
3209 * @param incrementsize size to increase
3210 * @return the new buffer
3212 private static final int[] increase(int buffer[], int size, int incrementsize) {
3213 int result[] = new int[buffer.length + incrementsize];
3214 System.arraycopy(buffer, 0, result, 0, size);
3219 * Compacts the case bytes and stores them into the primary array
3221 * @param buffer collation buffer temporary state
3223 private static final void doCase(CollationBuffer buffer) {
3224 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3225 buffer.m_utilBytesCount1_++;
3226 if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount0_) {
3227 buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_);
3229 System.arraycopy(buffer.m_utilBytes0_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount0_);
3230 buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount0_;
3234 * Compacts the tertiary bytes and stores them into the primary array
3236 * @param buffer collation buffer temporary state
3238 private final void doTertiary(CollationBuffer buffer) {
3239 if (buffer.m_utilCount3_ > 0) {
3240 if (m_common3_ != COMMON_BOTTOM_3_) {
3241 while (buffer.m_utilCount3_ >= m_topCount3_) {
3242 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - m_topCount3_));
3243 buffer.m_utilBytesCount3_++;
3244 buffer.m_utilCount3_ -= m_topCount3_;
3246 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_top3_ - buffer.m_utilCount3_));
3247 buffer.m_utilBytesCount3_++;
3249 while (buffer.m_utilCount3_ > m_bottomCount3_) {
3250 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + m_bottomCount3_));
3251 buffer.m_utilBytesCount3_++;
3252 buffer.m_utilCount3_ -= m_bottomCount3_;
3254 buffer.m_utilBytes3_ = append(buffer.m_utilBytes3_, buffer.m_utilBytesCount3_, (byte) (m_bottom3_ + (buffer.m_utilCount3_ - 1)));
3255 buffer.m_utilBytesCount3_++;
3258 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3259 buffer.m_utilBytesCount1_++;
3260 if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount3_) {
3261 buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_);
3263 System.arraycopy(buffer.m_utilBytes3_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount3_);
3264 buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount3_;
3268 * Compacts the quaternary bytes and stores them into the primary array
3270 * @param buffer collation buffer temporary state
3272 private final void doQuaternary(int commonbottom4, int bottomcount4, CollationBuffer buffer) {
3273 if (buffer.m_utilCount4_ > 0) {
3274 while (buffer.m_utilCount4_ > bottomcount4) {
3275 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + bottomcount4));
3276 buffer.m_utilBytesCount4_++;
3277 buffer.m_utilCount4_ -= bottomcount4;
3279 buffer.m_utilBytes4_ = append(buffer.m_utilBytes4_, buffer.m_utilBytesCount4_, (byte) (commonbottom4 + (buffer.m_utilCount4_ - 1)));
3280 buffer.m_utilBytesCount4_++;
3282 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3283 buffer.m_utilBytesCount1_++;
3284 if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + buffer.m_utilBytesCount4_) {
3285 buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_);
3287 System.arraycopy(buffer.m_utilBytes4_, 0, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, buffer.m_utilBytesCount4_);
3288 buffer.m_utilBytesCount1_ += buffer.m_utilBytesCount4_;
3292 * Deals with the identical sort. Appends the BOCSU version of the source string to the ends of the byte buffer.
3294 * @param source text string
3295 * @param buffer collation buffer temporary state
3297 private static final void doIdentical(String source, CollationBuffer buffer) {
3298 int isize = BOCU.getCompressionLength(source);
3299 buffer.m_utilBytes1_ = append(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, SORT_LEVEL_TERMINATOR_);
3300 buffer.m_utilBytesCount1_++;
3301 if (buffer.m_utilBytes1_.length <= buffer.m_utilBytesCount1_ + isize) {
3302 buffer.m_utilBytes1_ = increase(buffer.m_utilBytes1_, buffer.m_utilBytesCount1_, 1 + isize);
3304 buffer.m_utilBytesCount1_ = BOCU.compress(source, buffer.m_utilBytes1_, buffer.m_utilBytesCount1_);
3308 * Gets the offset of the first unmatched characters in source and target. This method returns the offset of the
3309 * start of a contraction or a combining sequence, if the first difference is in the middle of such a sequence.
3315 * @return offset of the first unmatched characters in source and target.
3317 private final int getFirstUnmatchedOffset(String source, String target) {
3319 int slength = source.length();
3320 int tlength = target.length();
3321 int minlength = slength;
3322 if (minlength > tlength) {
3323 minlength = tlength;
3325 while (result < minlength && source.charAt(result) == target.charAt(result)) {
3329 // There is an identical portion at the beginning of the two
3330 // strings. If the identical portion ends within a contraction or a
3331 // combining character sequence, back up to the start of that
3335 if (result < minlength) {
3336 schar = source.charAt(result); // first differing chars
3337 tchar = target.charAt(result);
3339 schar = source.charAt(minlength - 1);
3340 if (isUnsafe(schar)) {
3342 } else if (slength == tlength) {
3344 } else if (slength < tlength) {
3345 tchar = target.charAt(result);
3347 schar = source.charAt(result);
3350 if (isUnsafe(schar) || isUnsafe(tchar)) {
3351 // We are stopped in the middle of a contraction or combining
3353 // Look backwards for the part of the string for the start of
3355 // It doesn't matter which string we scan, since they are the
3356 // same in this region.
3359 } while (result > 0 && isUnsafe(source.charAt(result)));
3366 * Appending an byte to an array of bytes and increases it if we run out of space
3370 * @param appendindex
3371 * index in the byte array to append
3374 * @return array if array size can accomodate the new value, otherwise a bigger array will be created and returned
3376 private static final byte[] append(byte array[], int appendindex, byte value) {
3378 array[appendindex] = value;
3379 } catch (ArrayIndexOutOfBoundsException e) {
3380 array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
3381 array[appendindex] = value;
3387 * This is a trick string compare function that goes in and uses sortkeys to compare. It is used when compare gets
3388 * in trouble and needs to bail out.
3390 * @param source text string
3391 * @param target text string
3392 * @param buffer collation buffer temporary state
3394 private final int compareBySortKeys(String source, String target, CollationBuffer buffer)
3396 buffer.m_utilRawCollationKey_ = getRawCollationKey(source, buffer.m_utilRawCollationKey_);
3397 // this method is very seldom called
3398 RawCollationKey targetkey = getRawCollationKey(target, null);
3399 return buffer.m_utilRawCollationKey_.compareTo(targetkey);
3403 * Performs the primary comparisons, and fills up the CE buffer at the same time. The return value toggles between
3404 * the comparison result and the hiragana result. If either the source is greater than target or vice versa, the
3405 * return result is the comparison result, ie 1 or -1, furthermore the cebuffers will be cleared when that happens.
3406 * If the primary comparisons are equal, we'll have to continue with secondary comparison. In this case the cebuffer
3407 * will not be cleared and the return result will be the hiragana result.
3409 * @param doHiragana4 flag indicator that Hiragana Quaternary has to be observed
3410 * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
3411 * @param source text string
3412 * @param target text string
3413 * @param textoffset offset in text to start the comparison
3414 * @param buffer collation buffer temporary state
3415 * @return comparion result if a primary difference is found, otherwise hiragana result
3417 private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, String source, String target,
3418 int textoffset, CollationBuffer buffer)
3421 // Preparing the context objects for iterating over strings
3422 buffer.m_srcUtilIter_.setText(source);
3423 buffer.m_srcUtilColEIter_.setText(buffer.m_srcUtilIter_, textoffset);
3424 buffer.m_tgtUtilIter_.setText(target);
3425 buffer.m_tgtUtilColEIter_.setText(buffer.m_tgtUtilIter_, textoffset);
3427 // Non shifted primary processing is quite simple
3428 if (!m_isAlternateHandlingShifted_) {
3429 int hiraganaresult = 0;
3433 // We fetch CEs until we hit a non ignorable primary or end.
3435 sorder = buffer.m_srcUtilColEIter_.next();
3436 buffer.m_srcUtilCEBuffer_ = append(buffer.m_srcUtilCEBuffer_, buffer.m_srcUtilCEBufferSize_, sorder);
3437 buffer.m_srcUtilCEBufferSize_++;
3438 sPrimary = sorder & CE_PRIMARY_MASK_;
3439 } while (sPrimary == CollationElementIterator.IGNORABLE);
3444 torder = buffer.m_tgtUtilColEIter_.next();
3445 buffer.m_tgtUtilCEBuffer_ = append(buffer.m_tgtUtilCEBuffer_, buffer.m_tgtUtilCEBufferSize_, torder);
3446 buffer.m_tgtUtilCEBufferSize_++;
3447 tPrimary = torder & CE_PRIMARY_MASK_;
3448 } while (tPrimary == CollationElementIterator.IGNORABLE);
3450 // if both primaries are the same
3451 if (sPrimary == tPrimary) {
3452 // and there are no more CEs, we advance to the next level
3453 // see if we are at the end of either string
3454 if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3455 if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] != CollationElementIterator.NULLORDER) {
3459 } else if (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3462 if (doHiragana4 && hiraganaresult == 0
3463 && buffer.m_srcUtilColEIter_.m_isCodePointHiragana_ != buffer.m_tgtUtilColEIter_.m_isCodePointHiragana_) {
3464 if (buffer.m_srcUtilColEIter_.m_isCodePointHiragana_) {
3465 hiraganaresult = -1;
3471 if (!isContinuation(sorder) && m_leadBytePermutationTable_ != null) {
3472 sPrimary = (m_leadBytePermutationTable_[sPrimary >>> 24] << 24) | (sPrimary & 0x00FFFFFF);
3473 tPrimary = (m_leadBytePermutationTable_[tPrimary >>> 24] << 24) | (tPrimary & 0x00FFFFFF);
3475 // if two primaries are different, we are done
3476 return endPrimaryCompare(sPrimary, tPrimary, buffer);
3479 // no primary difference... do the rest from the buffers
3480 return hiraganaresult;
3481 } else { // shifted - do a slightly more complicated processing :)
3483 int sorder = getPrimaryShiftedCompareCE(buffer.m_srcUtilColEIter_, lowestpvalue, true, buffer);
3484 int torder = getPrimaryShiftedCompareCE(buffer.m_tgtUtilColEIter_, lowestpvalue, false, buffer);
3485 if (sorder == torder) {
3486 if (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER) {
3492 return endPrimaryCompare(sorder, torder, buffer);
3494 } // no primary difference... do the rest from the buffers
3500 * This is used only for primary strength when we know that sorder is already different from torder. Compares sorder
3501 * and torder, returns -1 if sorder is less than torder. Clears the cebuffer at the same time.
3503 * @param sorder source strength order
3504 * @param torder target strength order
3505 * @param buffer collation buffer temporary state
3506 * @return the comparison result of sorder and torder
3508 private static final int endPrimaryCompare(int sorder, int torder, CollationBuffer buffer) {
3509 // if we reach here, the ce offset accessed is the last ce
3510 // appended to the buffer
3511 boolean isSourceNullOrder = (buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
3512 boolean isTargetNullOrder = (buffer.m_tgtUtilCEBuffer_[buffer.m_tgtUtilCEBufferSize_ - 1] == CollationElementIterator.NULLORDER);
3513 buffer.m_srcUtilCEBufferSize_ = -1;
3514 buffer.m_tgtUtilCEBufferSize_ = -1;
3515 if (isSourceNullOrder) {
3518 if (isTargetNullOrder) {
3521 // getting rid of the sign
3522 sorder >>>= CE_PRIMARY_SHIFT_;
3523 torder >>>= CE_PRIMARY_SHIFT_;
3524 if (sorder < torder) {
3531 * Calculates the next primary shifted value and fills up cebuffer with the next non-ignorable ce.
3533 * @param coleiter collation element iterator
3534 * @param doHiragana4 flag indicator if hiragana quaternary is to be handled
3535 * @param lowestpvalue lowest primary shifted value that will not be ignored
3536 * @param buffer collation buffer temporary state
3537 * @return result next modified ce
3539 private static final int getPrimaryShiftedCompareCE(CollationElementIterator coleiter, int lowestpvalue, boolean isSrc, CollationBuffer buffer)
3541 boolean shifted = false;
3542 int result = CollationElementIterator.IGNORABLE;
3543 int cebuffer[] = buffer.m_srcUtilCEBuffer_;
3544 int cebuffersize = buffer.m_srcUtilCEBufferSize_;
3546 cebuffer = buffer.m_tgtUtilCEBuffer_;
3547 cebuffersize = buffer.m_tgtUtilCEBufferSize_;
3550 result = coleiter.next();
3551 if (result == CollationElementIterator.NULLORDER) {
3552 cebuffer = append(cebuffer, cebuffersize, result);
3555 } else if (result == CollationElementIterator.IGNORABLE
3556 || (shifted && (result & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE)) {
3557 // UCA amendment - ignore ignorables that follow shifted code
3560 } else if (isContinuation(result)) {
3561 if ((result & CE_PRIMARY_MASK_) != CollationElementIterator.IGNORABLE) {
3562 // There is primary value
3564 result = (result & CE_PRIMARY_MASK_) | CE_CONTINUATION_MARKER_;
3565 // preserve interesting continuation
3566 cebuffer = append(cebuffer, cebuffersize, result);
3570 cebuffer = append(cebuffer, cebuffersize, result);
3574 } else { // Just lower level values
3576 cebuffer = append(cebuffer, cebuffersize, result);
3581 if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_, lowestpvalue) > 0) {
3582 cebuffer = append(cebuffer, cebuffersize, result);
3586 if ((result & CE_PRIMARY_MASK_) != 0) {
3588 result &= CE_PRIMARY_MASK_;
3589 cebuffer = append(cebuffer, cebuffersize, result);
3593 cebuffer = append(cebuffer, cebuffersize, result);
3602 buffer.m_srcUtilCEBuffer_ = cebuffer;
3603 buffer.m_srcUtilCEBufferSize_ = cebuffersize;
3605 buffer.m_tgtUtilCEBuffer_ = cebuffer;
3606 buffer.m_tgtUtilCEBufferSize_ = cebuffersize;
3608 result &= CE_PRIMARY_MASK_;
3613 * Appending an int to an array of ints and increases it if we run out of space
3617 * @param appendindex
3618 * index at which value will be appended
3621 * @return array if size is not increased, otherwise a new array will be returned
3623 private static final int[] append(int array[], int appendindex, int value) {
3624 if (appendindex + 1 >= array.length) {
3625 array = increase(array, appendindex, CE_BUFFER_SIZE_);
3627 array[appendindex] = value;
3632 * Does secondary strength comparison based on the collected ces.
3634 * @param doFrench flag indicates if French ordering is to be done
3635 * @param buffer collation buffer temporary state
3636 * @return the secondary strength comparison result
3638 private static final int doSecondaryCompare(boolean doFrench, CollationBuffer buffer) {
3639 // now, we're gonna reexamine collected CEs
3640 if (!doFrench) { // normal
3644 int sorder = CollationElementIterator.IGNORABLE;
3645 while (sorder == CollationElementIterator.IGNORABLE) {
3646 sorder = buffer.m_srcUtilCEBuffer_[soffset++] & CE_SECONDARY_MASK_;
3648 int torder = CollationElementIterator.IGNORABLE;
3649 while (torder == CollationElementIterator.IGNORABLE) {
3650 torder = buffer.m_tgtUtilCEBuffer_[toffset++] & CE_SECONDARY_MASK_;
3653 if (sorder == torder) {
3654 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3655 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3659 } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3663 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3666 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3669 return (sorder < torder) ? -1 : 1;
3672 } else { // do the French
3673 buffer.m_srcUtilContOffset_ = 0;
3674 buffer.m_tgtUtilContOffset_ = 0;
3675 buffer.m_srcUtilOffset_ = buffer.m_srcUtilCEBufferSize_ - 2;
3676 buffer.m_tgtUtilOffset_ = buffer.m_tgtUtilCEBufferSize_ - 2;
3678 int sorder = getSecondaryFrenchCE(true, buffer);
3679 int torder = getSecondaryFrenchCE(false, buffer);
3680 if (sorder == torder) {
3681 if ((buffer.m_srcUtilOffset_ < 0 && buffer.m_tgtUtilOffset_ < 0)
3682 || (buffer.m_srcUtilOffset_ >= 0 && buffer.m_srcUtilCEBuffer_[buffer.m_srcUtilOffset_] == CollationElementIterator.NULLORDER)) {
3686 return (sorder < torder) ? -1 : 1;
3694 * Calculates the next secondary french CE.
3696 * @param isSrc flag indicator if we are calculating the src ces
3697 * @param buffer collation buffer temporary state
3698 * @return result next modified ce
3700 private static final int getSecondaryFrenchCE(boolean isSrc, CollationBuffer buffer) {
3701 int result = CollationElementIterator.IGNORABLE;
3702 int offset = buffer.m_srcUtilOffset_;
3703 int continuationoffset = buffer.m_srcUtilContOffset_;
3704 int cebuffer[] = buffer.m_srcUtilCEBuffer_;
3706 offset = buffer.m_tgtUtilOffset_;
3707 continuationoffset = buffer.m_tgtUtilContOffset_;
3708 cebuffer = buffer.m_tgtUtilCEBuffer_;
3711 while (result == CollationElementIterator.IGNORABLE && offset >= 0) {
3712 if (continuationoffset == 0) {
3713 result = cebuffer[offset];
3714 while (isContinuation(cebuffer[offset--])) {
3716 // after this, sorder is at the start of continuation,
3717 // and offset points before that
3718 if (isContinuation(cebuffer[offset + 1])) {
3719 // save offset for later
3720 continuationoffset = offset;
3724 result = cebuffer[offset++];
3725 if (!isContinuation(result)) {
3726 // we have finished with this continuation
3727 offset = continuationoffset;
3728 // reset the pointer to before continuation
3729 continuationoffset = 0;
3733 result &= CE_SECONDARY_MASK_; // remove continuation bit
3736 buffer.m_srcUtilOffset_ = offset;
3737 buffer.m_srcUtilContOffset_ = continuationoffset;
3739 buffer.m_tgtUtilOffset_ = offset;
3740 buffer.m_tgtUtilContOffset_ = continuationoffset;
3746 * Does case strength comparison based on the collected ces.
3748 * @param buffer collation buffer temporary state
3749 * @return the case strength comparison result
3751 private final int doCaseCompare(CollationBuffer buffer) {
3755 int sorder = CollationElementIterator.IGNORABLE;
3756 int torder = CollationElementIterator.IGNORABLE;
3757 while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3758 sorder = buffer.m_srcUtilCEBuffer_[soffset++];
3759 if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) {
3760 // primary ignorables should not be considered on the case level when the strength is primary
3761 // otherwise, the CEs stop being well-formed
3762 sorder &= CE_CASE_MASK_3_;
3763 sorder ^= m_caseSwitch_;
3765 sorder = CollationElementIterator.IGNORABLE;
3769 while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3770 torder = buffer.m_tgtUtilCEBuffer_[toffset++];
3771 if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || buffer.m_utilCompare2_ == true)) {
3772 // primary ignorables should not be considered on the case level when the strength is primary
3773 // otherwise, the CEs stop being well-formed
3774 torder &= CE_CASE_MASK_3_;
3775 torder ^= m_caseSwitch_;
3777 torder = CollationElementIterator.IGNORABLE;
3781 sorder &= CE_CASE_BIT_MASK_;
3782 torder &= CE_CASE_BIT_MASK_;
3783 if (sorder == torder) {
3784 // checking end of strings
3785 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3786 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3790 } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3794 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3797 if (buffer.m_tgtUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3800 return (sorder < torder) ? -1 : 1;
3807 * Does tertiary strength comparison based on the collected ces.
3809 * @param buffer collation buffer temporary state
3810 * @return the tertiary strength comparison result
3812 private final int doTertiaryCompare(CollationBuffer buffer) {
// Compares the CE sequences held in buffer.m_srcUtilCEBuffer_ and buffer.m_tgtUtilCEBuffer_
// on their tertiary weights only.
3816 int sorder = CollationElementIterator.IGNORABLE;
3817 int torder = CollationElementIterator.IGNORABLE;
// Advance through the source CEs until one has a tertiary weight that is not ignorable
// once the case bits are removed.
3818 while ((sorder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3819 sorder = buffer.m_srcUtilCEBuffer_[soffset++];
3820 if (!isContinuation(sorder)) {
// Ordinary CE: keep the bits selected by m_mask3_ and apply the case-first switch.
3821 sorder = (sorder & m_mask3_) ^ m_caseSwitch_;
// NOTE(review): presumably the continuation-CE branch - the case switch is not applied and
// the case bits are cleared instead; confirm against the full source.
3823 sorder = (sorder & m_mask3_) & CE_REMOVE_CASE_;
// Same scan for the target CEs.
3827 while ((torder & CE_REMOVE_CASE_) == CollationElementIterator.IGNORABLE) {
3828 torder = buffer.m_tgtUtilCEBuffer_[toffset++];
3829 if (!isContinuation(torder)) {
3830 torder = (torder & m_mask3_) ^ m_caseSwitch_;
3832 torder = (torder & m_mask3_) & CE_REMOVE_CASE_;
// Equal weights: look at the CEs just consumed for the NULLORDER end marker to find out
// whether one or both sequences are exhausted.
3836 if (sorder == torder) {
3837 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3838 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3842 } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3846 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3849 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
// Weights differ: the side with the smaller tertiary weight sorts first.
3852 return (sorder < torder) ? -1 : 1;
3859 * Does quaternary strength comparison based on the collected ces.
3861 * @param lowestpvalue the lowest primary value that will not be ignored if alternate handling is shifted
3862 * @param buffer collation buffer temporary state
3863 * @return the quaternary strength comparison result
3865 private final int doQuaternaryCompare(int lowestpvalue, CollationBuffer buffer) {
// Compares the CE sequences held in buffer.m_srcUtilCEBuffer_ and buffer.m_tgtUtilCEBuffer_
// on the quaternary level, using lowestpvalue as the threshold for the unsigned CE comparison.
3866 boolean sShifted = true;
3867 boolean tShifted = true;
3871 int sorder = CollationElementIterator.IGNORABLE;
3872 int torder = CollationElementIterator.IGNORABLE;
// Keep reading source CEs while the current one is ignorable, or is a continuation CE and
// sShifted is false.
3873 while (sorder == CollationElementIterator.IGNORABLE || (isContinuation(sorder) && !sShifted)) {
3874 sorder = buffer.m_srcUtilCEBuffer_[soffset++];
3875 if (isContinuation(sorder)) {
3879 } else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
3880 || (sorder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
// The CE is unsigned-greater than lowestpvalue or has an ignorable primary weight:
// CE_PRIMARY_MASK_ is used as its comparison value.
3882 sorder = CE_PRIMARY_MASK_;
// Unsigned shift, so only the bits above CE_PRIMARY_SHIFT_ take part in the comparison.
3888 sorder >>>= CE_PRIMARY_SHIFT_;
// Same scan for the target CEs.
3889 while (torder == CollationElementIterator.IGNORABLE || (isContinuation(torder) && !tShifted)) {
3890 torder = buffer.m_tgtUtilCEBuffer_[toffset++];
3891 if (isContinuation(torder)) {
3895 } else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
3896 || (torder & CE_PRIMARY_MASK_) == CollationElementIterator.IGNORABLE) {
3898 torder = CE_PRIMARY_MASK_;
3904 torder >>>= CE_PRIMARY_SHIFT_;
// Equal values: look at the CEs just consumed for the NULLORDER end marker to find out
// whether one or both sequences are exhausted.
3906 if (sorder == torder) {
3907 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3908 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] != CollationElementIterator.NULLORDER) {
3912 } else if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
3916 if (buffer.m_srcUtilCEBuffer_[soffset - 1] == CollationElementIterator.NULLORDER) {
3919 if (buffer.m_tgtUtilCEBuffer_[toffset - 1] == CollationElementIterator.NULLORDER) {
// Values differ: the side with the smaller value sorts first.
3922 return (sorder < torder) ? -1 : 1;
3929 * Internal function. Does byte level string compare. Used by strcoll if strength == identical and strings are
3930 * otherwise equal. This is a rare case. Comparison must be done on NFD normalized strings. FCD is not good enough.
3937 * of the first difference in the text strings
3939 * flag indicating if we are to normalize the text before comparison
3940 * @return 1 if source is greater than target, -1 less than and 0 if equals
// Brings both strings into NFD where necessary and then delegates to doStringCompare, which
// compares them starting at offset.
3942 private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize)
// Decompose only when the quick check cannot confirm that the string is already in NFD.
// decompose(..., false) is presumably the canonical (non-compatibility) form - confirm
// against the Normalizer API.
3946 if (Normalizer.quickCheck(source, Normalizer.NFD, 0) != Normalizer.YES) {
3947 source = Normalizer.decompose(source, false);
3950 if (Normalizer.quickCheck(target, Normalizer.NFD, 0) != Normalizer.YES) {
3951 target = Normalizer.decompose(target, false);
3956 return doStringCompare(source, target, offset);
3960 * Compares strings in code point order. This comparison handles surrogate characters and places them after
3961 * all non-surrogate characters.
3968 * start offset for comparison
3969 * @return 1 if source is greater than target, -1 less than and 0 if equals
3971 private static final int doStringCompare(String source, String target, int offset) {
3972 // compare identical prefixes - they do not need to be fixed up
3975 int slength = source.length();
3976 int tlength = target.length();
3977 int minlength = Math.min(slength, tlength);
// Scan forward from offset, over the length both strings share, looking for the first
// position where the characters differ.
3978 while (offset < minlength) {
3979 schar = source.charAt(offset);
3980 tchar = target.charAt(offset++);
3981 if (schar != tchar) {
// No difference found within the shared length: the outcome depends on which string,
// if either, still has characters left.
3986 if (schar == tchar && offset == minlength) {
3987 if (slength > minlength) {
3990 if (tlength > minlength) {
3996 // if both values are in or above the surrogate range, Fix them up.
3997 if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
3998 schar = fixupUTF16(schar);
3999 tchar = fixupUTF16(tchar);
4002 // now schar and tchar are in UTF-32-compatible order
4003 return (schar < tchar) ? -1 : 1; // schar and tchar have to be different
4007 * Rotate surrogates to the top to get code point order
4009 private static final char fixupUTF16(char ch) {
4018 private static final int UCOL_REORDER_CODE_IGNORE = ReorderCodes.LIMIT + 1;
4020 * Builds the lead byte permutation table
4022 private void buildPermutationTable() {
// Derives m_leadBytePermutationTable_ from m_reorderCodes_, then flags the Latin-1 table for
// regeneration and calls updateInternalState().
// No codes, or the single code NONE: the permutation table is cleared.
4023 if (m_reorderCodes_ == null || m_reorderCodes_.length == 0 || (m_reorderCodes_.length == 1 && m_reorderCodes_[0] == ReorderCodes.NONE)) {
4024 m_leadBytePermutationTable_ = null;
// DEFAULT is only accepted as the sole code; it is replaced by a copy of
// m_defaultReorderCodes_ (or clears the table when there are none).
4028 if (m_reorderCodes_[0] == ReorderCodes.DEFAULT) {
4029 if (m_reorderCodes_.length != 1) {
4030 throw new IllegalArgumentException("Illegal collation reorder codes - default reorder code must be the only code in the list.");
4032 // swap the reorder codes for those at build of the rules
4033 if (m_defaultReorderCodes_ == null || m_defaultReorderCodes_.length == 0) {
4034 m_leadBytePermutationTable_ = null;
4037 m_reorderCodes_ = m_defaultReorderCodes_.clone();
4040 // TODO - these need to be read in from the UCA data file
4041 // The lowest byte that hasn't been assigned a mapping
4042 int toBottom = 0x03;
4043 // The highest byte that hasn't been assigned a mapping
4046 // filled slots in the output m_scriptOrder_
// permutationSlotFilled is indexed by the original lead byte; newLeadByteUsed is indexed by
// the byte value that has been handed out as a mapping target.
4047 boolean[] permutationSlotFilled = new boolean[256];
4050 boolean[] newLeadByteUsed = new boolean[256];
4052 if (m_leadBytePermutationTable_ == null) {
4053 m_leadBytePermutationTable_ = new byte[256];
4056 // prefill the reordering codes with the leading entries
// internalReorderCodes = the special codes FIRST..LIMIT-1 followed by the requested codes.
// A special code that is also requested explicitly is marked UCOL_REORDER_CODE_IGNORE in
// the prefix, so only its explicit entry further down the list remains.
4057 int[] internalReorderCodes = new int[m_reorderCodes_.length + (ReorderCodes.LIMIT - ReorderCodes.FIRST)];
4058 for (int codeIndex = 0; codeIndex < ReorderCodes.LIMIT - ReorderCodes.FIRST; codeIndex++) {
4059 internalReorderCodes[codeIndex] = ReorderCodes.FIRST + codeIndex;
4061 for (int codeIndex = 0; codeIndex < m_reorderCodes_.length; codeIndex++) {
4062 internalReorderCodes[codeIndex + (ReorderCodes.LIMIT - ReorderCodes.FIRST)] = m_reorderCodes_[codeIndex];
4063 if (m_reorderCodes_[codeIndex] >= ReorderCodes.FIRST && m_reorderCodes_[codeIndex] < ReorderCodes.LIMIT) {
4064 internalReorderCodes[m_reorderCodes_[codeIndex] - ReorderCodes.FIRST] = UCOL_REORDER_CODE_IGNORE;
4069 * Start from the front of the list and place each script we encounter at the earliest possible locatation
4070 * in the permutation table. If we encounter UNKNOWN, start processing from the back, and place each script
4071 * in the last possible location. At each step, we also need to make sure that any scripts that need to not
4072 * be moved are copied to their same location in the final table.
4074 boolean fromTheBottom = true;
4075 int reorderCodesIndex = -1;
// reorderCodesIndex steps forward while fromTheBottom is true and backward afterwards;
// reorderCodesCount only bounds the number of iterations.
4076 for (int reorderCodesCount = 0; reorderCodesCount < internalReorderCodes.length; reorderCodesCount++) {
4077 reorderCodesIndex += fromTheBottom ? 1 : -1;
4078 int next = internalReorderCodes[reorderCodesIndex];
4079 if (next == UCOL_REORDER_CODE_IGNORE) {
// UNKNOWN turns the walk around: the index restarts at the end of the list. A second
// UNKNOWN is rejected.
4082 if (next == UScript.UNKNOWN) {
4083 if (fromTheBottom == false) {
4084 // double turnaround
4085 m_leadBytePermutationTable_ = null;
4086 throw new IllegalArgumentException("Illegal collation reorder codes - two \"from the end\" markers.");
4088 fromTheBottom = false;
4089 reorderCodesIndex = internalReorderCodes.length;
4093 int[] leadBytes = RuleBasedCollator.LEADBYTE_CONSTANTS_.getLeadBytesForReorderCode(next);
// Before the turnaround, lead bytes are mapped to toBottom in ascending order.
4094 if (fromTheBottom) {
4095 for (int leadByte : leadBytes) {
4096 // don't place a lead byte twice in the permutation table
4097 if (permutationSlotFilled[leadByte]) {
4098 // lead byte already used
4099 m_leadBytePermutationTable_ = null;
4100 throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte.");
4102 m_leadBytePermutationTable_[leadByte] = (byte) toBottom;
4103 newLeadByteUsed[toBottom] = true;
4104 permutationSlotFilled[leadByte] = true;
// After the turnaround, lead bytes are visited last-to-first and mapped to toTop.
4108 for (int leadByteIndex = leadBytes.length - 1; leadByteIndex >= 0; leadByteIndex--) {
4109 int leadByte = leadBytes[leadByteIndex];
4110 // don't place a lead byte twice in the permutation table
4111 if (permutationSlotFilled[leadByte]) {
4112 // lead byte already used
4113 m_leadBytePermutationTable_ = null;
4114 throw new IllegalArgumentException("Illegal reorder codes specified - multiple codes with the same lead byte.");
4117 m_leadBytePermutationTable_[leadByte] = (byte) toTop;
4118 newLeadByteUsed[toTop] = true;
4119 permutationSlotFilled[leadByte] = true;
4125 /* Copy everything that's left over */
4126 int reorderCode = 0;
4127 for (int i = 0; i < 256; i++) {
4128 if (!permutationSlotFilled[i]) {
// NOTE(review): newLeadByteUsed has 256 entries and is indexed in the loop condition before
// the `reorderCode > 255` check runs, so if reorderCode ever reached 256 this would raise
// ArrayIndexOutOfBoundsException instead of the IllegalArgumentException below - confirm.
4129 while (newLeadByteUsed[reorderCode]) {
4130 if (reorderCode > 255) {
4131 throw new IllegalArgumentException("Unable to fill collation reordering table slots - no available reordering code.");
4135 m_leadBytePermutationTable_[i] = (byte) reorderCode;
4136 permutationSlotFilled[i] = true;
4137 newLeadByteUsed[reorderCode] = true;
4141 // for (int i = 0; i < 256; i++){
4142 // System.out.println(Integer.toString(i, 16) + " -> " + Integer.toString(m_scriptReorderTable_[i], 16));
// The Latin-1 table depends on the permutation (addLatinOneEntry reads it), so it is flagged
// for regeneration before the internal state is refreshed.
4144 latinOneRegenTable_ = true;
4145 updateInternalState();
4149 * Resets the internal case data members and compression values.
4151 private void updateInternalState() {
// Recomputes the derived case/tertiary settings, the tertiary compression counts, and the
// m_isSimple3_ and latinOneUse_ flags from the current attribute fields.
4152 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
4153 m_caseSwitch_ = CASE_SWITCH_;
4155 m_caseSwitch_ = NO_CASE_SWITCH_;
// With a case level, or with case-first off, the tertiary mask removes the case bits.
4158 if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
4159 m_mask3_ = CE_REMOVE_CASE_;
4160 m_common3_ = COMMON_NORMAL_3_;
4161 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
4162 m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
4163 m_bottom3_ = COMMON_BOTTOM_3_;
// NOTE(review): presumably the `else` branch - the case bits are kept and the common/top/
// bottom values depend on whether upper case sorts first; confirm against the full source.
4165 m_mask3_ = CE_KEEP_CASE_;
4166 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
4167 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
4168 m_common3_ = COMMON_UPPER_FIRST_3_;
4169 m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
4170 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
4172 m_common3_ = COMMON_NORMAL_3_;
4173 m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
4174 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
4178 // Set the compression values
// total3 counts the values strictly between m_bottom3_ and m_top3_; it is split into
// m_topCount3_ (the PROPORTION_3_ share, truncated) and m_bottomCount3_ (the remainder).
4179 int total3 = m_top3_ - m_bottom3_ - 1;
4180 // we multiply double with int, but need only int
4181 m_topCount3_ = (int) (PROPORTION_3_ * total3);
4182 m_bottomCount3_ = total3 - m_topCount3_;
// m_isSimple3_ is set only for tertiary strength with no case level, no French collation
// and no shifted alternate handling.
4184 if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ && !m_isFrenchCollation_
4185 && !m_isAlternateHandlingShifted_) {
4186 m_isSimple3_ = true;
4188 m_isSimple3_ = false;
// The Latin-1 table is considered only when there is no case level, the strength is at most
// tertiary, numeric collation and shifted alternate handling are off, and no earlier build
// attempt has failed.
4190 if (!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
4191 && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
// Build (or rebuild) the table when it is missing or flagged for regeneration.
4192 if (latinOneCEs_ == null || latinOneRegenTable_) {
4193 if (setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
4194 latinOneUse_ = true;
4196 latinOneUse_ = false;
4197 latinOneFailed_ = true;
4199 latinOneRegenTable_ = false;
4200 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
4201 latinOneUse_ = true;
4204 latinOneUse_ = false;
4210 * Initializes the RuleBasedCollator
4212 private final void init() {
// Computes the m_minUnsafe_ and m_minContractionEnd_ heuristics, restores every attribute
// from its m_default* counterpart, and finishes with updateInternalState().
4213 for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; m_minUnsafe_++) {
4214 // Find the smallest unsafe char.
4215 if (isUnsafe(m_minUnsafe_)) {
4220 for (m_minContractionEnd_ = 0; m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; m_minContractionEnd_++) {
4221 // Find the smallest contraction-ending char.
4222 if (isContractionEnd(m_minContractionEnd_)) {
// NOTE(review): latinOneFailed_ is true while the defaults are restored and reset to false
// further down; presumably this keeps the setters called here from triggering a Latin-1
// table build (updateInternalState skips it while the flag is set) - confirm against
// setStrength/setDecomposition.
4226 latinOneFailed_ = true;
4227 setStrength(m_defaultStrength_);
4228 setDecomposition(m_defaultDecomposition_);
4229 m_variableTopValue_ = m_defaultVariableTopValue_;
4230 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
4231 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
4232 m_isCaseLevel_ = m_defaultIsCaseLevel_;
4233 m_caseFirst_ = m_defaultCaseFirst_;
4234 m_isHiragana4_ = m_defaultIsHiragana4_;
4235 m_isNumericCollation_ = m_defaultIsNumericCollation_;
4236 latinOneFailed_ = false;
// The reorder codes are copied, so m_reorderCodes_ never shares an array with the defaults.
4237 if (m_defaultReorderCodes_ != null) {
4238 m_reorderCodes_ = m_defaultReorderCodes_.clone();
4240 m_reorderCodes_ = null;
4242 updateInternalState();
4245 // Consts for Latin-1 special processing
4246 private static final int ENDOFLATINONERANGE_ = 0xFF;
4247 private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_ + 50);
4248 private static final int BAIL_OUT_CE_ = 0xFF000000;
4251 * Generate latin-1 tables
4254 private static class shiftValues {
4260 private final void addLatinOneEntry(char ch, int CE, shiftValues sh) {
// Splits CE into two primary bytes, a secondary and a tertiary part and ORs each non-zero
// part into the entry for ch in the matching plane of latinOneCEs_: primary at [ch],
// secondary at [latinOneTableLen_ + ch], tertiary at [2 * latinOneTableLen_ + ch].
// The bit positions come from sh.primShift / sh.secShift / sh.terShift.
4261 int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
4262 boolean continuation = isContinuation(CE);
4263 boolean reverseSecondary = false;
4264 if (!continuation) {
// Ordinary CE: tertiary is masked with m_mask3_ and the case-first switch is applied.
4265 tertiary = ((CE & m_mask3_));
4266 tertiary ^= m_caseSwitch_;
4267 reverseSecondary = true;
// NOTE(review): presumably the continuation branch - the continuation marker and the case
// bits are stripped and the secondary is never reversed; confirm against the full source.
4269 tertiary = (byte) ((CE & CE_REMOVE_CONTINUATION_MASK_));
4270 tertiary &= CE_REMOVE_CASE_;
4271 reverseSecondary = false;
// Each `CE >>>= 8` discards the byte just consumed, so (assuming LAST_BYTE_MASK_ selects the
// low byte) the three statements read the original bits 8-15, 16-23 and 24-31 in turn.
4274 secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
4275 primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
4276 primary1 = (CE >>> 8);
4278 if (primary1 != 0) {
// The lead-byte permutation is applied to the first primary byte of non-continuation CEs.
// NOTE(review): buildPermutationTable allocates the table as byte[256], so a mapped value of
// 0x80 or above is read back sign-extended; with a shift below 24 the extra one-bits would
// spill into the higher bytes of the entry - confirm whether a `& 0xFF` is needed.
4279 if (m_leadBytePermutationTable_ != null && !continuation) {
4280 primary1 = m_leadBytePermutationTable_[primary1];
4282 latinOneCEs_[ch] |= (primary1 << sh.primShift);
4285 if (primary2 != 0) {
// No room left for another primary byte: all three planes of this entry get the bail-out marker.
4286 if (sh.primShift < 0) {
4287 latinOneCEs_[ch] = BAIL_OUT_CE_;
4288 latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4289 latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4292 latinOneCEs_[ch] |= (primary2 << sh.primShift);
4295 if (secondary != 0) {
// French collation: the bytes already stored are shifted down and the new secondary goes
// into the top byte, so secondaries end up in reverse order.
4296 if (reverseSecondary && m_isFrenchCollation_) { // reverse secondary
4297 latinOneCEs_[latinOneTableLen_ + ch] >>>= 8; // make space for secondary
4298 latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << 24);
4299 } else { // normal case
4300 latinOneCEs_[latinOneTableLen_ + ch] |= (secondary << sh.secShift);
4304 if (tertiary != 0) {
4305 latinOneCEs_[2 * latinOneTableLen_ + ch] |= (tertiary << sh.terShift);
4310 private final void resizeLatinOneTable(int newSize) {
// Reallocates latinOneCEs_ as three planes of newSize entries each and copies the leading
// min(newSize, latinOneTableLen_) entries of every old plane to the start of its new plane.
4311 int newTable[] = new int[3 * newSize];
4312 int sizeToCopy = ((newSize < latinOneTableLen_) ? newSize : latinOneTableLen_);
4313 // uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
// Old planes start at 0, latinOneTableLen_ and 2 * latinOneTableLen_; new planes start at
// 0, newSize and 2 * newSize.
4314 System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
4315 System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
4316 System.arraycopy(latinOneCEs_, 2 * latinOneTableLen_, newTable, 2 * newSize, sizeToCopy);
4317 latinOneTableLen_ = newSize;
4318 latinOneCEs_ = newTable;
4321 private final boolean setUpLatinOne() {
// Fills latinOneCEs_ with entries for the characters 0..ENDOFLATINONERANGE_. Results for
// contractions are stored in additional slots starting at ENDOFLATINONERANGE_ + 1.
// (Re)allocate the three planes when the table is missing or a reallocation was requested.
4322 if (latinOneCEs_ == null || m_reallocLatinOneCEs_) {
4323 latinOneCEs_ = new int[3 * LATINONETABLELEN_];
4324 latinOneTableLen_ = LATINONETABLELEN_;
4325 m_reallocLatinOneCEs_ = false;
4327 Arrays.fill(latinOneCEs_, 0);
4329 if (m_ContInfo_ == null) {
4330 m_ContInfo_ = new ContractionInfo();
4333 // StringBuffer sCh = new StringBuffer();
4334 // CollationElementIterator it = getCollationElementIterator(sCh.toString());
4335 CollationElementIterator it = getCollationElementIterator("");
4337 shiftValues s = new shiftValues();
// Next free slot for contraction results, just past the Latin-1 range.
4339 char contractionOffset = ENDOFLATINONERANGE_ + 1;
4341 for (ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
4346 CE = m_trie_.getLatin1LinearValue(ch);
4348 CE = m_trie_.getLeadValue(ch);
// Fall back to the UCA trie when this collator's own trie has no value for ch.
4349 if (CE == CollationElementIterator.CE_NOT_FOUND_) {
4350 CE = UCA_.m_trie_.getLeadValue(ch);
4353 if (!isSpecial(CE)) {
4354 addLatinOneEntry(ch, CE, s);
4356 switch (RuleBasedCollator.getTag(CE)) {
4357 case CollationElementIterator.CE_EXPANSION_TAG_:
4358 case CollationElementIterator.CE_DIGIT_TAG_:
4359 // sCh.delete(0, sCh.length());
4361 // it.setText(sCh.toString());
// Expansions and digits: run the regular iterator over the single character and add
// every CE it yields to the entry for ch.
4362 it.setText(UCharacter.toString(ch));
4363 while ((CE = it.next()) != CollationElementIterator.NULLORDER) {
// A negative shift in any plane means the entry has no room left: mark it as bail-out.
4364 if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4365 latinOneCEs_[ch] = BAIL_OUT_CE_;
4366 latinOneCEs_[latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4367 latinOneCEs_[2 * latinOneTableLen_ + ch] = BAIL_OUT_CE_;
4370 addLatinOneEntry(ch, CE, s);
4373 case CollationElementIterator.CE_CONTRACTION_TAG_:
4374 // here is the trick
4375 // F2 is contraction. We do something very similar to contractions
4376 // but have two indices, one in the real contraction table and the
4377 // other to where we stuffed things. This hopes that we don't have
4378 // many contractions (this should work for latin-1 tables).
// Bits 12-23 of the CE must be free, because the Latin-1 table offset is inserted there below.
4380 if ((CE & 0x00FFF000) != 0) {
4381 latinOneFailed_ = true;
4385 int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; // getContractionOffset(CE)]
4387 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
// The augmented contraction CE is stored in all three planes of the entry for ch.
4389 latinOneCEs_[ch] = CE;
4390 latinOneCEs_[latinOneTableLen_ + ch] = CE;
4391 latinOneCEs_[2 * latinOneTableLen_ + ch] = CE;
4393 // We're going to jump into contraction table, pick the elements
4396 // CE = *(contractionCEs + (UCharOffset - contractionIndex));
4397 CE = m_contractionCE_[UCharOffset];
// The contraction result is an expansion: add each expansion CE to the slot at contractionOffset.
4398 if (isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
4399 int i; /* general counter */
4400 // uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to
4401 // expansion table */
4402 int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; // it.getExpansionOffset(this,
4404 int size = CE & 0xF; // getExpansionCount(CE);
4405 // CE = *CEOffset++;
4406 if (size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
4407 for (i = 0; i < size; i++) {
4408 if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4409 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4410 latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4411 latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4414 addLatinOneEntry(contractionOffset, m_expansion_[offset + i], s);
4416 } else { /* else, we do */
4417 while (m_expansion_[offset] != 0) {
4418 if (s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4419 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4420 latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4421 latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4424 addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
4427 contractionOffset++;
// A plain CE is added directly; any other special result marks the slot as bail-out.
4428 } else if (!isSpecial(CE)) {
4429 addLatinOneEntry(contractionOffset++, CE, s);
4431 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4432 latinOneCEs_[latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4433 latinOneCEs_[2 * latinOneTableLen_ + contractionOffset] = BAIL_OUT_CE_;
4434 contractionOffset++;
// The table is full: double it before the next slot is written.
4440 if (contractionOffset == latinOneTableLen_) { // we need to reallocate
4441 resizeLatinOneTable(2 * latinOneTableLen_);
// 0xFFFF in m_contractionIndex_ ends this contraction's list.
4443 } while (m_contractionIndex_[UCharOffset] != 0xFFFF);
4446 case CollationElementIterator.CE_SPEC_PROC_TAG_: {
4447 // 0xB7 is a precontext character defined in UCA5.1, a special
4448 // handler is implemented in order to save LatinOne table for
4451 addLatinOneEntry(ch, CE, s);
4453 latinOneFailed_ = true;
4459 latinOneFailed_ = true;
// Trim the table down to the slots actually used.
4465 if (contractionOffset < latinOneTableLen_) {
4466 resizeLatinOneTable(contractionOffset);
4471 private static class ContractionInfo {
4475 ContractionInfo m_ContInfo_;
4477 private int getLatinOneContraction(int strength, int CE, String s) {
// Resolves a contraction CE taken from the Latin-1 table. strength selects the plane of
// latinOneCEs_ (the visible callers pass 0, 1 or 2); the next character of s is read at
// m_ContInfo_.index, which is advanced when a character is consumed.
4478 // int strength, int CE, String s, Integer ind) {
4479 int len = s.length();
4480 // const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
// Low 12 bits: position in the contraction tables, relative to m_contractionOffset_.
4481 int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
// Bits 12-23: the slot in latinOneCEs_ that setUpLatinOne inserted into the CE.
4483 int latinOneOffset = (CE & 0x00FFF000) >>> 12;
4484 char schar = 0, tchar = 0;
4488 * if(len == -1) { if(s[*index] == 0) { // end of string
4489 * return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); } else { schar = s[*index]; }
// End of string: return the value stored in the contraction's first slot.
4492 if (m_ContInfo_.index == len) {
4493 return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
4495 schar = s.charAt(m_ContInfo_.index);
4499 while (schar > (tchar = m_contractionIndex_[UCharOffset + offset]/** (UCharOffset+offset) */
4500 )) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
// Match: consume the character and return the value stored `offset` slots further on.
4504 if (schar == tchar) {
4505 m_ContInfo_.index++;
4506 return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset + offset]);
// A character outside Latin-1 cannot be handled by this table.
4508 if (schar > ENDOFLATINONERANGE_ /* & 0xFF00 */) {
4509 return BAIL_OUT_CE_;
4511 // skip completely ignorables
4512 int isZeroCE = m_trie_.getLeadValue(schar); // UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
4513 if (isZeroCE == 0) { // we have to ignore completely ignorables
4514 m_ContInfo_.index++;
// No match: return the value stored in the contraction's first slot.
4518 return (latinOneCEs_[strength * latinOneTableLen_ + latinOneOffset]);
4524 * This is a fast strcoll, geared towards text in Latin-1. It supports contractions of size two, French secondaries
4525 * and case switching. You can use it with strengths primary to tertiary. It does not support shifted and case
4526 * level. It relies on the table built by setUpLatinOne(). If it doesn't understand something, it will go to the
4528 * @param buffer collation buffer temporary state
4530 private final int compareUseLatin1(String source, String target, int startOffset, CollationBuffer buffer) {
// Compares source and target from startOffset using the precomputed latinOneCEs_ table,
// one level at a time (primary, then secondary, then tertiary, as far as the strength
// requires). It hands over to compareRegular() whenever it meets a character above
// ENDOFLATINONERANGE_ or a table entry that is still special after contraction handling.
4531 int sLen = source.length();
4532 int tLen = target.length();
4534 int strength = getStrength();
4536 int sIndex = startOffset, tIndex = startOffset;
4537 char sChar = 0, tChar = 0;
4538 int sOrder = 0, tOrder = 0;
4540 boolean endOfSource = false;
4542 // uint32_t *elements = coll->latinOneCEs;
4544 boolean haveContractions = false; // if we have contractions in our string
4545 // we cannot do French secondary
// offset is the start of the plane read by the secondary loops; the primary loop indexes
// latinOneCEs_ directly and the tertiary loop adds latinOneTableLen_ once more.
4547 int offset = latinOneTableLen_;
4549 // Do the primary level
4552 while (sOrder == 0) { // this loop skips primary ignorables
4553 // sOrder=getNextlatinOneCE(source);
4554 if (sIndex == sLen) {
4558 sChar = source.charAt(sIndex++); // [sIndex++];
4560 if (sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4561 // fprintf(stderr, "R");
4562 return compareRegular(source, target, startOffset, buffer);
4564 sOrder = latinOneCEs_[sChar];
4565 if (isSpecial(sOrder)) { // if we got a special
4566 // specials can basically be either contractions or bail-out signs. If we get anything
4567 // else, we'll bail out anyway
4568 if (getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
// m_ContInfo_.index carries the string position into and out of getLatinOneContraction.
4569 m_ContInfo_.index = sIndex;
4570 sOrder = getLatinOneContraction(0, sOrder, source);
4571 sIndex = m_ContInfo_.index;
4572 haveContractions = true; // if there are contractions, we cannot do French secondary
4573 // However, if there are contractions in the table, but we always use just one char,
4574 // we might be able to do French. This should be checked out.
4576 if (isSpecial(sOrder) /* == UCOL_BAIL_OUT_CE */) {
4577 // fprintf(stderr, "S");
4578 return compareRegular(source, target, startOffset, buffer);
4583 while (tOrder == 0) { // this loop skips primary ignorables
4584 // tOrder=getNextlatinOneCE(target);
4585 if (tIndex == tLen) {
4592 tChar = target.charAt(tIndex++); // [tIndex++];
4593 if (tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4594 // fprintf(stderr, "R");
4595 return compareRegular(source, target, startOffset, buffer);
4597 tOrder = latinOneCEs_[tChar];
4598 if (isSpecial(tOrder)) {
4599 // Handling specials, see the comments for source
4600 if (getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4601 m_ContInfo_.index = tIndex;
4602 tOrder = getLatinOneContraction(0, tOrder, target);
4603 tIndex = m_ContInfo_.index;
4604 haveContractions = true;
4606 if (isSpecial(tOrder)/* == UCOL_BAIL_OUT_CE */) {
4607 // fprintf(stderr, "S");
4608 return compareRegular(source, target, startOffset, buffer);
4612 if (endOfSource) { // source is finished, but target is not, say the result.
4616 if (sOrder == tOrder) { // if we have same CEs, we continue the loop
4621 // compare current top bytes
4622 if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4623 // top bytes differ, return difference
// The unsigned shift clears the sign bit, so the comparison orders the top bytes as unsigned values.
4624 if (sOrder >>> 8 < tOrder >>> 8) {
4629 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
4630 // since we must return enum value
4633 // top bytes match, continue with following bytes
4639 // after primary loop, we definitely know the sizes of strings,
4640 // so we set it and use simpler loop for secondaries and tertiaries
4641 // sLen = sIndex; tLen = tIndex;
4642 if (strength >= SECONDARY) {
4643 // adjust the table beginning
4644 // latinOneCEs_ += coll->latinOneTableLen;
4645 endOfSource = false;
4647 if (!m_isFrenchCollation_) { // non French
4648 // This loop is a simplified copy of primary loop
4649 // at this point we know that whole strings are latin-1, so we don't
4650 // check for that. We also know that we only have contractions as
4652 // sIndex = 0; tIndex = 0;
4653 sIndex = startOffset;
4654 tIndex = startOffset;
4656 while (sOrder == 0) {
4657 if (sIndex == sLen) {
4661 sChar = source.charAt(sIndex++); // [sIndex++];
4662 sOrder = latinOneCEs_[offset + sChar];
4663 if (isSpecial(sOrder)) {
4664 m_ContInfo_.index = sIndex;
4665 sOrder = getLatinOneContraction(1, sOrder, source);
4666 sIndex = m_ContInfo_.index;
4670 while (tOrder == 0) {
4671 if (tIndex == tLen) {
4678 tChar = target.charAt(tIndex++); // [tIndex++];
4679 tOrder = latinOneCEs_[offset + tChar];
4680 if (isSpecial(tOrder)) {
4681 m_ContInfo_.index = tIndex;
4682 tOrder = getLatinOneContraction(1, tOrder, target);
4683 tIndex = m_ContInfo_.index;
4690 if (sOrder == tOrder) {
4695 // see primary loop for comments on this
4696 if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4697 if (sOrder >>> 8 < tOrder >>> 8) {
4708 if (haveContractions) { // if we have contractions, we have to bail out
4709 // since we don't really know how to handle them here
4710 return compareRegular(source, target, startOffset, buffer);
4712 // For French, we go backwards
// Both indices are decremented until they reach startOffset.
4715 secFLoop: for (;;) {
4716 while (sOrder == 0) {
4717 if (sIndex == startOffset) {
4721 sChar = source.charAt(--sIndex); // [--sIndex];
4722 sOrder = latinOneCEs_[offset + sChar];
4723 // don't even look for contractions
4726 while (tOrder == 0) {
4727 if (tIndex == startOffset) {
4734 tChar = target.charAt(--tIndex); // [--tIndex];
4735 tOrder = latinOneCEs_[offset + tChar];
4736 // don't even look for contractions
4742 if (sOrder == tOrder) {
4747 // see the primary loop for comments
4748 if (((sOrder ^ tOrder) & 0xFF000000) != 0) {
4749 if (sOrder >>> 8 < tOrder >>> 8) {
4762 if (strength >= TERTIARY) {
4763 // tertiary loop is the same as secondary (except no French)
// Move offset on to the third plane of latinOneCEs_.
4764 offset += latinOneTableLen_;
4765 // sIndex = 0; tIndex = 0;
4766 sIndex = startOffset;
4767 tIndex = startOffset;
4768 endOfSource = false;
4770 while (sOrder == 0) {
4771 if (sIndex == sLen) {
4775 sChar = source.charAt(sIndex++); // [sIndex++];
4776 sOrder = latinOneCEs_[offset + sChar];
4777 if (isSpecial(sOrder)) {
4778 m_ContInfo_.index = sIndex;
4779 sOrder = getLatinOneContraction(2, sOrder, source);
4780 sIndex = m_ContInfo_.index;
4783 while (tOrder == 0) {
4784 if (tIndex == tLen) {
4786 return 0; // if both strings are at the end, they are equal
4791 tChar = target.charAt(tIndex++); // [tIndex++];
4792 tOrder = latinOneCEs_[offset + tChar];
4793 if (isSpecial(tOrder)) {
4794 m_ContInfo_.index = tIndex;
4795 tOrder = getLatinOneContraction(2, tOrder, target);
4796 tIndex = m_ContInfo_.index;
4802 if (sOrder == tOrder) {
4807 if (((sOrder ^ tOrder) & 0xff000000) != 0) {
4808 if (sOrder >>> 8 < tOrder >>> 8) {
4823 * Get the version of this collator object.
4825 * @return the version object associated with this collator
4828 public VersionInfo getVersion() {
// Builds a VersionInfo from the collation runtime version, this collator's builder version
// (m_version_) and the UCA version.
4829 /* RunTime version */
4830 int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
4831 /* Builder version */
4832 int bdVersion = m_version_.getMajor();
4835 * Charset Version. Need to get the version from cnv files makeconv should populate cnv files with version and
4836 * an api has to be provided in ucnv.h to obtain this version
4840 /* combine the version info */
// Runtime major is shifted left by 11, builder major by 6, and csVersion sits in the low
// bits; the result is truncated to 16 bits.
4841 int cmbVersion = ((rtVersion << 11) | (bdVersion << 6) | (csVersion)) & 0xFFFF;
4843 /* Tailoring rules */
// Fields of the result: high byte of cmbVersion, low byte of cmbVersion, builder minor
// version, UCA major version.
4844 return VersionInfo.getInstance(cmbVersion >> 8, cmbVersion & 0xFF, m_version_.getMinor(),
4845 UCA_.m_UCA_version_.getMajor());
4847 // versionInfo[0] = (uint8_t)(cmbVersion>>8);
4848 // versionInfo[1] = (uint8_t)cmbVersion;
4849 // versionInfo[2] = coll->image->version[1];
4850 // versionInfo[3] = coll->UCA->image->UCAVersion[0];
4854 * Get the UCA version of this collator object.
4856 * @return the UCA version object associated with this collator
4859 public VersionInfo getUCAVersion() {
// Returns the version object held by the shared UCA_ collator, not a copy.
4860 return UCA_.m_UCA_version_;
4863 private transient boolean m_reallocLatinOneCEs_;
4865 private CollationBuffer collationBuffer;
4867 private final CollationBuffer getCollationBuffer() {
// Hands out this collator's single CollationBuffer, creating it on first use.
4871 if (collationBuffer == null) {
4872 collationBuffer = new CollationBuffer();
// NOTE(review): presumably the reuse branch - an existing buffer is reset before it is
// handed out again; confirm against the full source.
4874 collationBuffer.resetBuffers();
4876 return collationBuffer;
4879 private final void releaseCollationBuffer(CollationBuffer buffer) {
4881 frozenLock.unlock();