3 *******************************************************************************
4 * Copyright (C) 1996-2009, International Business Machines Corporation and *
5 * others. All Rights Reserved. *
6 *******************************************************************************
8 package com.ibm.icu.text;
10 import java.io.IOException;
11 import java.text.CharacterIterator;
12 import java.text.ParseException;
13 import java.util.Arrays;
14 import java.util.MissingResourceException;
16 //#if defined(FOUNDATION10) || defined(J2SE13) || defined(ECLIPSE_FRAGMENT)
17 //##import com.ibm.icu.impl.ByteBuffer;
19 import java.nio.ByteBuffer;
22 import com.ibm.icu.impl.BOCU;
23 import com.ibm.icu.impl.ICUDebug;
24 import com.ibm.icu.impl.ICUResourceBundle;
25 import com.ibm.icu.impl.ImplicitCEGenerator;
26 import com.ibm.icu.impl.IntTrie;
27 import com.ibm.icu.impl.StringUCharacterIterator;
28 import com.ibm.icu.impl.Trie;
29 import com.ibm.icu.impl.TrieIterator;
30 import com.ibm.icu.impl.Utility;
31 import com.ibm.icu.lang.UCharacter;
32 import com.ibm.icu.util.RangeValueIterator;
33 import com.ibm.icu.util.ULocale;
34 import com.ibm.icu.util.UResourceBundle;
35 import com.ibm.icu.util.VersionInfo;
38 * <p>RuleBasedCollator is a concrete subclass of Collator. It allows
39 * customization of the Collator via user-specified rule sets.
40 * RuleBasedCollator is designed to be fully compliant to the <a
41 * href="http://www.unicode.org/unicode/reports/tr10/">Unicode
42 * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>
44 * <p>Users are strongly encouraged to read <a
45 * href="http://www.icu-project.org/userguide/Collate_Intro.html">
46 * the users guide</a> for more information about the collation
47 * service before using this class.</p>
49 * <p>Create a RuleBasedCollator from a locale by calling the
50 * getInstance(Locale) factory method in the base class Collator.
51 * Collator.getInstance(Locale) creates a RuleBasedCollator object
52 * based on the collation rules defined by the argument locale. If a
53 * customized collation ordering ar attributes is required, use the
54 * RuleBasedCollator(String) constructor with the appropriate
55 * rules. The customized RuleBasedCollator will base its ordering on
56 * UCA, while re-adjusting the attributes and orders of the characters
57 * in the specified rule accordingly.</p>
59 * <p>RuleBasedCollator provides correct collation orders for most
60 * locales supported in ICU. If specific data for a locale is not
61 * available, the orders eventually falls back to the <a
62 * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation
65 * <p>For information about the collation rule syntax and details
66 * about customization, please refer to the
67 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
68 * Collation customization</a> section of the user's guide.</p>
70 * <p><strong>Note</strong> that there are some differences between
71 * the Collation rule syntax used in Java and ICU4J:
74 * <li>According to the JDK documentation:
77 * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule
78 * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a
79 * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the
80 * range \U0EC0-\U0EC4 precedes a Lao consonant of the range
81 * \U0E81-\U0EAE then the
82 * vowel is placed after the consonant for collation purposes.
85 * If a rule is without the modifier '!', the Thai/Lao vowel-consonant
86 * swapping is not turned on.
90 * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao
91 * vowel-consonant swapping, since the UCA clearly states that it has to be
92 * supported to ensure a correct sorting order. If a '!' is encountered, it is
95 * <li>As mentioned in the documentation of the base class Collator,
96 * compatibility decomposition mode is not supported.
99 * <strong>Examples</strong>
102 * Creating Customized RuleBasedCollators:
105 * String simple = "& a < b < c < d";
106 * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
108 * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
109 * + "< f , F < g , G < h , H < i , I < j , "
110 * + "J < k , K < l , L < m , M < n , N < "
111 * + "o , O < p , P < q , Q < r , R < s , S < "
112 * + "t , T < u , U < v , V < w , W < x , X "
113 * + "< y , Y < z , Z < \u00E5 = a\u030A "
114 * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
115 * + ", \u00C6 < \u00F8 , \u00D8";
116 * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
120 * Concatenating rules to combine <code>Collator</code>s:
123 * // Create an en_US Collator object
124 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
125 * Collator.getInstance(new Locale("en", "US", ""));
126 * // Create a da_DK Collator object
127 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
128 * Collator.getInstance(new Locale("da", "DK", ""));
130 * // First, get the collation rules from en_USCollator
131 * String en_USRules = en_USCollator.getRules();
132 * // Second, get the collation rules from da_DKCollator
133 * String da_DKRules = da_DKCollator.getRules();
134 * RuleBasedCollator newCollator =
135 * new RuleBasedCollator(en_USRules + da_DKRules);
136 * // newCollator has the combined rules
140 * Making changes to an existing RuleBasedCollator to create a new
141 * <code>Collator</code> object, by appending changes to the existing rule:
144 * // Create a new Collator object with additional rules
145 * String addRules = "& C < ch, cH, Ch, CH";
146 * RuleBasedCollator myCollator =
147 * new RuleBasedCollator(en_USCollator.getRules() + addRules);
148 * // myCollator contains the new rules
152 * How to change the order of non-spacing accents:
155 * // old rule with main accents
156 * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
157 * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
158 * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
159 * + "; \u030B ; \u030C ; \u030D ; \u030E "
160 * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
161 * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
162 * + "< b , B < c, C < e, E & C < d , D";
163 * // change the order of accent characters
164 * String addOn = "& \u0300 ; \u0308 ; \u0302";
165 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
169 * Putting in a new primary ordering before the default setting,
170 * e.g. sort English characters before or after Japanese characters in the Japanese
171 * <code>Collator</code>:
174 * // get en_US Collator rules
175 * RuleBasedCollator en_USCollator
176 * = (RuleBasedCollator)Collator.getInstance(Locale.US);
177 * // add a few Japanese characters to sort before English characters
178 * // suppose the last character before the first base letter 'a' in
179 * // the English collation rule is \u2212
180 * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
182 * RuleBasedCollator myJapaneseCollator
183 * = new RuleBasedCollator(en_USCollator.getRules() + jaString);
188 * This class is not subclassable
190 * @author Syn Wee Quek
193 public final class RuleBasedCollator extends Collator
195 // public constructors ---------------------------------------------------
199 * Constructor that takes the argument rules for
200 * customization. The collator will be based on UCA,
201 * with the attributes and re-ordering of the characters specified in the
204 * <p>See the user guide's section on
205 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
206 * Collation Customization</a> for details on the rule syntax.
208 * @param rules the collation rules to build the collation table from.
209 * @exception ParseException and IOException thrown. ParseException thrown
210 * when argument rules have an invalid syntax. IOException
211 * thrown when an error occured while reading internal data.
214 public RuleBasedCollator(String rules) throws Exception
218 throw new IllegalArgumentException(
219 "Collation rules can not be null");
224 // public methods --------------------------------------------------------
227 * Clones the RuleBasedCollator
228 * @return a new instance of this RuleBasedCollator object
231 public Object clone() throws CloneNotSupportedException
233 RuleBasedCollator result = (RuleBasedCollator)super.clone();
234 if (latinOneCEs_ != null) {
235 result.m_reallocLatinOneCEs_ = true;
236 result.m_ContInfo_ = new ContractionInfo();
239 // since all collation data in the RuleBasedCollator do not change
240 // we can safely assign the result.fields to this collator
241 result.initUtility(false); // let the new clone have their own util
247 * Return a CollationElementIterator for the given String.
248 * @see CollationElementIterator
251 public CollationElementIterator getCollationElementIterator(String source)
253 return new CollationElementIterator(source, this);
257 * Return a CollationElementIterator for the given CharacterIterator.
258 * The source iterator's integrity will be preserved since a new copy
259 * will be created for use.
260 * @see CollationElementIterator
263 public CollationElementIterator getCollationElementIterator(
264 CharacterIterator source)
266 CharacterIterator newsource = (CharacterIterator)source.clone();
267 return new CollationElementIterator(newsource, this);
271 * Return a CollationElementIterator for the given UCharacterIterator.
272 * The source iterator's integrity will be preserved since a new copy
273 * will be created for use.
274 * @see CollationElementIterator
277 public CollationElementIterator getCollationElementIterator(
278 UCharacterIterator source)
280 return new CollationElementIterator(source, this);
283 // public setters --------------------------------------------------------
286 * Sets the Hiragana Quaternary mode to be on or off.
287 * When the Hiragana Quaternary mode is turned on, the collator
288 * positions Hiragana characters before all non-ignorable characters in
289 * QUATERNARY strength. This is to produce a correct JIS collation order,
290 * distinguishing between Katakana and Hiragana characters.
291 * @param flag true if Hiragana Quaternary mode is to be on, false
293 * @see #setHiraganaQuaternaryDefault
294 * @see #isHiraganaQuaternary
297 public void setHiraganaQuaternary(boolean flag)
299 m_isHiragana4_ = flag;
300 updateInternalState();
304 * Sets the Hiragana Quaternary mode to the initial mode set during
305 * construction of the RuleBasedCollator.
306 * See setHiraganaQuaternary(boolean) for more details.
307 * @see #setHiraganaQuaternary(boolean)
308 * @see #isHiraganaQuaternary
311 public void setHiraganaQuaternaryDefault()
313 m_isHiragana4_ = m_defaultIsHiragana4_;
314 updateInternalState();
318 * Sets whether uppercase characters sort before lowercase
319 * characters or vice versa, in strength TERTIARY. The default
320 * mode is false, and so lowercase characters sort before uppercase
322 * If true, sort upper case characters first.
323 * @param upperfirst true to sort uppercase characters before
324 * lowercase characters, false to sort lowercase
325 * characters before uppercase characters
326 * @see #isLowerCaseFirst
327 * @see #isUpperCaseFirst
328 * @see #setLowerCaseFirst
329 * @see #setCaseFirstDefault
332 public void setUpperCaseFirst(boolean upperfirst)
335 if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
336 latinOneRegenTable_ = true;
338 m_caseFirst_ = AttributeValue.UPPER_FIRST_;
341 if(m_caseFirst_ != AttributeValue.OFF_) {
342 latinOneRegenTable_ = true;
344 m_caseFirst_ = AttributeValue.OFF_;
346 updateInternalState();
350 * Sets the orders of lower cased characters to sort before upper cased
351 * characters, in strength TERTIARY. The default
353 * If true is set, the RuleBasedCollator will sort lower cased characters
354 * before the upper cased ones.
355 * Otherwise, if false is set, the RuleBasedCollator will ignore case
357 * @param lowerfirst true for sorting lower cased characters before
358 * upper cased characters, false to ignore case
360 * @see #isLowerCaseFirst
361 * @see #isUpperCaseFirst
362 * @see #setUpperCaseFirst
363 * @see #setCaseFirstDefault
366 public void setLowerCaseFirst(boolean lowerfirst)
369 if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
370 latinOneRegenTable_ = true;
372 m_caseFirst_ = AttributeValue.LOWER_FIRST_;
375 if(m_caseFirst_ != AttributeValue.OFF_) {
376 latinOneRegenTable_ = true;
378 m_caseFirst_ = AttributeValue.OFF_;
380 updateInternalState();
384 * Sets the case first mode to the initial mode set during
385 * construction of the RuleBasedCollator.
386 * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
388 * @see #isLowerCaseFirst
389 * @see #isUpperCaseFirst
390 * @see #setLowerCaseFirst(boolean)
391 * @see #setUpperCaseFirst(boolean)
394 public final void setCaseFirstDefault()
396 if(m_caseFirst_ != m_defaultCaseFirst_) {
397 latinOneRegenTable_ = true;
399 m_caseFirst_ = m_defaultCaseFirst_;
400 updateInternalState();
404 * Sets the alternate handling mode to the initial mode set during
405 * construction of the RuleBasedCollator.
406 * See setAlternateHandling(boolean) for more details.
407 * @see #setAlternateHandlingShifted(boolean)
408 * @see #isAlternateHandlingShifted()
411 public void setAlternateHandlingDefault()
413 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
414 updateInternalState();
418 * Sets the case level mode to the initial mode set during
419 * construction of the RuleBasedCollator.
420 * See setCaseLevel(boolean) for more details.
421 * @see #setCaseLevel(boolean)
425 public void setCaseLevelDefault()
427 m_isCaseLevel_ = m_defaultIsCaseLevel_;
428 updateInternalState();
432 * Sets the decomposition mode to the initial mode set during construction
433 * of the RuleBasedCollator.
434 * See setDecomposition(int) for more details.
435 * @see #getDecomposition
436 * @see #setDecomposition(int)
439 public void setDecompositionDefault()
441 setDecomposition(m_defaultDecomposition_);
442 updateInternalState();
446 * Sets the French collation mode to the initial mode set during
447 * construction of the RuleBasedCollator.
448 * See setFrenchCollation(boolean) for more details.
449 * @see #isFrenchCollation
450 * @see #setFrenchCollation(boolean)
453 public void setFrenchCollationDefault()
455 if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
456 latinOneRegenTable_ = true;
458 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
459 updateInternalState();
463 * Sets the collation strength to the initial mode set during the
464 * construction of the RuleBasedCollator.
465 * See setStrength(int) for more details.
466 * @see #setStrength(int)
470 public void setStrengthDefault()
472 setStrength(m_defaultStrength_);
473 updateInternalState();
477 * Method to set numeric collation to its default value.
478 * When numeric collation is turned on, this Collator generates a collation
479 * key for the numeric value of substrings of digits. This is a way to get
480 * '100' to sort AFTER '2'
481 * @see #getNumericCollation
482 * @see #setNumericCollation
485 public void setNumericCollationDefault()
487 setNumericCollation(m_defaultIsNumericCollation_);
488 updateInternalState();
492 * Sets the mode for the direction of SECONDARY weights to be used in
494 * The default value is false, which treats SECONDARY weights in the order
496 * If set to true, the SECONDARY weights will be sorted backwards.
498 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
499 * French collation</a> for more information.
500 * @param flag true to set the French collation on, false to set it off
502 * @see #isFrenchCollation
503 * @see #setFrenchCollationDefault
505 public void setFrenchCollation(boolean flag)
507 if(m_isFrenchCollation_ != flag) {
508 latinOneRegenTable_ = true;
510 m_isFrenchCollation_ = flag;
511 updateInternalState();
515 * Sets the alternate handling for QUATERNARY strength to be either
516 * shifted or non-ignorable.
517 * See the UCA definition on
518 * <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">
519 * Alternate Weighting</a>.
520 * This attribute will only be effective when QUATERNARY strength is set.
521 * The default value for this mode is false, corresponding to the
522 * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the
523 * RuleBasedCollator will treats all the codepoints with non-ignorable
524 * primary weights in the same way.
525 * If the mode is set to true, the behaviour corresponds to SHIFTED defined
526 * in UCA, this causes codepoints with PRIMARY orders that are equal or
527 * below the variable top value to be ignored in PRIMARY order and
528 * moved to the QUATERNARY order.
529 * @param shifted true if SHIFTED behaviour for alternate handling is
530 * desired, false for the NON_IGNORABLE behaviour.
531 * @see #isAlternateHandlingShifted
532 * @see #setAlternateHandlingDefault
535 public void setAlternateHandlingShifted(boolean shifted)
537 m_isAlternateHandlingShifted_ = shifted;
538 updateInternalState();
543 * When case level is set to true, an additional weight is formed
544 * between the SECONDARY and TERTIARY weight, known as the case level.
545 * The case level is used to distinguish large and small Japanese Kana
546 * characters. Case level could also be used in other situations.
547 * For example to distinguish certain Pinyin characters.
548 * The default value is false, which means the case level is not generated.
549 * The contents of the case level are affected by the case first
550 * mode. A simple way to ignore accent differences in a string is to set
551 * the strength to PRIMARY and enable case level.
555 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
556 * case level</a> for more information.
558 * @param flag true if case level sorting is required, false otherwise
560 * @see #setCaseLevelDefault
563 public void setCaseLevel(boolean flag)
565 m_isCaseLevel_ = flag;
566 updateInternalState();
571 * Sets this Collator's strength property. The strength property
572 * determines the minimum level of difference considered significant
575 * <p>See the Collator class description for an example of use.</p>
576 * @param newStrength the new strength value.
578 * @see #setStrengthDefault
584 * @exception IllegalArgumentException If the new strength value is not one
585 * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
588 public void setStrength(int newStrength)
590 super.setStrength(newStrength);
591 updateInternalState();
596 * Variable top is a two byte primary value which causes all the codepoints
597 * with primary values that are less or equal than the variable top to be
598 * shifted when alternate handling is set to SHIFTED.
601 * Sets the variable top to a collation element value of a string supplied.
603 * @param varTop one or more (if contraction) characters to which the
604 * variable top should be set
605 * @return a int value containing the value of the variable top in upper 16
606 * bits. Lower 16 bits are undefined.
607 * @exception IllegalArgumentException is thrown if varTop argument is not
608 * a valid variable top element. A variable top element is
611 * <li>it is a contraction that does not exist in the
613 * <li>when the PRIMARY strength collation element for the
614 * variable top has more than two bytes
615 * <li>when the varTop argument is null or zero in length.
617 * @see #getVariableTop
618 * @see RuleBasedCollator#setAlternateHandlingShifted
621 public int setVariableTop(String varTop)
623 if (varTop == null || varTop.length() == 0) {
624 throw new IllegalArgumentException(
625 "Variable top argument string can not be null or zero in length.");
627 if (m_srcUtilIter_ == null) {
631 m_srcUtilColEIter_.setText(varTop);
632 int ce = m_srcUtilColEIter_.next();
634 // here we check if we have consumed all characters
635 // you can put in either one character or a contraction
636 // you shouldn't put more...
637 if (m_srcUtilColEIter_.getOffset() != varTop.length()
638 || ce == CollationElementIterator.NULLORDER) {
639 throw new IllegalArgumentException(
640 "Variable top argument string is a contraction that does not exist "
641 + "in the Collation order");
644 int nextCE = m_srcUtilColEIter_.next();
646 if ((nextCE != CollationElementIterator.NULLORDER)
647 && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
648 throw new IllegalArgumentException(
649 "Variable top argument string can only have a single collation "
650 + "element that has less than or equal to two PRIMARY strength "
654 m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
656 return ce & CE_PRIMARY_MASK_;
660 * Sets the variable top to a collation element value supplied.
661 * Variable top is set to the upper 16 bits.
662 * Lower 16 bits are ignored.
663 * @param varTop Collation element value, as returned by setVariableTop or
665 * @see #getVariableTop
666 * @see #setVariableTop(String)
669 public void setVariableTop(int varTop)
671 m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
675 * When numeric collation is turned on, this Collator generates a collation
676 * key for the numeric value of substrings of digits. This is a way to get
677 * '100' to sort AFTER '2'
678 * @param flag true to turn numeric collation on and false to turn it off
679 * @see #getNumericCollation
680 * @see #setNumericCollationDefault
683 public void setNumericCollation(boolean flag)
685 // sort substrings of digits as numbers
686 m_isNumericCollation_ = flag;
687 updateInternalState();
690 // public getters --------------------------------------------------------
693 * Gets the collation rules for this RuleBasedCollator.
694 * Equivalent to String getRules(RuleOption.FULL_RULES).
695 * @return returns the collation rules
696 * @see #getRules(boolean)
699 public String getRules()
705 * Returns current rules. The argument defines whether full rules
706 * (UCA + tailored) rules are returned or just the tailoring.
707 * @param fullrules true if the rules that defines the full set of
708 * collation order is required, otherwise false for returning only
710 * @return the current rules that defines this Collator.
714 public String getRules(boolean fullrules)
719 // take the UCA rules and append real rules at the end
720 return UCA_.m_rules_.concat(m_rules_);
724 * Get an UnicodeSet that contains all the characters and sequences
725 * tailored in this collator.
726 * @return a pointer to a UnicodeSet object containing all the
727 * code points and sequences that may sort differently than
729 * @exception ParseException thrown when argument rules have an
730 * invalid syntax. IOException
733 public UnicodeSet getTailoredSet()
736 CollationRuleParser src = new CollationRuleParser(getRules());
737 return src.getTailoredSet();
738 } catch(Exception e) {
739 throw new IllegalStateException("A tailoring rule should not " +
740 "have errors. Something is quite wrong!");
744 private class contContext {
745 RuleBasedCollator coll;
746 UnicodeSet contractions;
747 UnicodeSet expansions;
748 UnicodeSet removedContractions;
750 contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
751 UnicodeSet removedContractions, boolean addPrefixes) {
753 this.contractions = contractions;
754 this.expansions = expansions;
755 this.removedContractions = removedContractions;
756 this.addPrefixes = addPrefixes;
761 addSpecial(contContext c, StringBuffer buffer, int CE)
763 StringBuffer b = new StringBuffer();
764 int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
765 int newCE = c.coll.m_contractionCE_[offset];
766 // we might have a contraction that ends from previous level
767 if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
768 if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_
769 && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_
771 addSpecial(c, buffer, newCE);
773 if(buffer.length() > 1) {
774 if(c.contractions != null) {
775 c.contractions.add(buffer.toString());
777 if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
778 c.expansions.add(buffer.toString());
784 // check whether we're doing contraction or prefix
785 if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
786 while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
787 b.delete(0, b.length());
789 newCE = c.coll.m_contractionCE_[offset];
790 b.insert(0, c.coll.m_contractionIndex_[offset]);
791 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
792 addSpecial(c, b, newCE);
794 if(c.contractions != null) {
795 c.contractions.add(b.toString());
797 if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
798 c.expansions.add(b.toString());
803 } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
804 while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
805 b.delete(0, b.length());
807 newCE = c.coll.m_contractionCE_[offset];
808 b.append(c.coll.m_contractionIndex_[offset]);
809 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
810 addSpecial(c, b, newCE);
812 if(c.contractions != null) {
813 c.contractions.add(b.toString());
815 if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
816 c.expansions.add(b.toString());
825 void processSpecials(contContext c)
827 int internalBufferSize = 512;
828 TrieIterator trieiterator
829 = new TrieIterator(c.coll.m_trie_);
830 RangeValueIterator.Element element = new RangeValueIterator.Element();
831 while (trieiterator.next(element)) {
832 int start = element.start;
833 int limit = element.limit;
834 int CE = element.value;
835 StringBuffer contraction = new StringBuffer(internalBufferSize);
838 if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
839 while(start < limit) {
840 // if there are suppressed contractions, we don't
842 if(c.removedContractions != null && c.removedContractions.contains(start)) {
846 // we start our contraction from middle, since we don't know if it
847 // will grow toward right or left
848 contraction.append((char) start);
849 addSpecial(c, contraction, CE);
852 } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
853 while(start < limit) {
854 c.expansions.add(start++);
862 * Gets unicode sets containing contractions and/or expansions of a collator
863 * @param contractions if not null, set to contain contractions
864 * @param expansions if not null, set to contain expansions
865 * @param addPrefixes add the prefix contextual elements to contractions
870 getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,
871 boolean addPrefixes) throws Exception {
872 if(contractions != null) {
873 contractions.clear();
875 if(expansions != null) {
878 String rules = getRules();
880 CollationRuleParser src = new CollationRuleParser(rules);
881 contContext c = new contContext(RuleBasedCollator.UCA_,
882 contractions, expansions, src.m_removeSet_, addPrefixes);
884 // Add the UCA contractions
886 // This is collator specific. Add contractions from a collator
888 c.removedContractions = null;
890 } catch (Exception e) {
897 * Get a Collation key for the argument String source from this
901 * General recommendation: <br>
902 * If comparison are to be done to the same String multiple times, it would
903 * be more efficient to generate CollationKeys for the Strings and use
904 * CollationKey.compareTo(CollationKey) for the comparisons.
905 * If the each Strings are compared to only once, using the method
906 * RuleBasedCollator.compare(String, String) will have a better performance.
909 * See the class documentation for an explanation about CollationKeys.
911 * @param source the text String to be transformed into a collation key.
912 * @return the CollationKey for the given String based on this
913 * RuleBasedCollator's collation rules. If the source String is
914 * null, a null CollationKey is returned.
916 * @see #compare(String, String)
917 * @see #getRawCollationKey
920 public CollationKey getCollationKey(String source) {
921 if (source == null) {
924 m_utilRawCollationKey_ = getRawCollationKey(source,
925 m_utilRawCollationKey_);
926 return new CollationKey(source, m_utilRawCollationKey_);
930 * Gets the simpler form of a CollationKey for the String source following
931 * the rules of this Collator and stores the result into the user provided
933 * If key has a internal byte array of length that's too small for the
934 * result, the internal byte array will be grown to the exact required
936 * @param source the text String to be transformed into a RawCollationKey
937 * @param key output RawCollationKey to store results
938 * @return If key is null, a new instance of RawCollationKey will be
939 * created and returned, otherwise the user provided key will be
941 * @see #getCollationKey
942 * @see #compare(String, String)
943 * @see RawCollationKey
946 public RawCollationKey getRawCollationKey(String source,
949 if (source == null) {
952 int strength = getStrength();
953 m_utilCompare0_ = m_isCaseLevel_;
954 //m_utilCompare1_ = true;
955 m_utilCompare2_ = strength >= SECONDARY;
956 m_utilCompare3_ = strength >= TERTIARY;
957 m_utilCompare4_ = strength >= QUATERNARY;
958 m_utilCompare5_ = strength == IDENTICAL;
960 m_utilBytesCount0_ = 0;
961 m_utilBytesCount1_ = 0;
962 m_utilBytesCount2_ = 0;
963 m_utilBytesCount3_ = 0;
964 m_utilBytesCount4_ = 0;
965 //m_utilBytesCount5_ = 0;
972 boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
973 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
974 // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
976 int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
978 if (m_isHiragana4_ && m_utilCompare4_) {
979 // allocate one more space for hiragana, value for hiragana
980 hiragana4 = (byte)commonBottom4;
984 int bottomCount4 = 0xFF - commonBottom4;
985 // If we need to normalize, we'll do it all at once at the beginning!
986 if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)
988 // if it is identical strength, we have to normalize the string to
989 // NFD so that it will be appended correctly to the end of the sort
991 source = Normalizer.decompose(source, false);
993 else if (getDecomposition() != NO_DECOMPOSITION
994 && Normalizer.quickCheck(source, Normalizer.FCD,0)
996 // for the rest of the strength, if decomposition is on, FCD is
997 // enough for us to work on.
998 source = Normalizer.normalize(source,Normalizer.FCD);
1000 getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
1003 key = new RawCollationKey();
1005 getSortKey(source, doFrench, commonBottom4, bottomCount4, key);
1010 * Return true if an uppercase character is sorted before the corresponding lowercase character.
1011 * See setCaseFirst(boolean) for details.
1012 * @see #setUpperCaseFirst
1013 * @see #setLowerCaseFirst
1014 * @see #isLowerCaseFirst
1015 * @see #setCaseFirstDefault
1016 * @return true if upper cased characters are sorted before lower cased
1017 * characters, false otherwise
1020 public boolean isUpperCaseFirst()
1022 return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
1026 * Return true if a lowercase character is sorted before the corresponding uppercase character.
1027 * See setCaseFirst(boolean) for details.
1028 * @see #setUpperCaseFirst
1029 * @see #setLowerCaseFirst
1030 * @see #isUpperCaseFirst
1031 * @see #setCaseFirstDefault
1032 * @return true lower cased characters are sorted before upper cased
1033 * characters, false otherwise
1036 public boolean isLowerCaseFirst()
1038 return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
1042 * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
1044 * If return value is true, then the alternate handling attribute for the
1045 * Collator is SHIFTED. Otherwise if return value is false, then the
1046 * alternate handling attribute for the Collator is NON_IGNORABLE
1047 * See setAlternateHandlingShifted(boolean) for more details.
1048 * @return true or false
1049 * @see #setAlternateHandlingShifted(boolean)
1050 * @see #setAlternateHandlingDefault
1053 public boolean isAlternateHandlingShifted()
1055 return m_isAlternateHandlingShifted_;
1059 * Checks if case level is set to true.
1060 * See setCaseLevel(boolean) for details.
1061 * @return the case level mode
1062 * @see #setCaseLevelDefault
1064 * @see #setCaseLevel(boolean)
1067 public boolean isCaseLevel()
1069 return m_isCaseLevel_;
1073 * Checks if French Collation is set to true.
1074 * See setFrenchCollation(boolean) for details.
1075 * @return true if French Collation is set to true, false otherwise
1076 * @see #setFrenchCollation(boolean)
1077 * @see #setFrenchCollationDefault
1080 public boolean isFrenchCollation()
1082 return m_isFrenchCollation_;
1086 * Checks if the Hiragana Quaternary mode is set on.
1087 * See setHiraganaQuaternary(boolean) for more details.
1088 * @return flag true if Hiragana Quaternary mode is on, false otherwise
1089 * @see #setHiraganaQuaternaryDefault
1090 * @see #setHiraganaQuaternary(boolean)
1093 public boolean isHiraganaQuaternary()
1095 return m_isHiragana4_;
1099 * Gets the variable top value of a Collator.
1100 * Lower 16 bits are undefined and should be ignored.
1101 * @return the variable top value of a Collator.
1102 * @see #setVariableTop
1105 public int getVariableTop()
1107 return m_variableTopValue_ << 16;
1111 * Method to retrieve the numeric collation value.
1112 * When numeric collation is turned on, this Collator generates a collation
1113 * key for the numeric value of substrings of digits. This is a way to get
1114 * '100' to sort AFTER '2'
1115 * @see #setNumericCollation
1116 * @see #setNumericCollationDefault
1117 * @return true if numeric collation is turned on, false otherwise
1120 public boolean getNumericCollation()
1122 return m_isNumericCollation_;
1125 // public other methods -------------------------------------------------
1128 * Compares the equality of two RuleBasedCollator objects.
1129 * RuleBasedCollator objects are equal if they have the same collation
1130 * rules and the same attributes.
1131 * @param obj the RuleBasedCollator to be compared to.
1132 * @return true if this RuleBasedCollator has exactly the same
1133 * collation behaviour as obj, false otherwise.
1136 public boolean equals(Object obj)
1139 return false; // super does class check
1144 if (getClass() != obj.getClass()) {
1147 RuleBasedCollator other = (RuleBasedCollator)obj;
1148 // all other non-transient information is also contained in rules.
1149 if (getStrength() != other.getStrength()
1150 || getDecomposition() != other.getDecomposition()
1151 || other.m_caseFirst_ != m_caseFirst_
1152 || other.m_caseSwitch_ != m_caseSwitch_
1153 || other.m_isAlternateHandlingShifted_
1154 != m_isAlternateHandlingShifted_
1155 || other.m_isCaseLevel_ != m_isCaseLevel_
1156 || other.m_isFrenchCollation_ != m_isFrenchCollation_
1157 || other.m_isHiragana4_ != m_isHiragana4_) {
1160 boolean rules = m_rules_ == other.m_rules_;
1161 if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
1162 rules = m_rules_.equals(other.m_rules_);
1164 if (!rules || !ICUDebug.enabled("collation")) {
1167 if (m_addition3_ != other.m_addition3_
1168 || m_bottom3_ != other.m_bottom3_
1169 || m_bottomCount3_ != other.m_bottomCount3_
1170 || m_common3_ != other.m_common3_
1171 || m_isSimple3_ != other.m_isSimple3_
1172 || m_mask3_ != other.m_mask3_
1173 || m_minContractionEnd_ != other.m_minContractionEnd_
1174 || m_minUnsafe_ != other.m_minUnsafe_
1175 || m_top3_ != other.m_top3_
1176 || m_topCount3_ != other.m_topCount3_
1177 || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
1180 if (!m_trie_.equals(other.m_trie_)) {
1181 // we should use the trie iterator here, but then this part is
1182 // only used in the test.
1183 for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)
1185 int v = m_trie_.getCodePointValue(i);
1186 int otherv = other.m_trie_.getCodePointValue(i);
1188 int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
1189 if (mask == (otherv & 0xff000000)) {
1192 if (mask == 0xf1000000) {
1193 v -= (m_expansionOffset_ << 4);
1194 otherv -= (other.m_expansionOffset_ << 4);
1196 else if (mask == 0xf2000000) {
1197 v -= m_contractionOffset_;
1198 otherv -= other.m_contractionOffset_;
1208 if (Arrays.equals(m_contractionCE_, other.m_contractionCE_)
1209 && Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
1210 && Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
1211 && Arrays.equals(m_expansion_, other.m_expansion_)
1212 && Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
1213 // not comparing paddings
1214 for (int i = 0; i < m_expansionEndCE_.length; i ++) {
1215 if (m_expansionEndCEMaxSize_[i]
1216 != other.m_expansionEndCEMaxSize_[i]) {
1226 * Generates a unique hash code for this RuleBasedCollator.
1227 * @return the unique hash code for this Collator
1230 public int hashCode()
1232 String rules = getRules();
1233 if (rules == null) {
1236 return rules.hashCode();
1240 * Compares the source text String to the target text String according to
1241 * the collation rules, strength and decomposition mode for this
1242 * RuleBasedCollator.
1243 * Returns an integer less than,
1244 * equal to or greater than zero depending on whether the source String is
1245 * less than, equal to or greater than the target String. See the Collator
1246 * class description for an example of use.
1249 * General recommendation: <br>
1250 * If comparison are to be done to the same String multiple times, it would
1251 * be more efficient to generate CollationKeys for the Strings and use
1252 * CollationKey.compareTo(CollationKey) for the comparisons.
1253 * If speed performance is critical and object instantiation is to be
1254 * reduced, further optimization may be achieved by generating a simpler
1255 * key of the form RawCollationKey and reusing this RawCollationKey
1256 * object with the method RuleBasedCollator.getRawCollationKey. Internal
1257 * byte representation can be directly accessed via RawCollationKey and
1258 * stored for future use. Like CollationKey, RawCollationKey provides a
1259 * method RawCollationKey.compareTo for key comparisons.
1260 * If the each Strings are compared to only once, using the method
1261 * RuleBasedCollator.compare(String, String) will have a better performance.
1263 * @param source the source text String.
1264 * @param target the target text String.
1265 * @return Returns an integer value. Value is less than zero if source is
1266 * less than target, value is zero if source and target are equal,
1267 * value is greater than zero if source is greater than target.
1269 * @see #getCollationKey
1272 public int compare(String source, String target)
1274 if (source == target) {
1278 // Find the length of any leading portion that is equal
1279 int offset = getFirstUnmatchedOffset(source, target);
1280 //return compareRegular(source, target, offset);
1282 if ((offset < source.length()
1283 && source.charAt(offset) > ENDOFLATINONERANGE_)
1284 || (offset < target.length()
1285 && target.charAt(offset) > ENDOFLATINONERANGE_)) {
1286 // source or target start with non-latin-1
1287 return compareRegular(source, target, offset);
1289 return compareUseLatin1(source, target, offset);
1292 return compareRegular(source, target, offset);
1296 // package private inner interfaces --------------------------------------
1299 * Attribute values to be used when setting the Collator options
1301 static interface AttributeValue
1304 * Indicates that the default attribute value will be used.
1305 * See individual attribute for details on its default value.
1307 static final int DEFAULT_ = -1;
1309 * Primary collation strength
1311 static final int PRIMARY_ = Collator.PRIMARY;
1313 * Secondary collation strength
1315 static final int SECONDARY_ = Collator.SECONDARY;
1317 * Tertiary collation strength
1319 static final int TERTIARY_ = Collator.TERTIARY;
1321 * Default collation strength
1323 static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
1325 * Internal use for strength checks in Collation elements
1327 static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
1329 * Quaternary collation strength
1331 static final int QUATERNARY_ = 3;
1333 * Identical collation strength
1335 static final int IDENTICAL_ = Collator.IDENTICAL;
1337 * Internal use for strength checks
1339 static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
1341 * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
1342 * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1344 static final int OFF_ = 16;
1346 * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
1347 * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
1349 static final int ON_ = 17;
1351 * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
1353 static final int SHIFTED_ = 20;
1355 * Valid for ALTERNATE_HANDLING. Alternate handling will be non
1358 static final int NON_IGNORABLE_ = 21;
1360 * Valid for CASE_FIRST - lower case sorts before upper case
1362 static final int LOWER_FIRST_ = 24;
1364 * Upper case sorts before lower case
1366 static final int UPPER_FIRST_ = 25;
1368 * Number of attribute values
1370 static final int LIMIT_ = 29;
1374 * Attributes that collation service understands. All the attributes can
1375 * take DEFAULT value, as well as the values specific to each one.
1377 static interface Attribute
1380 * Attribute for direction of secondary weights - used in French.
1381 * Acceptable values are ON, which results in secondary weights being
1382 * considered backwards and OFF which treats secondary weights in the
1383 * order they appear.
1385 static final int FRENCH_COLLATION_ = 0;
1387 * Attribute for handling variable elements. Acceptable values are
1388 * NON_IGNORABLE (default) which treats all the codepoints with
1389 * non-ignorable primary weights in the same way, and SHIFTED which
1390 * causes codepoints with primary weights that are equal or below the
1391 * variable top value to be ignored on primary level and moved to the
1394 static final int ALTERNATE_HANDLING_ = 1;
1396 * Controls the ordering of upper and lower case letters. Acceptable
1397 * values are OFF (default), which orders upper and lower case letters
1398 * in accordance to their tertiary weights, UPPER_FIRST which forces
1399 * upper case letters to sort before lower case letters, and
1400 * LOWER_FIRST which does the opposite.
1402 static final int CASE_FIRST_ = 2;
1404 * Controls whether an extra case level (positioned before the third
1405 * level) is generated or not. Acceptable values are OFF (default),
1406 * when case level is not generated, and ON which causes the case
1407 * level to be generated. Contents of the case level are affected by
1408 * the value of CASE_FIRST attribute. A simple way to ignore accent
1409 * differences in a string is to set the strength to PRIMARY and
1410 * enable case level.
1412 static final int CASE_LEVEL_ = 3;
1414 * Controls whether the normalization check and necessary
1415 * normalizations are performed. When set to OFF (default) no
1416 * normalization check is performed. The correctness of the result is
1417 * guaranteed only if the input data is in so-called FCD form (see
1418 * users manual for more info). When set to ON, an incremental check
1419 * is performed to see whether the input data is in the FCD form. If
1420 * the data is not in the FCD form, incremental NFD normalization is
1423 static final int NORMALIZATION_MODE_ = 4;
1425 * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
1426 * QUATERNARY or IDENTICAL. The usual strength for most locales
1427 * (except Japanese) is tertiary. Quaternary strength is useful when
1428 * combined with shifted setting for alternate handling attribute and
1429 * for JIS x 4061 collation, when it is used to distinguish between
1430 * Katakana and Hiragana (this is achieved by setting the
1431 * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is
1432 * affected only by the number of non ignorable code points in the
1433 * string. Identical strength is rarely useful, as it amounts to
1434 * codepoints of the NFD form of the string.
1436 static final int STRENGTH_ = 5;
1438 * When turned on, this attribute positions Hiragana before all
1439 * non-ignorables on quaternary level. This is a sneaky way to produce
1442 static final int HIRAGANA_QUATERNARY_MODE_ = 6;
1446 static final int LIMIT_ = 7;
1450 * DataManipulate singleton
1452 static class DataManipulate implements Trie.DataManipulate
1454 // public methods ----------------------------------------------------
1457 * Internal method called to parse a lead surrogate's ce for the offset
1458 * to the next trail surrogate data.
1459 * @param ce collation element of the lead surrogate
1460 * @return data offset or 0 for the next trail surrogate
1463 public final int getFoldingOffset(int ce)
1465 if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
1466 return (ce & 0xFFFFFF);
1472 * Get singleton object
1474 public static final DataManipulate getInstance()
1476 if (m_instance_ == null) {
1477 m_instance_ = new DataManipulate();
1482 // private data member ----------------------------------------------
1485 * Singleton instance
1487 private static DataManipulate m_instance_;
1489 // private constructor ----------------------------------------------
1492 * private to prevent initialization
1494 private DataManipulate()
1502 static final class UCAConstants
1504 int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1505 int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
1506 int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
1507 int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
1508 int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
1509 int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
1510 int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
1511 int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
1512 int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
1513 int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
1514 int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
1515 int FIRST_IMPLICIT_[] = new int[2];
1516 int LAST_IMPLICIT_[] = new int[2];
1517 int FIRST_TRAILING_[] = new int[2];
1518 int LAST_TRAILING_[] = new int[2];
1519 int PRIMARY_TOP_MIN_;
1520 int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
1521 int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
1522 int PRIMARY_TRAILING_MIN_; // 0xE8000000
1523 int PRIMARY_TRAILING_MAX_; // 0xF0000000
1524 int PRIMARY_SPECIAL_MIN_; // 0xE8000000
1525 int PRIMARY_SPECIAL_MAX_; // 0xF0000000
1528 // package private data member -------------------------------------------
1530 static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
1531 static final byte BYTE_COMMON_ = (byte)0x05;
1532 static final int COMMON_TOP_2_ = 0x86; // int for unsigness
1533 static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
1534 static final int COMMON_BOTTOM_3 = 0x05;
1536 * Case strength mask
1538 static final int CE_CASE_BIT_MASK_ = 0xC0;
1539 static final int CE_TAG_SHIFT_ = 24;
1540 static final int CE_TAG_MASK_ = 0x0F000000;
1542 static final int CE_SPECIAL_FLAG_ = 0xF0000000;
1544 * Lead surrogate that is tailored and doesn't start a contraction
1546 static final int CE_SURROGATE_TAG_ = 5;
1548 * Mask to get the primary strength of the collation element
1550 static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
1552 * Mask to get the secondary strength of the collation element
1554 static final int CE_SECONDARY_MASK_ = 0xFF00;
1556 * Mask to get the tertiary strength of the collation element
1558 static final int CE_TERTIARY_MASK_ = 0xFF;
1560 * Primary strength shift
1562 static final int CE_PRIMARY_SHIFT_ = 16;
1564 * Secondary strength shift
1566 static final int CE_SECONDARY_SHIFT_ = 8;
1568 * Continuation marker
1570 static final int CE_CONTINUATION_MARKER_ = 0xC0;
1573 * Size of collator raw data headers and options before the expansion
1574 * data. This is used when expansion ces are to be retrieved. ICU4C uses
1575 * the expansion offset starting from UCollator.UColHeader, hence ICU4J
1576 * will have to minus that off to get the right expansion ce offset. In
1579 int m_expansionOffset_;
1581 * Size of collator raw data headers, options and expansions before
1582 * contraction data. This is used when contraction ces are to be retrieved.
1583 * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
1584 * ICU4J will have to minus that off to get the right contraction ce
1585 * offset. In number of chars.
1587 int m_contractionOffset_;
1589 * Flag indicator if Jamo is special
1591 boolean m_isJamoSpecial_;
1593 // Collator options ------------------------------------------------------
1595 int m_defaultVariableTopValue_;
1596 boolean m_defaultIsFrenchCollation_;
1597 boolean m_defaultIsAlternateHandlingShifted_;
1598 int m_defaultCaseFirst_;
1599 boolean m_defaultIsCaseLevel_;
1600 int m_defaultDecomposition_;
1601 int m_defaultStrength_;
1602 boolean m_defaultIsHiragana4_;
1603 boolean m_defaultIsNumericCollation_;
1606 * Value of the variable top
1608 int m_variableTopValue_;
1610 * Attribute for special Hiragana
1612 boolean m_isHiragana4_;
1614 * Case sorting customization
1618 * Numeric collation option
1620 boolean m_isNumericCollation_;
1622 // end Collator options --------------------------------------------------
1629 * Contraction index table
1631 char m_contractionIndex_[];
1633 * Contraction CE table
1635 int m_contractionCE_[];
1641 * Table to store all collation elements that are the last element of an
1642 * expansion. This is for use in StringSearch.
1644 int m_expansionEndCE_[];
1646 * Table to store the maximum size of any expansions that end with the
1647 * corresponding collation element in m_expansionEndCE_. For use in
1650 byte m_expansionEndCEMaxSize_[];
1652 * Heuristic table to store information on whether a char character is
1653 * considered "unsafe". "Unsafe" character are combining marks or those
1654 * belonging to some contraction sequence from the offset 1 onwards.
1655 * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
1656 * unsafe. If we have another contraction "ZA" with the one above, then
1657 * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1661 * Table to store information on whether a codepoint can occur as the last
1662 * character in a contraction
1664 byte m_contractionEnd_[];
1666 * Original collation rules
1670 * The smallest "unsafe" codepoint
1674 * The smallest codepoint that could be the end of a contraction
1676 char m_minContractionEnd_;
1678 * General version of the collator
1680 VersionInfo m_version_;
1684 VersionInfo m_UCA_version_;
1688 VersionInfo m_UCD_version_;
1691 * UnicodeData.txt property object
1693 static final RuleBasedCollator UCA_;
1697 static final UCAConstants UCA_CONSTANTS_;
1699 * Table for UCA and builder use
1701 static final char UCA_CONTRACTIONS_[];
1703 private static boolean UCA_INIT_COMPLETE;
1706 * Implicit generator
1708 static final ImplicitCEGenerator impCEGen_;
1710 // * Implicit constants
1712 // static final int IMPLICIT_BASE_BYTE_;
1713 // static final int IMPLICIT_LIMIT_BYTE_;
1714 // static final int IMPLICIT_4BYTE_BOUNDARY_;
1715 // static final int LAST_MULTIPLIER_;
1716 // static final int LAST2_MULTIPLIER_;
1717 // static final int IMPLICIT_BASE_3BYTE_;
1718 // static final int IMPLICIT_BASE_4BYTE_;
1719 // static final int BYTES_TO_AVOID_ = 3;
1720 // static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
1721 // static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
1723 // * Room for intervening, without expanding to 5 bytes
1725 // static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
1726 // static final int IMPLICIT_3BYTE_COUNT_ = 1;
1728 static final byte SORT_LEVEL_TERMINATOR_ = 1;
1730 // These are values from UCA required for
1731 // implicit generation and supressing sort key compression
1732 // they should regularly be in the UCA, but if one
1733 // is running without UCA, it could be a problem
1734 static final int maxRegularPrimary = 0xA0;
1735 static final int minImplicitPrimary = 0xE0;
1736 static final int maxImplicitPrimary = 0xE4;
1739 // block to initialise character property database
1742 // take pains to let static class init succeed, otherwise the class itself won't exist and
1743 // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
1744 // we can't load the UCA data.
1746 RuleBasedCollator iUCA_ = null;
1747 UCAConstants iUCA_CONSTANTS_ = null;
1748 char iUCA_CONTRACTIONS_[] = null;
1749 ImplicitCEGenerator iimpCEGen_ = null;
1752 // !!! note what's going on here...
1753 // even though the static init of the class is not yet complete, we
1754 // instantiate an instance of the class. So we'd better be sure that
1755 // instantiation doesn't rely on the static initialization that's
1756 // not complete yet!
1757 iUCA_ = new RuleBasedCollator();
1758 iUCA_CONSTANTS_ = new UCAConstants();
1759 iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);
1761 // called before doing canonical closure for the UCA.
1762 iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
1763 //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
1765 ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
1766 iUCA_.m_rules_ = (String)rb.getObject("UCARules");
1768 catch (MissingResourceException ex)
1772 catch (IOException e)
1774 // e.printStackTrace();
1775 // throw new MissingResourceException(e.getMessage(),"","");
1779 UCA_CONSTANTS_ = iUCA_CONSTANTS_;
1780 UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
1781 impCEGen_ = iimpCEGen_;
1783 UCA_INIT_COMPLETE = true;
1787 private static void checkUCA() throws MissingResourceException {
1788 if (UCA_INIT_COMPLETE && UCA_ == null) {
1789 throw new MissingResourceException("Collator UCA data unavailable", "", "");
1793 // package private constructors ------------------------------------------
1796 * <p>Private contructor for use by subclasses.
1797 * Public access to creating Collators is handled by the API
1798 * Collator.getInstance() or RuleBasedCollator(String rules).
1801 * This constructor constructs the UCA collator internally
1811 * Constructors a RuleBasedCollator from the argument locale.
1812 * If no resource bundle is associated with the locale, UCA is used
1816 RuleBasedCollator(ULocale locale)
1819 ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
1823 // Use keywords, if supplied for lookup
1824 String collkey = locale.getKeywordValue("collation");
1825 if(collkey == null) {
1826 collkey = rb.getStringWithFallback("collations/default");
1829 // collations/default will always give a string back
1830 // keyword for the real collation data
1831 // if "collations/collkey" will return null if collkey == null
1832 ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
1833 if (elements != null) {
1834 // TODO: Determine actual & valid locale correctly
1835 ULocale uloc = rb.getULocale();
1836 setLocale(uloc, uloc);
1838 m_rules_ = elements.getString("Sequence");
1839 ByteBuffer buf = elements.get("%%CollationBin").getBinary();
1842 // m_rules_ = (String)rules[1][1];
1843 byte map[] = buf.array();
1844 CollatorReader.initRBC(this, map);
1846 BufferedInputStream input =
1847 new BufferedInputStream(
1848 new ByteArrayInputStream(map));
1850 CollatorReader reader = new CollatorReader(input, false);
1851 if (map.length > MIN_BINARY_DATA_SIZE_) {
1852 reader.read(this, null);
1855 reader.readHeader(this);
1856 reader.readOptions(this);
1857 // duplicating UCA_'s data
1861 // at this point, we have read in the collator
1862 // now we need to check whether the binary image has
1863 // the right UCA and other versions
1864 if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||
1865 !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
1878 catch (Exception e) {
1879 // e.printStackTrace();
1880 // if failed use UCA.
1886 // package private methods -----------------------------------------------
1889 * Sets this collator to use the tables in UCA. Note options not taken
1892 final void setWithUCATables()
1894 m_contractionOffset_ = UCA_.m_contractionOffset_;
1895 m_expansionOffset_ = UCA_.m_expansionOffset_;
1896 m_expansion_ = UCA_.m_expansion_;
1897 m_contractionIndex_ = UCA_.m_contractionIndex_;
1898 m_contractionCE_ = UCA_.m_contractionCE_;
1899 m_trie_ = UCA_.m_trie_;
1900 m_expansionEndCE_ = UCA_.m_expansionEndCE_;
1901 m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
1902 m_unsafe_ = UCA_.m_unsafe_;
1903 m_contractionEnd_ = UCA_.m_contractionEnd_;
1904 m_minUnsafe_ = UCA_.m_minUnsafe_;
1905 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
1909 * Sets this collator to use the all options and tables in UCA.
1911 final void setWithUCAData()
1913 latinOneFailed_ = true;
1915 m_addition3_ = UCA_.m_addition3_;
1916 m_bottom3_ = UCA_.m_bottom3_;
1917 m_bottomCount3_ = UCA_.m_bottomCount3_;
1918 m_caseFirst_ = UCA_.m_caseFirst_;
1919 m_caseSwitch_ = UCA_.m_caseSwitch_;
1920 m_common3_ = UCA_.m_common3_;
1921 m_contractionOffset_ = UCA_.m_contractionOffset_;
1922 setDecomposition(UCA_.getDecomposition());
1923 m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
1924 m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
1925 m_defaultIsAlternateHandlingShifted_
1926 = UCA_.m_defaultIsAlternateHandlingShifted_;
1927 m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
1928 m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
1929 m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
1930 m_defaultStrength_ = UCA_.m_defaultStrength_;
1931 m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
1932 m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
1933 m_expansionOffset_ = UCA_.m_expansionOffset_;
1934 m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
1935 m_isCaseLevel_ = UCA_.m_isCaseLevel_;
1936 m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
1937 m_isHiragana4_ = UCA_.m_isHiragana4_;
1938 m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
1939 m_isSimple3_ = UCA_.m_isSimple3_;
1940 m_mask3_ = UCA_.m_mask3_;
1941 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
1942 m_minUnsafe_ = UCA_.m_minUnsafe_;
1943 m_rules_ = UCA_.m_rules_;
1944 setStrength(UCA_.getStrength());
1945 m_top3_ = UCA_.m_top3_;
1946 m_topCount3_ = UCA_.m_topCount3_;
1947 m_variableTopValue_ = UCA_.m_variableTopValue_;
1948 m_isNumericCollation_ = UCA_.m_isNumericCollation_;
1950 latinOneFailed_ = false;
1954 * Test whether a char character is potentially "unsafe" for use as a
1955 * collation starting point. "Unsafe" characters are combining marks or
1956 * those belonging to some contraction sequence from the offset 1 onwards.
1957 * E.g. if "ABC" is the only contraction, then 'B' and
1958 * 'C' are considered unsafe. If we have another contraction "ZA" with
1959 * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
1960 * @param ch character to determin
1961 * @return true if ch is unsafe, false otherwise
1963 final boolean isUnsafe(char ch)
1965 if (ch < m_minUnsafe_) {
1969 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
1970 if (UTF16.isLeadSurrogate(ch)
1971 || UTF16.isTrailSurrogate(ch)) {
1972 // Trail surrogate are always considered unsafe.
1975 ch &= HEURISTIC_OVERFLOW_MASK_;
1976 ch += HEURISTIC_OVERFLOW_OFFSET_;
1978 int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
1979 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
1983 * Approximate determination if a char character is at a contraction end.
1984 * Guaranteed to be true if a character is at the end of a contraction,
1985 * otherwise it is not deterministic.
1986 * @param ch character to be determined
1988 final boolean isContractionEnd(char ch)
1990 if (UTF16.isTrailSurrogate(ch)) {
1994 if (ch < m_minContractionEnd_) {
1998 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
1999 ch &= HEURISTIC_OVERFLOW_MASK_;
2000 ch += HEURISTIC_OVERFLOW_OFFSET_;
2002 int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
2003 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
2007 * Retrieve the tag of a special ce
2008 * @param ce ce to test
2011 static int getTag(int ce)
2013 return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
2017 * Checking if ce is special
2018 * @param ce to check
2019 * @return true if ce is special
2021 static boolean isSpecial(int ce)
2023 return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
2027 * Checks if the argument ce is a continuation
2028 * @param ce collation element to test
2029 * @return true if ce is a continuation
2031 static final boolean isContinuation(int ce)
2033 return ce != CollationElementIterator.NULLORDER
2034 && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
2037 // private inner classes ------------------------------------------------
2039 // private variables -----------------------------------------------------
2042 * The smallest natural unsafe or contraction end char character before
2044 * This is a combining mark.
2046 private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
2048 * Heuristic table table size. Size is 32 bytes, 1 bit for each
2049 * latin 1 char, and some power of two for hashing the rest of the chars.
2052 private static final char HEURISTIC_SIZE_ = 1056;
2054 * Mask value down to "some power of two" - 1,
2055 * number of bits, not num of bytes.
2057 private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
2059 * Unsafe character shift
2061 private static final int HEURISTIC_SHIFT_ = 3;
2063 * Unsafe character addition for character too large, it has to be folded
2066 private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
2068 * Mask value to get offset in heuristic table.
2070 private static final char HEURISTIC_MASK_ = 7;
2072 private int m_caseSwitch_;
2073 private int m_common3_;
2074 private int m_mask3_;
2076 * When switching case, we need to add or subtract different values.
2078 private int m_addition3_;
2080 * Upper range when compressing
2082 private int m_top3_;
2084 * Upper range when compressing
2086 private int m_bottom3_;
2087 private int m_topCount3_;
2088 private int m_bottomCount3_;
2090 * Case first constants
2092 private static final int CASE_SWITCH_ = 0xC0;
2093 private static final int NO_CASE_SWITCH_ = 0;
2095 * Case level constants
2097 private static final int CE_REMOVE_CASE_ = 0x3F;
2098 private static final int CE_KEEP_CASE_ = 0xFF;
2100 * Case strength mask
2102 private static final int CE_CASE_MASK_3_ = 0xFF;
2104 * Sortkey size factor. Values can be changed.
2106 private static final double PROPORTION_2_ = 0.5;
2107 private static final double PROPORTION_3_ = 0.667;
2109 // These values come from the UCA ----------------------------------------
2112 * This is an enum that lists magic special byte values from the
2115 //private static final byte BYTE_ZERO_ = 0x0;
2116 //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
2117 //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
2118 private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
2119 /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
2120 //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
2121 static final byte CODAN_PLACEHOLDER = 0x27;
2122 //private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C;
2123 private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D;
2124 private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
2125 private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
2126 private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
2127 private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
2128 private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
2129 private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
2130 private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
2131 private static final int COMMON_BOTTOM_3_ = 0x05;
2132 private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
2133 private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
2135 private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
2136 private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
2137 private static final int COMMON_2_ = COMMON_BOTTOM_2_;
2138 private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
2139 private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
2140 //private static final int COMMON_4_ = (byte)0xFF;
2145 * Minimum size required for the binary collation data in bytes.
2146 * Size of UCA header + size of options to 4 bytes
2148 //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
2151 * If this collator is to generate only simple tertiaries for fast path
2153 private boolean m_isSimple3_;
2156 * French collation sorting flag
2158 private boolean m_isFrenchCollation_;
2160 * Flag indicating if shifted is requested for Quaternary alternate
2161 * handling. If this is not true, the default for alternate handling will
2164 private boolean m_isAlternateHandlingShifted_;
2166 * Extra case level for sorting
2168 private boolean m_isCaseLevel_;
2170 private static final int SORT_BUFFER_INIT_SIZE_ = 128;
2171 private static final int SORT_BUFFER_INIT_SIZE_1_ =
2172 SORT_BUFFER_INIT_SIZE_ << 3;
2173 private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
2174 private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
2175 private static final int SORT_BUFFER_INIT_SIZE_CASE_ =
2176 SORT_BUFFER_INIT_SIZE_ >> 2;
2177 private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
2179 private static final int CE_CONTINUATION_TAG_ = 0xC0;
2180 private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
2182 private static final int LAST_BYTE_MASK_ = 0xFF;
2184 //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
2185 //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
2187 private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;
2188 private static final byte SORT_CASE_SHIFT_START_ = (byte)7;
2193 private static final int CE_BUFFER_SIZE_ = 512;
2195 // variables for Latin-1 processing
2196 boolean latinOneUse_ = false;
2197 boolean latinOneRegenTable_ = false;
2198 boolean latinOneFailed_ = false;
2200 int latinOneTableLen_ = 0;
2201 int latinOneCEs_[] = null;
2203 * Bunch of utility iterators
2205 private StringUCharacterIterator m_srcUtilIter_;
2206 private CollationElementIterator m_srcUtilColEIter_;
2207 private StringUCharacterIterator m_tgtUtilIter_;
2208 private CollationElementIterator m_tgtUtilColEIter_;
2210 * Utility comparison flags
2212 private boolean m_utilCompare0_;
2213 //private boolean m_utilCompare1_;
2214 private boolean m_utilCompare2_;
2215 private boolean m_utilCompare3_;
2216 private boolean m_utilCompare4_;
2217 private boolean m_utilCompare5_;
2219 * Utility byte buffer
2221 private byte m_utilBytes0_[];
2222 private byte m_utilBytes1_[];
2223 private byte m_utilBytes2_[];
2224 private byte m_utilBytes3_[];
2225 private byte m_utilBytes4_[];
2226 //private byte m_utilBytes5_[];
2227 private RawCollationKey m_utilRawCollationKey_;
2229 private int m_utilBytesCount0_;
2230 private int m_utilBytesCount1_;
2231 private int m_utilBytesCount2_;
2232 private int m_utilBytesCount3_;
2233 private int m_utilBytesCount4_;
2234 //private int m_utilBytesCount5_;
2235 //private int m_utilCount0_;
2236 //private int m_utilCount1_;
2237 private int m_utilCount2_;
2238 private int m_utilCount3_;
2239 private int m_utilCount4_;
2240 //private int m_utilCount5_;
2242 private int m_utilFrenchStart_;
2243 private int m_utilFrenchEnd_;
2246 * Preparing the CE buffers. will be filled during the primary phase
2248 private int m_srcUtilCEBuffer_[];
2249 private int m_tgtUtilCEBuffer_[];
2250 private int m_srcUtilCEBufferSize_;
2251 private int m_tgtUtilCEBufferSize_;
2253 private int m_srcUtilContOffset_;
2254 private int m_tgtUtilContOffset_;
2256 private int m_srcUtilOffset_;
2257 private int m_tgtUtilOffset_;
2259 // private methods -------------------------------------------------------
2261 private void init(String rules) throws Exception
2264 CollationParsedRuleBuilder builder
2265 = new CollationParsedRuleBuilder(rules);
2266 builder.setRules(this);
2272 private final int compareRegular(String source, String target, int offset) {
2273 if (m_srcUtilIter_ == null) {
2276 int strength = getStrength();
2277 // setting up the collator parameters
2278 m_utilCompare0_ = m_isCaseLevel_;
2279 //m_utilCompare1_ = true;
2280 m_utilCompare2_ = strength >= SECONDARY;
2281 m_utilCompare3_ = strength >= TERTIARY;
2282 m_utilCompare4_ = strength >= QUATERNARY;
2283 m_utilCompare5_ = strength == IDENTICAL;
2284 boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
2285 boolean doShift4 = m_isAlternateHandlingShifted_ && m_utilCompare4_;
2286 boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;
2288 if (doHiragana4 && doShift4) {
2289 String sourcesub = source.substring(offset);
2290 String targetsub = target.substring(offset);
2291 return compareBySortKeys(sourcesub, targetsub);
2294 // This is the lowest primary value that will not be ignored if shifted
2295 int lowestpvalue = m_isAlternateHandlingShifted_
2296 ? m_variableTopValue_ << 16 : 0;
2297 m_srcUtilCEBufferSize_ = 0;
2298 m_tgtUtilCEBufferSize_ = 0;
2299 int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
2301 if (m_srcUtilCEBufferSize_ == -1
2302 && m_tgtUtilCEBufferSize_ == -1) {
2303 // since the cebuffer is cleared when we have determined that
2304 // either source is greater than target or vice versa, the return
2305 // result is the comparison result and not the hiragana result
2309 int hiraganaresult = result;
2311 if (m_utilCompare2_) {
2312 result = doSecondaryCompare(doFrench);
2317 // doing the case bit
2318 if (m_utilCompare0_) {
2319 result = doCaseCompare();
2325 if (m_utilCompare3_) {
2326 result = doTertiaryCompare();
2332 if (doShift4) { // checkQuad
2333 result = doQuaternaryCompare(lowestpvalue);
2338 else if (doHiragana4 && hiraganaresult != 0) {
2339 // If we're fine on quaternaries, we might be different
2340 // on Hiragana. This, however, might fail us in shifted.
2341 return hiraganaresult;
2344 // For IDENTICAL comparisons, we use a bitwise character comparison
2345 // as a tiebreaker if all else is equal.
2346 // Getting here should be quite rare - strings are not identical -
2347 // that is checked first, but compared == through all other checks.
2348 if (m_utilCompare5_) {
2349 return doIdenticalCompare(source, target, offset, true);
2355 * Gets the 2 bytes of primary order and adds it to the primary byte array
2356 * @param ce current ce
2357 * @param notIsContinuation flag indicating if the current bytes belong to
2359 * @param doShift flag indicating if ce is to be shifted
2360 * @param leadPrimary lead primary used for compression
2361 * @param commonBottom4 common byte value for Quaternary
2362 * @param bottomCount4 smallest byte value for Quaternary
2363 * @return the new lead primary for compression
2365 private final int doPrimaryBytes(int ce, boolean notIsContinuation,
2366 boolean doShift, int leadPrimary,
2367 int commonBottom4, int bottomCount4)
2370 int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
2371 int p1 = ce >>> 8; // comparison
2373 if (m_utilCount4_ > 0) {
2374 while (m_utilCount4_ > bottomCount4) {
2375 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2376 (byte)(commonBottom4 + bottomCount4));
2377 m_utilBytesCount4_ ++;
2378 m_utilCount4_ -= bottomCount4;
2380 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2381 (byte)(commonBottom4
2382 + (m_utilCount4_ - 1)));
2383 m_utilBytesCount4_ ++;
2386 // dealing with a variable and we're treating them as shifted
2387 // This is a shifted ignorable
2389 // we need to check this since we could be in continuation
2390 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2392 m_utilBytesCount4_ ++;
2395 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2397 m_utilBytesCount4_ ++;
2401 // Note: This code assumes that the table is well built
2402 // i.e. not having 0 bytes where they are not supposed to be.
2403 // Usually, we'll have non-zero primary1 & primary2, except
2404 // in cases of LatinOne and friends, when primary2 will be
2405 // regular and simple sortkey calc
2406 if (p1 != CollationElementIterator.IGNORABLE) {
2407 if (notIsContinuation) {
2408 if (leadPrimary == p1) {
2409 m_utilBytes1_ = append(m_utilBytes1_,
2410 m_utilBytesCount1_, (byte)p2);
2411 m_utilBytesCount1_ ++;
2414 if (leadPrimary != 0) {
2415 m_utilBytes1_ = append(m_utilBytes1_,
2418 ? BYTE_UNSHIFTED_MAX_
2419 : BYTE_UNSHIFTED_MIN_));
2420 m_utilBytesCount1_ ++;
2422 if (p2 == CollationElementIterator.IGNORABLE) {
2423 // one byter, not compressed
2424 m_utilBytes1_ = append(m_utilBytes1_,
2427 m_utilBytesCount1_ ++;
2430 else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_
2431 || (p1 > maxRegularPrimary
2432 //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]
2434 && p1 < minImplicitPrimary
2435 //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]
2440 m_utilBytes1_ = append(m_utilBytes1_,
2443 m_utilBytesCount1_ ++;
2444 m_utilBytes1_ = append(m_utilBytes1_,
2447 m_utilBytesCount1_ ++;
2451 m_utilBytes1_ = append(m_utilBytes1_,
2454 m_utilBytesCount1_ ++;
2455 m_utilBytes1_ = append(m_utilBytes1_,
2456 m_utilBytesCount1_, (byte)p2);
2457 m_utilBytesCount1_ ++;
2462 // continuation, add primary to the key, no compression
2463 m_utilBytes1_ = append(m_utilBytes1_,
2464 m_utilBytesCount1_, (byte)p1);
2465 m_utilBytesCount1_ ++;
2466 if (p2 != CollationElementIterator.IGNORABLE) {
2467 m_utilBytes1_ = append(m_utilBytes1_,
2468 m_utilBytesCount1_, (byte)p2);
2470 m_utilBytesCount1_ ++;
2479 * Gets the secondary byte and adds it to the secondary byte array
2480 * @param ce current ce
2481 * @param notIsContinuation flag indicating if the current bytes belong to
2483 * @param doFrench flag indicator if french sort is to be performed
2485 private final void doSecondaryBytes(int ce, boolean notIsContinuation,
2488 int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
2491 // This is compression code.
2492 if (s == COMMON_2_ && notIsContinuation) {
2496 if (m_utilCount2_ > 0) {
2497 if (s > COMMON_2_) { // not necessary for 4th level.
2498 while (m_utilCount2_ > TOP_COUNT_2_) {
2499 m_utilBytes2_ = append(m_utilBytes2_,
2501 (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
2502 m_utilBytesCount2_ ++;
2503 m_utilCount2_ -= TOP_COUNT_2_;
2505 m_utilBytes2_ = append(m_utilBytes2_,
2507 (byte)(COMMON_TOP_2_
2508 - (m_utilCount2_ - 1)));
2509 m_utilBytesCount2_ ++;
2512 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2513 m_utilBytes2_ = append(m_utilBytes2_,
2515 (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2516 m_utilBytesCount2_ ++;
2517 m_utilCount2_ -= BOTTOM_COUNT_2_;
2519 m_utilBytes2_ = append(m_utilBytes2_,
2521 (byte)(COMMON_BOTTOM_2_
2522 + (m_utilCount2_ - 1)));
2523 m_utilBytesCount2_ ++;
2527 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2529 m_utilBytesCount2_ ++;
2533 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2535 m_utilBytesCount2_ ++;
2536 // Do the special handling for French secondaries
2537 // We need to get continuation elements and do intermediate
2539 // abc1c2c3de with french secondaries need to be edc1c2c3ba
2541 if (notIsContinuation) {
2542 if (m_utilFrenchStart_ != -1) {
2543 // reverse secondaries from frenchStartPtr up to
2545 reverseBuffer(m_utilBytes2_);
2546 m_utilFrenchStart_ = -1;
2550 if (m_utilFrenchStart_ == -1) {
2551 m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
2553 m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
2560 * Reverse the argument buffer
2561 * @param buffer to reverse
2563 private void reverseBuffer(byte buffer[])
2565 int start = m_utilFrenchStart_;
2566 int end = m_utilFrenchEnd_;
2567 while (start < end) {
2568 byte b = buffer[start];
2569 buffer[start ++] = buffer[end];
2575 * Insert the case shifting byte if required
2576 * @param caseshift value
2577 * @return new caseshift value
2579 private final int doCaseShift(int caseshift)
2581 if (caseshift == 0) {
2582 m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
2583 SORT_CASE_BYTE_START_);
2584 m_utilBytesCount0_ ++;
2585 caseshift = SORT_CASE_SHIFT_START_;
2591 * Performs the casing sort
2592 * @param tertiary byte in ints for easy comparison
2593 * @param notIsContinuation flag indicating if the current bytes belong to
2596 * @return the new value of case shift
2598 private final int doCaseBytes(int tertiary, boolean notIsContinuation,
2601 caseshift = doCaseShift(caseshift);
2603 if (notIsContinuation && tertiary != 0) {
2604 byte casebits = (byte)(tertiary & 0xC0);
2605 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
2606 if (casebits == 0) {
2607 m_utilBytes0_[m_utilBytesCount0_ - 1]
2608 |= (1 << (-- caseshift));
2612 caseshift = doCaseShift(caseshift - 1);
2613 m_utilBytes0_[m_utilBytesCount0_ - 1]
2614 |= ((casebits >> 6) & 1) << (-- caseshift);
2618 if (casebits != 0) {
2619 m_utilBytes0_[m_utilBytesCount0_ - 1]
2620 |= 1 << (-- caseshift);
2622 caseshift = doCaseShift(caseshift);
2623 m_utilBytes0_[m_utilBytesCount0_ - 1]
2624 |= ((casebits >> 7) & 1) << (-- caseshift);
2636 * Gets the tertiary byte and adds it to the tertiary byte array
2637 * @param tertiary byte in int for easy comparison
2638 * @param notIsContinuation flag indicating if the current bytes belong to
2641 private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)
2643 if (tertiary != 0) {
2644 // This is compression code.
2645 // sequence size check is included in the if clause
2646 if (tertiary == m_common3_ && notIsContinuation) {
2650 int common3 = m_common3_ & LAST_BYTE_MASK_;
2651 if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
2652 tertiary += m_addition3_;
2654 else if (tertiary <= common3
2655 && m_common3_ == COMMON_UPPER_FIRST_3_) {
2656 tertiary -= m_addition3_;
2658 if (m_utilCount3_ > 0) {
2659 if (tertiary > common3) {
2660 while (m_utilCount3_ > m_topCount3_) {
2661 m_utilBytes3_ = append(m_utilBytes3_,
2663 (byte)(m_top3_ - m_topCount3_));
2664 m_utilBytesCount3_ ++;
2665 m_utilCount3_ -= m_topCount3_;
2667 m_utilBytes3_ = append(m_utilBytes3_,
2670 - (m_utilCount3_ - 1)));
2671 m_utilBytesCount3_ ++;
2674 while (m_utilCount3_ > m_bottomCount3_) {
2675 m_utilBytes3_ = append(m_utilBytes3_,
2677 (byte)(m_bottom3_ + m_bottomCount3_));
2678 m_utilBytesCount3_ ++;
2679 m_utilCount3_ -= m_bottomCount3_;
2681 m_utilBytes3_ = append(m_utilBytes3_,
2684 + (m_utilCount3_ - 1)));
2685 m_utilBytesCount3_ ++;
2689 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
2691 m_utilBytesCount3_ ++;
2697 * Gets the Quaternary byte and adds it to the Quaternary byte array
2698 * @param isCodePointHiragana flag indicator if the previous codepoint
2699 * we dealt with was Hiragana
2700 * @param commonBottom4 smallest common Quaternary byte
2701 * @param bottomCount4 smallest Quaternary byte
2702 * @param hiragana4 hiragana Quaternary byte
2704 private final void doQuaternaryBytes(boolean isCodePointHiragana,
2705 int commonBottom4, int bottomCount4,
2708 if (isCodePointHiragana) { // This was Hiragana, need to note it
2709 if (m_utilCount4_ > 0) { // Close this part
2710 while (m_utilCount4_ > bottomCount4) {
2711 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2712 (byte)(commonBottom4
2714 m_utilBytesCount4_ ++;
2715 m_utilCount4_ -= bottomCount4;
2717 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2718 (byte)(commonBottom4
2719 + (m_utilCount4_ - 1)));
2720 m_utilBytesCount4_ ++;
2723 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
2724 hiragana4); // Add the Hiragana
2725 m_utilBytesCount4_ ++;
2727 else { // This wasn't Hiragana, so we can continue adding stuff
2733 * Iterates through the argument string for all ces.
2734 * Split the ces into their relevant primaries, secondaries etc.
2735 * @param source normalized string
2736 * @param doFrench flag indicator if special handling of French has to be
2738 * @param hiragana4 offset for Hiragana quaternary
2739 * @param commonBottom4 smallest common quaternary byte
2740 * @param bottomCount4 smallest quaternary byte
2742 private final void getSortKeyBytes(String source, boolean doFrench,
2743 byte hiragana4, int commonBottom4,
2747 if (m_srcUtilIter_ == null) {
2750 int backupDecomposition = getDecomposition();
2751 setDecomposition(NO_DECOMPOSITION); // have to revert to backup later
2752 m_srcUtilIter_.setText(source);
2753 m_srcUtilColEIter_.setText(m_srcUtilIter_);
2754 m_utilFrenchStart_ = -1;
2755 m_utilFrenchEnd_ = -1;
2757 // scriptorder not implemented yet
2758 // const uint8_t *scriptOrder = coll->scriptOrder;
2760 boolean doShift = false;
2761 boolean notIsContinuation = false;
2763 int leadPrimary = 0; // int for easier comparison
2767 int ce = m_srcUtilColEIter_.next();
2768 if (ce == CollationElementIterator.NULLORDER) {
2772 if (ce == CollationElementIterator.IGNORABLE) {
2776 notIsContinuation = !isContinuation(ce);
2779 * if (notIsContinuation) {
2780 if (scriptOrder != NULL) {
2781 primary1 = scriptOrder[primary1];
2784 boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
2785 // actually we can just check that the first byte is 0
2786 // generation stuffs the order left first
2787 boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)
2788 <= m_variableTopValue_;
2789 doShift = (m_isAlternateHandlingShifted_
2790 && ((notIsContinuation && isSmallerThanVariableTop
2791 && !isPrimaryByteIgnorable) // primary byte not 0
2792 || (!notIsContinuation && doShift))
2793 || (doShift && isPrimaryByteIgnorable));
2794 if (doShift && isPrimaryByteIgnorable) {
2795 // amendment to the UCA says that primary ignorables and other
2796 // ignorables should be removed if following a shifted code
2798 // if we were shifted and we got an ignorable code point
2799 // we should just completely ignore it
2802 leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,
2803 leadPrimary, commonBottom4,
2808 if (m_utilCompare2_) {
2809 doSecondaryBytes(ce, notIsContinuation, doFrench);
2812 int t = ce & LAST_BYTE_MASK_;
2813 if (!notIsContinuation) {
2814 t = ce & CE_REMOVE_CONTINUATION_MASK_;
2817 if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
2818 // do the case level if we need to do it. We don't want to calculate
2819 // case level for primary ignorables if we have only primary strength and case level
2820 // otherwise we would break well formedness of CEs
2821 caseShift = doCaseBytes(t, notIsContinuation, caseShift);
2823 else if (notIsContinuation) {
2829 if (m_utilCompare3_) {
2830 doTertiaryBytes(t, notIsContinuation);
2833 if (m_utilCompare4_ && notIsContinuation) { // compare quad
2834 doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,
2835 commonBottom4, bottomCount4, hiragana4);
2838 setDecomposition(backupDecomposition); // reverts to original
2839 if (m_utilFrenchStart_ != -1) {
2840 // one last round of checks
2841 reverseBuffer(m_utilBytes2_);
2846 * From the individual strength byte results the final compact sortkey
2847 * will be calculated.
2848 * @param source text string
2849 * @param doFrench flag indicating that special handling of French has to
2851 * @param commonBottom4 smallest common quaternary byte
2852 * @param bottomCount4 smallest quaternary byte
2853 * @param key output RawCollationKey to store results, key cannot be null
2855 private final void getSortKey(String source, boolean doFrench,
2858 RawCollationKey key)
2860 // we have done all the CE's, now let's put them together to form
2862 if (m_utilCompare2_) {
2863 doSecondary(doFrench);
2865 // adding case level should be independent of secondary level
2866 if (m_utilCompare0_) {
2869 if (m_utilCompare3_) {
2871 if (m_utilCompare4_) {
2872 doQuaternary(commonBottom4, bottomCount4);
2873 if (m_utilCompare5_) {
2874 doIdentical(source);
2879 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);
2880 m_utilBytesCount1_ ++;
2882 key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
2886 * Packs the French bytes
2888 private final void doFrench()
2890 for (int i = 0; i < m_utilBytesCount2_; i ++) {
2891 byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
2892 // This is compression code.
2893 if (s == COMMON_2_) {
2897 if (m_utilCount2_ > 0) {
2898 // getting the unsigned value
2899 if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
2900 // not necessary for 4th level.
2901 while (m_utilCount2_ > TOP_COUNT_2_) {
2902 m_utilBytes1_ = append(m_utilBytes1_,
2904 (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
2905 m_utilBytesCount1_ ++;
2906 m_utilCount2_ -= TOP_COUNT_2_;
2908 m_utilBytes1_ = append(m_utilBytes1_,
2910 (byte)(COMMON_TOP_2_
2911 - (m_utilCount2_ - 1)));
2912 m_utilBytesCount1_ ++;
2915 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2916 m_utilBytes1_ = append(m_utilBytes1_,
2918 (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
2919 m_utilBytesCount1_ ++;
2920 m_utilCount2_ -= BOTTOM_COUNT_2_;
2922 m_utilBytes1_ = append(m_utilBytes1_,
2924 (byte)(COMMON_BOTTOM_2_
2925 + (m_utilCount2_ - 1)));
2926 m_utilBytesCount1_ ++;
2930 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
2931 m_utilBytesCount1_ ++;
2934 if (m_utilCount2_ > 0) {
2935 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2936 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2937 (byte)(COMMON_BOTTOM_2_
2938 + BOTTOM_COUNT_2_));
2939 m_utilBytesCount1_ ++;
2940 m_utilCount2_ -= BOTTOM_COUNT_2_;
2942 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2943 (byte)(COMMON_BOTTOM_2_
2944 + (m_utilCount2_ - 1)));
2945 m_utilBytesCount1_ ++;
2950 * Compacts the secondary bytes and stores them into the primary array
2951 * @param doFrench flag indicator that French has to be handled specially
2953 private final void doSecondary(boolean doFrench)
2955 if (m_utilCount2_ > 0) {
2956 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
2957 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2958 (byte)(COMMON_BOTTOM_2_
2959 + BOTTOM_COUNT_2_));
2960 m_utilBytesCount2_ ++;
2961 m_utilCount2_ -= BOTTOM_COUNT_2_;
2963 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
2964 (byte)(COMMON_BOTTOM_2_ +
2965 (m_utilCount2_ - 1)));
2966 m_utilBytesCount2_ ++;
2969 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
2970 SORT_LEVEL_TERMINATOR_);
2971 m_utilBytesCount1_ ++;
2973 if (doFrench) { // do the reverse copy
2977 if (m_utilBytes1_.length <= m_utilBytesCount1_
2978 + m_utilBytesCount2_) {
2979 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
2980 m_utilBytesCount2_);
2982 System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
2983 m_utilBytesCount1_, m_utilBytesCount2_);
2984 m_utilBytesCount1_ += m_utilBytesCount2_;
2989 * Increase buffer size
2990 * @param buffer array of bytes
2991 * @param size of the byte array
2992 * @param incrementsize size to increase
2993 * @return the new buffer
2995 private static final byte[] increase(byte buffer[], int size,
2998 byte result[] = new byte[buffer.length + incrementsize];
2999 System.arraycopy(buffer, 0, result, 0, size);
3004 * Increase buffer size
3005 * @param buffer array of ints
3006 * @param size of the byte array
3007 * @param incrementsize size to increase
3008 * @return the new buffer
3010 private static final int[] increase(int buffer[], int size,
3013 int result[] = new int[buffer.length + incrementsize];
3014 System.arraycopy(buffer, 0, result, 0, size);
3019 * Compacts the case bytes and stores them into the primary array
3021 private final void doCase()
3023 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3024 SORT_LEVEL_TERMINATOR_);
3025 m_utilBytesCount1_ ++;
3026 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
3027 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3028 m_utilBytesCount0_);
3030 System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
3031 m_utilBytesCount0_);
3032 m_utilBytesCount1_ += m_utilBytesCount0_;
3036 * Compacts the tertiary bytes and stores them into the primary array
3038 private final void doTertiary()
3040 if (m_utilCount3_ > 0) {
3041 if (m_common3_ != COMMON_BOTTOM_3_) {
3042 while (m_utilCount3_ >= m_topCount3_) {
3043 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3044 (byte)(m_top3_ - m_topCount3_));
3045 m_utilBytesCount3_ ++;
3046 m_utilCount3_ -= m_topCount3_;
3048 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3049 (byte)(m_top3_ - m_utilCount3_));
3050 m_utilBytesCount3_ ++;
3053 while (m_utilCount3_ > m_bottomCount3_) {
3054 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3056 + m_bottomCount3_));
3057 m_utilBytesCount3_ ++;
3058 m_utilCount3_ -= m_bottomCount3_;
3060 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
3062 + (m_utilCount3_ - 1)));
3063 m_utilBytesCount3_ ++;
3066 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3067 SORT_LEVEL_TERMINATOR_);
3068 m_utilBytesCount1_ ++;
3069 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
3070 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3071 m_utilBytesCount3_);
3073 System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
3074 m_utilBytesCount3_);
3075 m_utilBytesCount1_ += m_utilBytesCount3_;
3079 * Compacts the quaternary bytes and stores them into the primary array
3081 private final void doQuaternary(int commonbottom4, int bottomcount4)
3083 if (m_utilCount4_ > 0) {
3084 while (m_utilCount4_ > bottomcount4) {
3085 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
3086 (byte)(commonbottom4 + bottomcount4));
3087 m_utilBytesCount4_ ++;
3088 m_utilCount4_ -= bottomcount4;
3090 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
3091 (byte)(commonbottom4
3092 + (m_utilCount4_ - 1)));
3093 m_utilBytesCount4_ ++;
3095 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3096 SORT_LEVEL_TERMINATOR_);
3097 m_utilBytesCount1_ ++;
3098 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
3099 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3100 m_utilBytesCount4_);
3102 System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
3103 m_utilBytesCount4_);
3104 m_utilBytesCount1_ += m_utilBytesCount4_;
3108 * Deals with the identical sort.
3109 * Appends the BOCSU version of the source string to the ends of the
3111 * @param source text string
3113 private final void doIdentical(String source)
3115 int isize = BOCU.getCompressionLength(source);
3116 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
3117 SORT_LEVEL_TERMINATOR_);
3118 m_utilBytesCount1_ ++;
3119 if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
3120 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
3123 m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
3124 m_utilBytesCount1_);
3128 * Gets the offset of the first unmatched characters in source and target.
3129 * This method returns the offset of the start of a contraction or a
3130 * combining sequence, if the first difference is in the middle of such a
3132 * @param source string
3133 * @param target string
3134 * @return offset of the first unmatched characters in source and target.
3136 private final int getFirstUnmatchedOffset(String source, String target)
3139 int slength = source.length();
3140 int tlength = target.length();
3141 int minlength = slength;
3142 if (minlength > tlength) {
3143 minlength = tlength;
3145 while (result < minlength
3146 && source.charAt(result) == target.charAt(result)) {
3150 // There is an identical portion at the beginning of the two
3151 // strings. If the identical portion ends within a contraction or a
3152 // combining character sequence, back up to the start of that
3156 if (result < minlength) {
3157 schar = source.charAt(result); // first differing chars
3158 tchar = target.charAt(result);
3161 schar = source.charAt(minlength - 1);
3162 if (isUnsafe(schar)) {
3165 else if (slength == tlength) {
3168 else if (slength < tlength) {
3169 tchar = target.charAt(result);
3172 schar = source.charAt(result);
3175 if (isUnsafe(schar) || isUnsafe(tchar))
3177 // We are stopped in the middle of a contraction or combining
3179 // Look backwards for the part of the string for the start of
3181 // It doesn't matter which string we scan, since they are the
3182 // same in this region.
3186 while (result > 0 && isUnsafe(source.charAt(result)));
3193 * Appending an byte to an array of bytes and increases it if we run out of
3195 * @param array of byte arrays
3196 * @param appendindex index in the byte array to append
3197 * @param value to append
3198 * @return array if array size can accomodate the new value, otherwise
3199 * a bigger array will be created and returned
3201 private static final byte[] append(byte array[], int appendindex,
3205 array[appendindex] = value;
3207 catch (ArrayIndexOutOfBoundsException e) {
3208 array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
3209 array[appendindex] = value;
3215 * This is a trick string compare function that goes in and uses sortkeys
3216 * to compare. It is used when compare gets in trouble and needs to bail
3218 * @param source text string
3219 * @param target text string
3221 private final int compareBySortKeys(String source, String target)
3224 m_utilRawCollationKey_ = getRawCollationKey(source,
3225 m_utilRawCollationKey_);
3226 // this method is very seldom called
3227 RawCollationKey targetkey = getRawCollationKey(target, null);
3228 return m_utilRawCollationKey_.compareTo(targetkey);
3232 * Performs the primary comparisons, and fills up the CE buffer at the
3234 * The return value toggles between the comparison result and the hiragana
3235 * result. If either the source is greater than target or vice versa, the
3236 * return result is the comparison result, ie 1 or -1, furthermore the
3237 * cebuffers will be cleared when that happens. If the primary comparisons
3238 * are equal, we'll have to continue with secondary comparison. In this case
3239 * the cebuffer will not be cleared and the return result will be the
3241 * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
3243 * @param lowestpvalue the lowest primary value that will not be ignored if
3244 * alternate handling is shifted
3245 * @param source text string
3246 * @param target text string
3247 * @param textoffset offset in text to start the comparison
3248 * @return comparion result if a primary difference is found, otherwise
3251 private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
3252 String source, String target,
3256 // Preparing the context objects for iterating over strings
3257 m_srcUtilIter_.setText(source);
3258 m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
3259 m_tgtUtilIter_.setText(target);
3260 m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);
3262 // Non shifted primary processing is quite simple
3263 if (!m_isAlternateHandlingShifted_) {
3264 int hiraganaresult = 0;
3267 // We fetch CEs until we hit a non ignorable primary or end.
3269 sorder = m_srcUtilColEIter_.next();
3270 m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
3271 m_srcUtilCEBufferSize_, sorder);
3272 m_srcUtilCEBufferSize_ ++;
3273 sorder &= CE_PRIMARY_MASK_;
3274 } while (sorder == CollationElementIterator.IGNORABLE);
3278 torder = m_tgtUtilColEIter_.next();
3279 m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
3280 m_tgtUtilCEBufferSize_, torder);
3281 m_tgtUtilCEBufferSize_ ++;
3282 torder &= CE_PRIMARY_MASK_;
3283 } while (torder == CollationElementIterator.IGNORABLE);
3285 // if both primaries are the same
3286 if (sorder == torder) {
3287 // and there are no more CEs, we advance to the next level
3288 // see if we are at the end of either string
3289 if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
3290 == CollationElementIterator.NULLORDER) {
3291 if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
3292 != CollationElementIterator.NULLORDER) {
3297 else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
3298 == CollationElementIterator.NULLORDER) {
3301 if (doHiragana4 && hiraganaresult == 0
3302 && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
3303 m_tgtUtilColEIter_.m_isCodePointHiragana_) {
3304 if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
3305 hiraganaresult = -1;
3313 // if two primaries are different, we are done
3314 return endPrimaryCompare(sorder, torder);
3317 // no primary difference... do the rest from the buffers
3318 return hiraganaresult;
3320 else { // shifted - do a slightly more complicated processing :)
3322 int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
3323 lowestpvalue, true);
3324 int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
3325 lowestpvalue, false);
3326 if (sorder == torder) {
3327 if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
3328 == CollationElementIterator.NULLORDER) {
3336 return endPrimaryCompare(sorder, torder);
3338 } // no primary difference... do the rest from the buffers
3344 * This is used only for primary strength when we know that sorder is
3345 * already different from torder.
3346 * Compares sorder and torder, returns -1 if sorder is less than torder.
3347 * Clears the cebuffer at the same time.
3348 * @param sorder source strength order
3349 * @param torder target strength order
3350 * @return the comparison result of sorder and torder
3352 private final int endPrimaryCompare(int sorder, int torder)
3354 // if we reach here, the ce offset accessed is the last ce
3355 // appended to the buffer
3356 boolean isSourceNullOrder = (m_srcUtilCEBuffer_[
3357 m_srcUtilCEBufferSize_ - 1]
3358 == CollationElementIterator.NULLORDER);
3359 boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[
3360 m_tgtUtilCEBufferSize_ - 1]
3361 == CollationElementIterator.NULLORDER);
3362 m_srcUtilCEBufferSize_ = -1;
3363 m_tgtUtilCEBufferSize_ = -1;
3364 if (isSourceNullOrder) {
3367 if (isTargetNullOrder) {
3370 // getting rid of the sign
3371 sorder >>>= CE_PRIMARY_SHIFT_;
3372 torder >>>= CE_PRIMARY_SHIFT_;
3373 if (sorder < torder) {
3380 * Calculates the next primary shifted value and fills up cebuffer with the
3381 * next non-ignorable ce.
3382 * @param coleiter collation element iterator
3383 * @param doHiragana4 flag indicator if hiragana quaternary is to be
3385 * @param lowestpvalue lowest primary shifted value that will not be
3387 * @return result next modified ce
3389 private final int getPrimaryShiftedCompareCE(
3390 CollationElementIterator coleiter,
3391 int lowestpvalue, boolean isSrc)
3394 boolean shifted = false;
3395 int result = CollationElementIterator.IGNORABLE;
3396 int cebuffer[] = m_srcUtilCEBuffer_;
3397 int cebuffersize = m_srcUtilCEBufferSize_;
3399 cebuffer = m_tgtUtilCEBuffer_;
3400 cebuffersize = m_tgtUtilCEBufferSize_;
3403 result = coleiter.next();
3404 if (result == CollationElementIterator.NULLORDER) {
3405 cebuffer = append(cebuffer, cebuffersize, result);
3409 else if (result == CollationElementIterator.IGNORABLE
3411 && (result & CE_PRIMARY_MASK_)
3412 == CollationElementIterator.IGNORABLE)) {
3413 // UCA amendment - ignore ignorables that follow shifted code
3417 else if (isContinuation(result)) {
3418 if ((result & CE_PRIMARY_MASK_)
3419 != CollationElementIterator.IGNORABLE) {
3420 // There is primary value
3422 result = (result & CE_PRIMARY_MASK_)
3423 | CE_CONTINUATION_MARKER_;
3424 // preserve interesting continuation
3425 cebuffer = append(cebuffer, cebuffersize, result);
3430 cebuffer = append(cebuffer, cebuffersize, result);
3435 else { // Just lower level values
3437 cebuffer = append(cebuffer, cebuffersize, result);
3443 if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
3444 lowestpvalue) > 0) {
3445 cebuffer = append(cebuffer, cebuffersize, result);
3450 if ((result & CE_PRIMARY_MASK_) != 0) {
3452 result &= CE_PRIMARY_MASK_;
3453 cebuffer = append(cebuffer, cebuffersize, result);
3458 cebuffer = append(cebuffer, cebuffersize, result);
3467 m_srcUtilCEBuffer_ = cebuffer;
3468 m_srcUtilCEBufferSize_ = cebuffersize;
3471 m_tgtUtilCEBuffer_ = cebuffer;
3472 m_tgtUtilCEBufferSize_ = cebuffersize;
3474 result &= CE_PRIMARY_MASK_;
3479 * Appending an int to an array of ints and increases it if we run out of
3481 * @param array of int arrays
3482 * @param appendindex index at which value will be appended
3483 * @param value to append
3484 * @return array if size is not increased, otherwise a new array will be
3487 private static final int[] append(int array[], int appendindex, int value)
3489 if (appendindex + 1 >= array.length) {
3490 array = increase(array, appendindex, CE_BUFFER_SIZE_);
3492 array[appendindex] = value;
3497 * Does secondary strength comparison based on the collected ces.
3498 * @param doFrench flag indicates if French ordering is to be done
3499 * @return the secondary strength comparison result
3501 private final int doSecondaryCompare(boolean doFrench)
3503 // now, we're gonna reexamine collected CEs
3504 if (!doFrench) { // normal
3508 int sorder = CollationElementIterator.IGNORABLE;
3509 while (sorder == CollationElementIterator.IGNORABLE) {
3510 sorder = m_srcUtilCEBuffer_[soffset ++]
3511 & CE_SECONDARY_MASK_;
3513 int torder = CollationElementIterator.IGNORABLE;
3514 while (torder == CollationElementIterator.IGNORABLE) {
3515 torder = m_tgtUtilCEBuffer_[toffset ++]
3516 & CE_SECONDARY_MASK_;
3519 if (sorder == torder) {
3520 if (m_srcUtilCEBuffer_[soffset - 1]
3521 == CollationElementIterator.NULLORDER) {
3522 if (m_tgtUtilCEBuffer_[toffset - 1]
3523 != CollationElementIterator.NULLORDER) {
3528 else if (m_tgtUtilCEBuffer_[toffset - 1]
3529 == CollationElementIterator.NULLORDER) {
3534 if (m_srcUtilCEBuffer_[soffset - 1] ==
3535 CollationElementIterator.NULLORDER) {
3538 if (m_tgtUtilCEBuffer_[toffset - 1] ==
3539 CollationElementIterator.NULLORDER) {
3542 return (sorder < torder) ? -1 : 1;
3546 else { // do the French
3547 m_srcUtilContOffset_ = 0;
3548 m_tgtUtilContOffset_ = 0;
3549 m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
3550 m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
3552 int sorder = getSecondaryFrenchCE(true);
3553 int torder = getSecondaryFrenchCE(false);
3554 if (sorder == torder) {
3555 if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
3556 || (m_srcUtilOffset_ >= 0
3557 && m_srcUtilCEBuffer_[m_srcUtilOffset_]
3558 == CollationElementIterator.NULLORDER)) {
3563 return (sorder < torder) ? -1 : 1;
3571 * Calculates the next secondary french CE.
3572 * @param isSrc flag indicator if we are calculating the src ces
3573 * @return result next modified ce
3575 private final int getSecondaryFrenchCE(boolean isSrc)
3577 int result = CollationElementIterator.IGNORABLE;
3578 int offset = m_srcUtilOffset_;
3579 int continuationoffset = m_srcUtilContOffset_;
3580 int cebuffer[] = m_srcUtilCEBuffer_;
3582 offset = m_tgtUtilOffset_;
3583 continuationoffset = m_tgtUtilContOffset_;
3584 cebuffer = m_tgtUtilCEBuffer_;
3587 while (result == CollationElementIterator.IGNORABLE
3589 if (continuationoffset == 0) {
3590 result = cebuffer[offset];
3591 while (isContinuation(cebuffer[offset --])){
3593 // after this, sorder is at the start of continuation,
3594 // and offset points before that
3595 if (isContinuation(cebuffer[offset + 1])) {
3596 // save offset for later
3597 continuationoffset = offset;
3602 result = cebuffer[offset ++];
3603 if (!isContinuation(result)) {
3604 // we have finished with this continuation
3605 offset = continuationoffset;
3606 // reset the pointer to before continuation
3607 continuationoffset = 0;
3611 result &= CE_SECONDARY_MASK_; // remove continuation bit
3614 m_srcUtilOffset_ = offset;
3615 m_srcUtilContOffset_ = continuationoffset;
3618 m_tgtUtilOffset_ = offset;
3619 m_tgtUtilContOffset_ = continuationoffset;
3625 * Does case strength comparison based on the collected ces.
3626 * @return the case strength comparison result
3628 private final int doCaseCompare()
3633 int sorder = CollationElementIterator.IGNORABLE;
3634 int torder = CollationElementIterator.IGNORABLE;
3635 while ((sorder & CE_REMOVE_CASE_)
3636 == CollationElementIterator.IGNORABLE) {
3637 sorder = m_srcUtilCEBuffer_[soffset ++];
3638 if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3639 // primary ignorables should not be considered on the case level when the strength is primary
3640 // otherwise, the CEs stop being well-formed
3641 sorder &= CE_CASE_MASK_3_;
3642 sorder ^= m_caseSwitch_;
3645 sorder = CollationElementIterator.IGNORABLE;
3649 while ((torder & CE_REMOVE_CASE_)
3650 == CollationElementIterator.IGNORABLE) {
3651 torder = m_tgtUtilCEBuffer_[toffset ++];
3652 if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3653 // primary ignorables should not be considered on the case level when the strength is primary
3654 // otherwise, the CEs stop being well-formed
3655 torder &= CE_CASE_MASK_3_;
3656 torder ^= m_caseSwitch_;
3659 torder = CollationElementIterator.IGNORABLE;
3663 sorder &= CE_CASE_BIT_MASK_;
3664 torder &= CE_CASE_BIT_MASK_;
3665 if (sorder == torder) {
3666 // checking end of strings
3667 if (m_srcUtilCEBuffer_[soffset - 1]
3668 == CollationElementIterator.NULLORDER) {
3669 if (m_tgtUtilCEBuffer_[toffset - 1]
3670 != CollationElementIterator.NULLORDER) {
3675 else if (m_tgtUtilCEBuffer_[toffset - 1]
3676 == CollationElementIterator.NULLORDER) {
3681 if (m_srcUtilCEBuffer_[soffset - 1]
3682 == CollationElementIterator.NULLORDER) {
3685 if (m_tgtUtilCEBuffer_[soffset - 1]
3686 == CollationElementIterator.NULLORDER) {
3689 return (sorder < torder) ? -1 : 1;
3696 * Does tertiary strength comparison based on the collected ces.
3697 * @return the tertiary strength comparison result
3699 private final int doTertiaryCompare()
3704 int sorder = CollationElementIterator.IGNORABLE;
3705 int torder = CollationElementIterator.IGNORABLE;
3706 while ((sorder & CE_REMOVE_CASE_)
3707 == CollationElementIterator.IGNORABLE) {
3708 sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
3709 if (!isContinuation(sorder)) {
3710 sorder ^= m_caseSwitch_;
3713 sorder &= CE_REMOVE_CASE_;
3717 while ((torder & CE_REMOVE_CASE_)
3718 == CollationElementIterator.IGNORABLE) {
3719 torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
3720 if (!isContinuation(torder)) {
3721 torder ^= m_caseSwitch_;
3724 torder &= CE_REMOVE_CASE_;
3728 if (sorder == torder) {
3729 if (m_srcUtilCEBuffer_[soffset - 1]
3730 == CollationElementIterator.NULLORDER) {
3731 if (m_tgtUtilCEBuffer_[toffset - 1]
3732 != CollationElementIterator.NULLORDER) {
3737 else if (m_tgtUtilCEBuffer_[toffset - 1]
3738 == CollationElementIterator.NULLORDER) {
3743 if (m_srcUtilCEBuffer_[soffset - 1] ==
3744 CollationElementIterator.NULLORDER) {
3747 if (m_tgtUtilCEBuffer_[toffset - 1] ==
3748 CollationElementIterator.NULLORDER) {
3751 return (sorder < torder) ? -1 : 1;
3758 * Does quaternary strength comparison based on the collected ces.
3759 * @param lowestpvalue the lowest primary value that will not be ignored if
3760 * alternate handling is shifted
3761 * @return the quaternary strength comparison result
3763 private final int doQuaternaryCompare(int lowestpvalue)
3765 boolean sShifted = true;
3766 boolean tShifted = true;
3770 int sorder = CollationElementIterator.IGNORABLE;
3771 int torder = CollationElementIterator.IGNORABLE;
3772 while (sorder == CollationElementIterator.IGNORABLE
3773 || (isContinuation(sorder) && !sShifted)) {
3774 sorder = m_srcUtilCEBuffer_[soffset ++];
3775 if (isContinuation(sorder)) {
3780 else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
3781 || (sorder & CE_PRIMARY_MASK_)
3782 == CollationElementIterator.IGNORABLE) {
3784 sorder = CE_PRIMARY_MASK_;
3791 sorder >>>= CE_PRIMARY_SHIFT_;
3792 while (torder == CollationElementIterator.IGNORABLE
3793 || (isContinuation(torder) && !tShifted)) {
3794 torder = m_tgtUtilCEBuffer_[toffset ++];
3795 if (isContinuation(torder)) {
3800 else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
3801 || (torder & CE_PRIMARY_MASK_)
3802 == CollationElementIterator.IGNORABLE) {
3804 torder = CE_PRIMARY_MASK_;
3811 torder >>>= CE_PRIMARY_SHIFT_;
3813 if (sorder == torder) {
3814 if (m_srcUtilCEBuffer_[soffset - 1]
3815 == CollationElementIterator.NULLORDER) {
3816 if (m_tgtUtilCEBuffer_[toffset - 1]
3817 != CollationElementIterator.NULLORDER) {
3822 else if (m_tgtUtilCEBuffer_[toffset - 1]
3823 == CollationElementIterator.NULLORDER) {
3828 if (m_srcUtilCEBuffer_[soffset - 1] ==
3829 CollationElementIterator.NULLORDER) {
3832 if (m_tgtUtilCEBuffer_[toffset - 1] ==
3833 CollationElementIterator.NULLORDER) {
3836 return (sorder < torder) ? -1 : 1;
3843 * Internal function. Does byte level string compare. Used by strcoll if
3844 * strength == identical and strings are otherwise equal. This is a rare
3845 * case. Comparison must be done on NFD normalized strings. FCD is not good
3847 * @param source text
3848 * @param target text
3849 * @param offset of the first difference in the text strings
3850 * @param normalize flag indicating if we are to normalize the text before
3852 * @return 1 if source is greater than target, -1 less than and 0 if equals
3854 private static final int doIdenticalCompare(String source, String target,
3855 int offset, boolean normalize)
3859 if (Normalizer.quickCheck(source, Normalizer.NFD,0)
3860 != Normalizer.YES) {
3861 source = Normalizer.decompose(source, false);
3864 if (Normalizer.quickCheck(target, Normalizer.NFD,0)
3865 != Normalizer.YES) {
3866 target = Normalizer.decompose(target, false);
3871 return doStringCompare(source, target, offset);
3875 * Compares string for their codepoint order.
3876 * This comparison handles surrogate characters and place them after the
3877 * all non surrogate characters.
3878 * @param source text
3879 * @param target text
3880 * @param offset start offset for comparison
3881 * @return 1 if source is greater than target, -1 less than and 0 if equals
3883 private static final int doStringCompare(String source,
3887 // compare identical prefixes - they do not need to be fixed up
3890 int slength = source.length();
3891 int tlength = target.length();
3892 int minlength = Math.min(slength, tlength);
3893 while (offset < minlength) {
3894 schar = source.charAt(offset);
3895 tchar = target.charAt(offset ++);
3896 if (schar != tchar) {
3901 if (schar == tchar && offset == minlength) {
3902 if (slength > minlength) {
3905 if (tlength > minlength) {
3911 // if both values are in or above the surrogate range, Fix them up.
3912 if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
3913 && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
3914 schar = fixupUTF16(schar);
3915 tchar = fixupUTF16(tchar);
3918 // now c1 and c2 are in UTF-32-compatible order
3919 return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
3923 * Rotate surrogates to the top to get code point order
3925 private static final char fixupUTF16(char ch)
3937 * Resets the internal case data members and compression values.
3939 private void updateInternalState()
3941 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
3942 m_caseSwitch_ = CASE_SWITCH_;
3945 m_caseSwitch_ = NO_CASE_SWITCH_;
3948 if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
3949 m_mask3_ = CE_REMOVE_CASE_;
3950 m_common3_ = COMMON_NORMAL_3_;
3951 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
3952 m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
3953 m_bottom3_ = COMMON_BOTTOM_3_;
3956 m_mask3_ = CE_KEEP_CASE_;
3957 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
3958 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
3959 m_common3_ = COMMON_UPPER_FIRST_3_;
3960 m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
3961 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
3963 m_common3_ = COMMON_NORMAL_3_;
3964 m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
3965 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
3969 // Set the compression values
3970 int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
3971 // we multilply double with int, but need only int
3972 m_topCount3_ = (int)(PROPORTION_3_ * total3);
3973 m_bottomCount3_ = total3 - m_topCount3_;
3975 if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
3976 && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
3977 m_isSimple3_ = true;
3980 m_isSimple3_ = false;
3982 if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
3983 && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
3984 if(latinOneCEs_ == null || latinOneRegenTable_) {
3985 if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
3986 latinOneUse_ = true;
3988 latinOneUse_ = false;
3989 latinOneFailed_ = true;
3991 latinOneRegenTable_ = false;
3992 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
3993 latinOneUse_ = true;
3996 latinOneUse_ = false;
4002 * Initializes the RuleBasedCollator
4004 private final void init()
4006 for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;
4008 // Find the smallest unsafe char.
4009 if (isUnsafe(m_minUnsafe_)) {
4014 for (m_minContractionEnd_ = 0;
4015 m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;
4016 m_minContractionEnd_ ++) {
4017 // Find the smallest contraction-ending char.
4018 if (isContractionEnd(m_minContractionEnd_)) {
4022 latinOneFailed_ = true;
4023 setStrength(m_defaultStrength_);
4024 setDecomposition(m_defaultDecomposition_);
4025 m_variableTopValue_ = m_defaultVariableTopValue_;
4026 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
4027 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
4028 m_isCaseLevel_ = m_defaultIsCaseLevel_;
4029 m_caseFirst_ = m_defaultCaseFirst_;
4030 m_isHiragana4_ = m_defaultIsHiragana4_;
4031 m_isNumericCollation_ = m_defaultIsNumericCollation_;
4032 latinOneFailed_ = false;
4033 updateInternalState();
4037 * Initializes utility iterators and byte buffer used by compare
4039 private final void initUtility(boolean allocate) {
4041 if (m_srcUtilIter_ == null) {
4042 m_srcUtilIter_ = new StringUCharacterIterator();
4043 m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this);
4044 m_tgtUtilIter_ = new StringUCharacterIterator();
4045 m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this);
4046 m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
4047 m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
4048 m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
4049 m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
4050 m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
4051 m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
4052 m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
4055 m_srcUtilIter_ = null;
4056 m_srcUtilColEIter_ = null;
4057 m_tgtUtilIter_ = null;
4058 m_tgtUtilColEIter_ = null;
4059 m_utilBytes0_ = null;
4060 m_utilBytes1_ = null;
4061 m_utilBytes2_ = null;
4062 m_utilBytes3_ = null;
4063 m_utilBytes4_ = null;
4064 m_srcUtilCEBuffer_ = null;
4065 m_tgtUtilCEBuffer_ = null;
4069 // Consts for Latin-1 special processing
4070 private static final int ENDOFLATINONERANGE_ = 0xFF;
4071 private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50);
4072 private static final int BAIL_OUT_CE_ = 0xFF000000;
4075 * Generate latin-1 tables
4078 private class shiftValues {
4085 addLatinOneEntry(char ch, int CE, shiftValues sh) {
4086 int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
4087 boolean reverseSecondary = false;
4088 if(!isContinuation(CE)) {
4089 tertiary = ((CE & m_mask3_));
4090 tertiary ^= m_caseSwitch_;
4091 reverseSecondary = true;
4093 tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));
4094 tertiary &= CE_REMOVE_CASE_;
4095 reverseSecondary = false;
4098 secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
4099 primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
4100 primary1 = (CE >>> 8);
4103 latinOneCEs_[ch] |= (primary1 << sh.primShift);
4107 if(sh.primShift < 0) {
4108 latinOneCEs_[ch] = BAIL_OUT_CE_;
4109 latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
4110 latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
4113 latinOneCEs_[ch] |= (primary2 << sh.primShift);
4116 if(secondary != 0) {
4117 if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary
4118 latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary
4119 latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);
4120 } else { // normal case
4121 latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);
4126 latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);
4132 resizeLatinOneTable(int newSize) {
4133 int newTable[] = new int[3*newSize];
4134 int sizeToCopy = ((newSize<latinOneTableLen_)?newSize:latinOneTableLen_);
4135 //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
4136 System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
4137 System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
4138 System.arraycopy(latinOneCEs_, 2*latinOneTableLen_, newTable, 2*newSize, sizeToCopy);
4139 latinOneTableLen_ = newSize;
4140 latinOneCEs_ = newTable;
4143 private final boolean setUpLatinOne() {
4144 if(latinOneCEs_ == null || m_reallocLatinOneCEs_) {
4145 latinOneCEs_ = new int[3*LATINONETABLELEN_];
4146 latinOneTableLen_ = LATINONETABLELEN_;
4147 m_reallocLatinOneCEs_ = false;
4149 Arrays.fill(latinOneCEs_, 0);
4151 if(m_ContInfo_ == null) {
4152 m_ContInfo_ = new ContractionInfo();
4155 //StringBuffer sCh = new StringBuffer();
4156 //CollationElementIterator it = getCollationElementIterator(sCh.toString());
4157 CollationElementIterator it = getCollationElementIterator("");
4159 shiftValues s = new shiftValues();
4161 char contractionOffset = ENDOFLATINONERANGE_+1;
4163 for(ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
4164 s.primShift = 24; s.secShift = 24; s.terShift = 24;
4166 CE = m_trie_.getLatin1LinearValue(ch);
4168 CE = m_trie_.getLeadValue(ch);
4169 if(CE == CollationElementIterator.CE_NOT_FOUND_) {
4170 CE = UCA_.m_trie_.getLeadValue(ch);
4173 if(!isSpecial(CE)) {
4174 addLatinOneEntry(ch, CE, s);
4176 switch (RuleBasedCollator.getTag(CE)) {
4177 case CollationElementIterator.CE_EXPANSION_TAG_:
4178 case CollationElementIterator.CE_DIGIT_TAG_:
4179 //sCh.delete(0, sCh.length());
4181 //it.setText(sCh.toString());
4182 it.setText(UCharacter.toString(ch));
4183 while((CE = it.next()) != CollationElementIterator.NULLORDER) {
4184 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4185 latinOneCEs_[ch] = BAIL_OUT_CE_;
4186 latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
4187 latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
4190 addLatinOneEntry(ch, CE, s);
4193 case CollationElementIterator.CE_CONTRACTION_TAG_:
4194 // here is the trick
4195 // F2 is contraction. We do something very similar to contractions
4196 // but have two indices, one in the real contraction table and the
4197 // other to where we stuffed things. This hopes that we don't have
4198 // many contractions (this should work for latin-1 tables).
4200 if((CE & 0x00FFF000) != 0) {
4201 latinOneFailed_ = true;
4205 int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; //getContractionOffset(CE)]
4207 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
4209 latinOneCEs_[ch] = CE;
4210 latinOneCEs_[latinOneTableLen_+ch] = CE;
4211 latinOneCEs_[2*latinOneTableLen_+ch] = CE;
4213 // We're going to jump into contraction table, pick the elements
4216 //CE = *(contractionCEs + (UCharOffset - contractionIndex));
4217 CE = m_contractionCE_[UCharOffset];
4220 == CollationElementIterator.CE_EXPANSION_TAG_) {
4221 int i; /* general counter */
4222 //uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to expansion table */
4223 int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE);
4224 int size = CE & 0xF; // getExpansionCount(CE);
4226 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
4227 for(i = 0; i<size; i++) {
4228 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4229 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4230 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
4231 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
4234 addLatinOneEntry(contractionOffset, m_expansion_[offset+i], s);
4236 } else { /* else, we do */
4237 while(m_expansion_[offset] != 0) {
4238 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
4239 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4240 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
4241 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
4244 addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
4247 contractionOffset++;
4248 } else if(!isSpecial(CE)) {
4249 addLatinOneEntry(contractionOffset++, CE, s);
4251 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
4252 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
4253 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
4254 contractionOffset++;
4257 s.primShift = 24; s.secShift = 24; s.terShift = 24;
4258 if(contractionOffset == latinOneTableLen_) { // we need to reallocate
4259 resizeLatinOneTable(2*latinOneTableLen_);
4261 } while(m_contractionIndex_[UCharOffset] != 0xFFFF);
4264 case CollationElementIterator.CE_SPEC_PROC_TAG_:
4266 // 0xB7 is a precontext character defined in UCA5.1, a special
4267 // handle is implemeted in order to save LatinOne table for
4270 addLatinOneEntry(ch, CE, s);
4273 latinOneFailed_ = true;
4279 latinOneFailed_ = true;
4285 if(contractionOffset < latinOneTableLen_) {
4286 resizeLatinOneTable(contractionOffset);
4291 private class ContractionInfo {
4295 ContractionInfo m_ContInfo_;
4298 getLatinOneContraction(int strength, int CE, String s) {
4299 //int strength, int CE, String s, Integer ind) {
4300 int len = s.length();
4301 //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
4302 int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
4304 int latinOneOffset = (CE & 0x00FFF000) >>> 12;
4305 char schar = 0, tchar = 0;
4310 if(s[*index] == 0) { // end of string
4311 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
4317 if(m_ContInfo_.index == len) {
4318 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
4320 schar = s.charAt(m_ContInfo_.index);
4324 while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
4328 if (schar == tchar) {
4329 m_ContInfo_.index++;
4330 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);
4334 if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {
4335 return BAIL_OUT_CE_;
4337 // skip completely ignorables
4338 int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
4339 if(isZeroCE == 0) { // we have to ignore completely ignorables
4340 m_ContInfo_.index++;
4344 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
4351 * This is a fast strcoll, geared towards text in Latin-1.
4352 * It supports contractions of size two, French secondaries
4353 * and case switching. You can use it with strengths primary
4354 * to tertiary. It does not support shifted and case level.
4355 * It relies on the table build by setupLatin1Table. If it
4356 * doesn't understand something, it will go to the regular
4360 compareUseLatin1(String source, String target, int startOffset)
4362 int sLen = source.length();
4363 int tLen = target.length();
4365 int strength = getStrength();
4367 int sIndex = startOffset, tIndex = startOffset;
4368 char sChar = 0, tChar = 0;
4369 int sOrder=0, tOrder=0;
4371 boolean endOfSource = false;
4373 //uint32_t *elements = coll->latinOneCEs;
4375 boolean haveContractions = false; // if we have contractions in our string
4376 // we cannot do French secondary
4378 int offset = latinOneTableLen_;
4380 // Do the primary level
4383 while(sOrder==0) { // this loop skips primary ignorables
4384 // sOrder=getNextlatinOneCE(source);
4389 sChar=source.charAt(sIndex++); //[sIndex++];
4391 if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4392 //fprintf(stderr, "R");
4393 return compareRegular(source, target, startOffset);
4395 sOrder = latinOneCEs_[sChar];
4396 if(isSpecial(sOrder)) { // if we got a special
4397 // specials can basically be either contractions or bail-out signs. If we get anything
4398 // else, we'll bail out anywasy
4399 if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4400 m_ContInfo_.index = sIndex;
4401 sOrder = getLatinOneContraction(0, sOrder, source);
4402 sIndex = m_ContInfo_.index;
4403 haveContractions = true; // if there are contractions, we cannot do French secondary
4404 // However, if there are contractions in the table, but we always use just one char,
4405 // we might be able to do French. This should be checked out.
4407 if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {
4408 //fprintf(stderr, "S");
4409 return compareRegular(source, target, startOffset);
4414 while(tOrder==0) { // this loop skips primary ignorables
4415 // tOrder=getNextlatinOneCE(target);
4423 tChar=target.charAt(tIndex++); //[tIndex++];
4424 if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
4425 //fprintf(stderr, "R");
4426 return compareRegular(source, target, startOffset);
4428 tOrder = latinOneCEs_[tChar];
4429 if(isSpecial(tOrder)) {
4430 // Handling specials, see the comments for source
4431 if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
4432 m_ContInfo_.index = tIndex;
4433 tOrder = getLatinOneContraction(0, tOrder, target);
4434 tIndex = m_ContInfo_.index;
4435 haveContractions = true;
4437 if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {
4438 //fprintf(stderr, "S");
4439 return compareRegular(source, target, startOffset);
4443 if(endOfSource) { // source is finished, but target is not, say the result.
4447 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
4448 sOrder = 0; tOrder = 0;
4451 // compare current top bytes
4452 if(((sOrder^tOrder)&0xFF000000)!=0) {
4453 // top bytes differ, return difference
4454 if(sOrder >>> 8 < tOrder >>> 8) {
4459 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
4460 // since we must return enum value
4463 // top bytes match, continue with following bytes
4469 // after primary loop, we definitely know the sizes of strings,
4470 // so we set it and use simpler loop for secondaries and tertiaries
4471 //sLen = sIndex; tLen = tIndex;
4472 if(strength >= SECONDARY) {
4473 // adjust the table beggining
4474 //latinOneCEs_ += coll->latinOneTableLen;
4475 endOfSource = false;
4477 if(!m_isFrenchCollation_) { // non French
4478 // This loop is a simplified copy of primary loop
4479 // at this point we know that whole strings are latin-1, so we don't
4480 // check for that. We also know that we only have contractions as
4482 //sIndex = 0; tIndex = 0;
4483 sIndex = startOffset; tIndex = startOffset;
4491 sChar=source.charAt(sIndex++); //[sIndex++];
4492 sOrder = latinOneCEs_[offset+sChar];
4493 if(isSpecial(sOrder)) {
4494 m_ContInfo_.index = sIndex;
4495 sOrder = getLatinOneContraction(1, sOrder, source);
4496 sIndex = m_ContInfo_.index;
4508 tChar=target.charAt(tIndex++); //[tIndex++];
4509 tOrder = latinOneCEs_[offset+tChar];
4510 if(isSpecial(tOrder)) {
4511 m_ContInfo_.index = tIndex;
4512 tOrder = getLatinOneContraction(1, tOrder, target);
4513 tIndex = m_ContInfo_.index;
4520 if(sOrder == tOrder) {
4521 sOrder = 0; tOrder = 0;
4524 // see primary loop for comments on this
4525 if(((sOrder^tOrder)&0xFF000000)!=0) {
4526 if(sOrder >>> 8 < tOrder >>> 8) {
4537 if(haveContractions) { // if we have contractions, we have to bail out
4538 // since we don't really know how to handle them here
4539 return compareRegular(source, target, startOffset);
4541 // For French, we go backwards
4542 sIndex = sLen; tIndex = tLen;
4546 if(sIndex==startOffset) {
4550 sChar=source.charAt(--sIndex); //[--sIndex];
4551 sOrder = latinOneCEs_[offset+sChar];
4552 // don't even look for contractions
4556 if(tIndex==startOffset) {
4563 tChar=target.charAt(--tIndex); //[--tIndex];
4564 tOrder = latinOneCEs_[offset+tChar];
4565 // don't even look for contractions
4571 if(sOrder == tOrder) {
4572 sOrder = 0; tOrder = 0;
4575 // see the primary loop for comments
4576 if(((sOrder^tOrder)&0xFF000000)!=0) {
4577 if(sOrder >>> 8 < tOrder >>> 8) {
4590 if(strength >= TERTIARY) {
4591 // tertiary loop is the same as secondary (except no French)
4592 offset += latinOneTableLen_;
4593 //sIndex = 0; tIndex = 0;
4594 sIndex = startOffset; tIndex = startOffset;
4595 endOfSource = false;
4602 sChar=source.charAt(sIndex++); //[sIndex++];
4603 sOrder = latinOneCEs_[offset+sChar];
4604 if(isSpecial(sOrder)) {
4605 m_ContInfo_.index = sIndex;
4606 sOrder = getLatinOneContraction(2, sOrder, source);
4607 sIndex = m_ContInfo_.index;
4613 return 0; // if both strings are at the end, they are equal
4618 tChar=target.charAt(tIndex++); //[tIndex++];
4619 tOrder = latinOneCEs_[offset+tChar];
4620 if(isSpecial(tOrder)) {
4621 m_ContInfo_.index = tIndex;
4622 tOrder = getLatinOneContraction(2, tOrder, target);
4623 tIndex = m_ContInfo_.index;
4629 if(sOrder == tOrder) {
4630 sOrder = 0; tOrder = 0;
4633 if(((sOrder^tOrder)&0xff000000)!=0) {
4634 if(sOrder >>> 8 < tOrder >>> 8) {
4648 * Get the version of this collator object.
4649 * @return the version object associated with this collator
4652 public VersionInfo getVersion() {
4653 /* RunTime version */
4654 int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
4655 /* Builder version*/
4656 int bdVersion = m_version_.getMajor();
4658 /* Charset Version. Need to get the version from cnv files
4659 * makeconv should populate cnv files with version and
4660 * an api has to be provided in ucnv.h to obtain this version
4664 /* combine the version info */
4665 int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF;
4667 /* Tailoring rules */
4668 return VersionInfo.getInstance(cmbVersion>>8,
4670 m_version_.getMinor(),
4671 UCA_.m_UCA_version_.getMajor());
4673 // versionInfo[0] = (uint8_t)(cmbVersion>>8);
4674 // versionInfo[1] = (uint8_t)cmbVersion;
4675 // versionInfo[2] = coll->image->version[1];
4676 // versionInfo[3] = coll->UCA->image->UCAVersion[0];
4680 * Get the UCA version of this collator object.
4681 * @return the version object associated with this collator
4684 public VersionInfo getUCAVersion() {
4685 return UCA_.m_UCA_version_;
4688 private transient boolean m_reallocLatinOneCEs_;