3 *******************************************************************************
\r
4 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
5 * others. All Rights Reserved. *
\r
6 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.io.IOException;
\r
11 import java.text.CharacterIterator;
\r
12 import java.text.ParseException;
\r
13 import java.util.Arrays;
\r
14 import java.util.MissingResourceException;
\r
16 //#if defined(FOUNDATION10) || defined(J2SE13) || defined(ECLIPSE_FRAGMENT)
\r
17 //##import com.ibm.icu.impl.ByteBuffer;
\r
19 import java.nio.ByteBuffer;
\r
22 import com.ibm.icu.impl.BOCU;
\r
23 import com.ibm.icu.impl.ICUDebug;
\r
24 import com.ibm.icu.impl.ICUResourceBundle;
\r
25 import com.ibm.icu.impl.ImplicitCEGenerator;
\r
26 import com.ibm.icu.impl.IntTrie;
\r
27 import com.ibm.icu.impl.StringUCharacterIterator;
\r
28 import com.ibm.icu.impl.Trie;
\r
29 import com.ibm.icu.impl.TrieIterator;
\r
30 import com.ibm.icu.impl.Utility;
\r
31 import com.ibm.icu.lang.UCharacter;
\r
32 import com.ibm.icu.util.RangeValueIterator;
\r
33 import com.ibm.icu.util.ULocale;
\r
34 import com.ibm.icu.util.UResourceBundle;
\r
35 import com.ibm.icu.util.VersionInfo;
\r
38 * <p>RuleBasedCollator is a concrete subclass of Collator. It allows
\r
39 * customization of the Collator via user-specified rule sets.
\r
40 * RuleBasedCollator is designed to be fully compliant to the <a
\r
41 * href="http://www.unicode.org/unicode/reports/tr10/">Unicode
\r
42 * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>
\r
44 * <p>Users are strongly encouraged to read <a
\r
45 * href="http://www.icu-project.org/userguide/Collate_Intro.html">
\r
46 * the users guide</a> for more information about the collation
\r
47 * service before using this class.</p>
\r
49 * <p>Create a RuleBasedCollator from a locale by calling the
\r
50 * getInstance(Locale) factory method in the base class Collator.
\r
51 * Collator.getInstance(Locale) creates a RuleBasedCollator object
\r
52 * based on the collation rules defined by the argument locale. If a
\r
53 * customized collation ordering or attributes is required, use the
\r
54 * RuleBasedCollator(String) constructor with the appropriate
\r
55 * rules. The customized RuleBasedCollator will base its ordering on
\r
56 * UCA, while re-adjusting the attributes and orders of the characters
\r
57 * in the specified rule accordingly.</p>
\r
59 * <p>RuleBasedCollator provides correct collation orders for most
\r
60 * locales supported in ICU. If specific data for a locale is not
\r
61 * available, the orders eventually fall back to the <a
\r
62 * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation
\r
65 * <p>For information about the collation rule syntax and details
\r
66 * about customization, please refer to the
\r
67 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
\r
68 * Collation customization</a> section of the user's guide.</p>
\r
70 * <p><strong>Note</strong> that there are some differences between
\r
71 * the Collation rule syntax used in Java and ICU4J:
\r
74 * <li>According to the JDK documentation:
\r
77 * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule
\r
78 * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a
\r
79 * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the
\r
80 * range \U0EC0-\U0EC4 precedes a Lao consonant of the range
\r
81 * \U0E81-\U0EAE then the
\r
82 * vowel is placed after the consonant for collation purposes.
\r
85 * If a rule is without the modifier '!', the Thai/Lao vowel-consonant
\r
86 * swapping is not turned on.
\r
90 * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao
\r
91 * vowel-consonant swapping, since the UCA clearly states that it has to be
\r
92 * supported to ensure a correct sorting order. If a '!' is encountered, it is
\r
95 * <li>As mentioned in the documentation of the base class Collator,
\r
96 * compatibility decomposition mode is not supported.
\r
99 * <strong>Examples</strong>
\r
102 * Creating Customized RuleBasedCollators:
\r
105 * String simple = "& a < b < c < d";
\r
106 * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
\r
108 * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
\r
109 * + "< f , F < g , G < h , H < i , I < j , "
\r
110 * + "J < k , K < l , L < m , M < n , N < "
\r
111 * + "o , O < p , P < q , Q < r , R < s , S < "
\r
112 * + "t , T < u , U < v , V < w , W < x , X "
\r
113 * + "< y , Y < z , Z < \u00E5 = a\u030A "
\r
114 * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
\r
115 * + ", \u00C6 < \u00F8 , \u00D8";
\r
116 * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
\r
120 * Concatenating rules to combine <code>Collator</code>s:
\r
123 * // Create an en_US Collator object
\r
124 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
\r
125 * Collator.getInstance(new Locale("en", "US", ""));
\r
126 * // Create a da_DK Collator object
\r
127 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
\r
128 * Collator.getInstance(new Locale("da", "DK", ""));
\r
129 * // Combine the two
\r
130 * // First, get the collation rules from en_USCollator
\r
131 * String en_USRules = en_USCollator.getRules();
\r
132 * // Second, get the collation rules from da_DKCollator
\r
133 * String da_DKRules = da_DKCollator.getRules();
\r
134 * RuleBasedCollator newCollator =
\r
135 * new RuleBasedCollator(en_USRules + da_DKRules);
\r
136 * // newCollator has the combined rules
\r
140 * Making changes to an existing RuleBasedCollator to create a new
\r
141 * <code>Collator</code> object, by appending changes to the existing rule:
\r
144 * // Create a new Collator object with additional rules
\r
145 * String addRules = "& C < ch, cH, Ch, CH";
\r
146 * RuleBasedCollator myCollator =
\r
147 * new RuleBasedCollator(en_USCollator.getRules() + addRules);
\r
148 * // myCollator contains the new rules
\r
152 * How to change the order of non-spacing accents:
\r
155 * // old rule with main accents
\r
156 * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
\r
157 * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
\r
158 * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
\r
159 * + "; \u030B ; \u030C ; \u030D ; \u030E "
\r
160 * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
\r
161 * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
\r
162 * + "< b , B < c, C < e, E & C < d , D";
\r
163 * // change the order of accent characters
\r
164 * String addOn = "& \u0300 ; \u0308 ; \u0302";
\r
165 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
\r
169 * Putting in a new primary ordering before the default setting,
\r
170 * e.g. sort English characters before or after Japanese characters in the Japanese
\r
171 * <code>Collator</code>:
\r
174 * // get en_US Collator rules
\r
175 * RuleBasedCollator en_USCollator
\r
176 * = (RuleBasedCollator)Collator.getInstance(Locale.US);
\r
177 * // add a few Japanese characters to sort before English characters
\r
178 * // suppose the last character before the first base letter 'a' in
\r
179 * // the English collation rule is \u2212
\r
180 * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
\r
182 * RuleBasedCollator myJapaneseCollator
\r
183 * = new RuleBasedCollator(en_USCollator.getRules() + jaString);
\r
188 * This class is not subclassable
\r
190 * @author Syn Wee Quek
\r
193 public final class RuleBasedCollator extends Collator
\r
195 // public constructors ---------------------------------------------------
\r
199 * Constructor that takes the argument rules for
\r
200 * customization. The collator will be based on UCA,
\r
201 * with the attributes and re-ordering of the characters specified in the
\r
204 * <p>See the user guide's section on
\r
205 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
\r
206 * Collation Customization</a> for details on the rule syntax.
\r
208 * @param rules the collation rules to build the collation table from.
\r
209 * @exception ParseException and IOException thrown. ParseException thrown
\r
210 * when argument rules have an invalid syntax. IOException
\r
211 * thrown when an error occurred while reading internal data.
\r
214 public RuleBasedCollator(String rules) throws Exception
\r
217 if (rules == null) {
\r
218 throw new IllegalArgumentException(
\r
219 "Collation rules can not be null");
\r
224 // public methods --------------------------------------------------------
\r
227 * Clones the RuleBasedCollator
\r
228 * @return a new instance of this RuleBasedCollator object
\r
231 public Object clone() throws CloneNotSupportedException
\r
233 RuleBasedCollator result = (RuleBasedCollator)super.clone();
\r
234 if (latinOneCEs_ != null) {
\r
235 result.m_reallocLatinOneCEs_ = true;
\r
236 result.m_ContInfo_ = new ContractionInfo();
\r
239 // since all collation data in the RuleBasedCollator do not change
\r
240 // we can safely assign the result.fields to this collator
\r
241 result.initUtility(false); // let the new clone have their own util
\r
247 * Return a CollationElementIterator for the given String.
\r
248 * @see CollationElementIterator
\r
251 public CollationElementIterator getCollationElementIterator(String source)
\r
253 return new CollationElementIterator(source, this);
\r
257 * Return a CollationElementIterator for the given CharacterIterator.
\r
258 * The source iterator's integrity will be preserved since a new copy
\r
259 * will be created for use.
\r
260 * @see CollationElementIterator
\r
263 public CollationElementIterator getCollationElementIterator(
\r
264 CharacterIterator source)
\r
266 CharacterIterator newsource = (CharacterIterator)source.clone();
\r
267 return new CollationElementIterator(newsource, this);
\r
271 * Return a CollationElementIterator for the given UCharacterIterator.
\r
272 * The source iterator's integrity will be preserved since a new copy
\r
273 * will be created for use.
\r
274 * @see CollationElementIterator
\r
277 public CollationElementIterator getCollationElementIterator(
\r
278 UCharacterIterator source)
\r
280 return new CollationElementIterator(source, this);
\r
283 // public setters --------------------------------------------------------
\r
286 * Sets the Hiragana Quaternary mode to be on or off.
\r
287 * When the Hiragana Quaternary mode is turned on, the collator
\r
288 * positions Hiragana characters before all non-ignorable characters in
\r
289 * QUATERNARY strength. This is to produce a correct JIS collation order,
\r
290 * distinguishing between Katakana and Hiragana characters.
\r
291 * @param flag true if Hiragana Quaternary mode is to be on, false
\r
293 * @see #setHiraganaQuaternaryDefault
\r
294 * @see #isHiraganaQuaternary
\r
297 public void setHiraganaQuaternary(boolean flag)
\r
299 m_isHiragana4_ = flag;
\r
300 updateInternalState();
\r
304 * Sets the Hiragana Quaternary mode to the initial mode set during
\r
305 * construction of the RuleBasedCollator.
\r
306 * See setHiraganaQuaternary(boolean) for more details.
\r
307 * @see #setHiraganaQuaternary(boolean)
\r
308 * @see #isHiraganaQuaternary
\r
311 public void setHiraganaQuaternaryDefault()
\r
313 m_isHiragana4_ = m_defaultIsHiragana4_;
\r
314 updateInternalState();
\r
318 * Sets whether uppercase characters sort before lowercase
\r
319 * characters or vice versa, in strength TERTIARY. The default
\r
320 * mode is false, and so lowercase characters sort before uppercase
\r
322 * If true, sort upper case characters first.
\r
323 * @param upperfirst true to sort uppercase characters before
\r
324 * lowercase characters, false to sort lowercase
\r
325 * characters before uppercase characters
\r
326 * @see #isLowerCaseFirst
\r
327 * @see #isUpperCaseFirst
\r
328 * @see #setLowerCaseFirst
\r
329 * @see #setCaseFirstDefault
\r
332 public void setUpperCaseFirst(boolean upperfirst)
\r
335 if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
\r
336 latinOneRegenTable_ = true;
\r
338 m_caseFirst_ = AttributeValue.UPPER_FIRST_;
\r
341 if(m_caseFirst_ != AttributeValue.OFF_) {
\r
342 latinOneRegenTable_ = true;
\r
344 m_caseFirst_ = AttributeValue.OFF_;
\r
346 updateInternalState();
\r
350 * Sets the orders of lower cased characters to sort before upper cased
\r
351 * characters, in strength TERTIARY. The default
\r
353 * If true is set, the RuleBasedCollator will sort lower cased characters
\r
354 * before the upper cased ones.
\r
355 * Otherwise, if false is set, the RuleBasedCollator will ignore case
\r
357 * @param lowerfirst true for sorting lower cased characters before
\r
358 * upper cased characters, false to ignore case
\r
360 * @see #isLowerCaseFirst
\r
361 * @see #isUpperCaseFirst
\r
362 * @see #setUpperCaseFirst
\r
363 * @see #setCaseFirstDefault
\r
366 public void setLowerCaseFirst(boolean lowerfirst)
\r
369 if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
\r
370 latinOneRegenTable_ = true;
\r
372 m_caseFirst_ = AttributeValue.LOWER_FIRST_;
\r
375 if(m_caseFirst_ != AttributeValue.OFF_) {
\r
376 latinOneRegenTable_ = true;
\r
378 m_caseFirst_ = AttributeValue.OFF_;
\r
380 updateInternalState();
\r
384 * Sets the case first mode to the initial mode set during
\r
385 * construction of the RuleBasedCollator.
\r
386 * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
\r
388 * @see #isLowerCaseFirst
\r
389 * @see #isUpperCaseFirst
\r
390 * @see #setLowerCaseFirst(boolean)
\r
391 * @see #setUpperCaseFirst(boolean)
\r
394 public final void setCaseFirstDefault()
\r
396 if(m_caseFirst_ != m_defaultCaseFirst_) {
\r
397 latinOneRegenTable_ = true;
\r
399 m_caseFirst_ = m_defaultCaseFirst_;
\r
400 updateInternalState();
\r
404 * Sets the alternate handling mode to the initial mode set during
\r
405 * construction of the RuleBasedCollator.
\r
406 * See setAlternateHandling(boolean) for more details.
\r
407 * @see #setAlternateHandlingShifted(boolean)
\r
408 * @see #isAlternateHandlingShifted()
\r
411 public void setAlternateHandlingDefault()
\r
413 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
\r
414 updateInternalState();
\r
418 * Sets the case level mode to the initial mode set during
\r
419 * construction of the RuleBasedCollator.
\r
420 * See setCaseLevel(boolean) for more details.
\r
421 * @see #setCaseLevel(boolean)
\r
422 * @see #isCaseLevel
\r
425 public void setCaseLevelDefault()
\r
427 m_isCaseLevel_ = m_defaultIsCaseLevel_;
\r
428 updateInternalState();
\r
432 * Sets the decomposition mode to the initial mode set during construction
\r
433 * of the RuleBasedCollator.
\r
434 * See setDecomposition(int) for more details.
\r
435 * @see #getDecomposition
\r
436 * @see #setDecomposition(int)
\r
439 public void setDecompositionDefault()
\r
441 setDecomposition(m_defaultDecomposition_);
\r
442 updateInternalState();
\r
446 * Sets the French collation mode to the initial mode set during
\r
447 * construction of the RuleBasedCollator.
\r
448 * See setFrenchCollation(boolean) for more details.
\r
449 * @see #isFrenchCollation
\r
450 * @see #setFrenchCollation(boolean)
\r
453 public void setFrenchCollationDefault()
\r
455 if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
\r
456 latinOneRegenTable_ = true;
\r
458 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
\r
459 updateInternalState();
\r
463 * Sets the collation strength to the initial mode set during the
\r
464 * construction of the RuleBasedCollator.
\r
465 * See setStrength(int) for more details.
\r
466 * @see #setStrength(int)
\r
467 * @see #getStrength
\r
470 public void setStrengthDefault()
\r
472 setStrength(m_defaultStrength_);
\r
473 updateInternalState();
\r
477 * Method to set numeric collation to its default value.
\r
478 * When numeric collation is turned on, this Collator generates a collation
\r
479 * key for the numeric value of substrings of digits. This is a way to get
\r
480 * '100' to sort AFTER '2'
\r
481 * @see #getNumericCollation
\r
482 * @see #setNumericCollation
\r
485 public void setNumericCollationDefault()
\r
487 setNumericCollation(m_defaultIsNumericCollation_);
\r
488 updateInternalState();
\r
492 * Sets the mode for the direction of SECONDARY weights to be used in
\r
493 * French collation.
\r
494 * The default value is false, which treats SECONDARY weights in the order
\r
496 * If set to true, the SECONDARY weights will be sorted backwards.
\r
497 * See the section on
\r
498 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
\r
499 * French collation</a> for more information.
\r
500 * @param flag true to set the French collation on, false to set it off
\r
502 * @see #isFrenchCollation
\r
503 * @see #setFrenchCollationDefault
\r
505 public void setFrenchCollation(boolean flag)
\r
507 if(m_isFrenchCollation_ != flag) {
\r
508 latinOneRegenTable_ = true;
\r
510 m_isFrenchCollation_ = flag;
\r
511 updateInternalState();
\r
515 * Sets the alternate handling for QUATERNARY strength to be either
\r
516 * shifted or non-ignorable.
\r
517 * See the UCA definition on
\r
518 * <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">
\r
519 * Alternate Weighting</a>.
\r
520 * This attribute will only be effective when QUATERNARY strength is set.
\r
521 * The default value for this mode is false, corresponding to the
\r
522 * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the
\r
523 * RuleBasedCollator will treat all the codepoints with non-ignorable
\r
524 * primary weights in the same way.
\r
525 * If the mode is set to true, the behaviour corresponds to SHIFTED defined
\r
526 * in UCA, this causes codepoints with PRIMARY orders that are equal or
\r
527 * below the variable top value to be ignored in PRIMARY order and
\r
528 * moved to the QUATERNARY order.
\r
529 * @param shifted true if SHIFTED behaviour for alternate handling is
\r
530 * desired, false for the NON_IGNORABLE behaviour.
\r
531 * @see #isAlternateHandlingShifted
\r
532 * @see #setAlternateHandlingDefault
\r
535 public void setAlternateHandlingShifted(boolean shifted)
\r
537 m_isAlternateHandlingShifted_ = shifted;
\r
538 updateInternalState();
\r
543 * When case level is set to true, an additional weight is formed
\r
544 * between the SECONDARY and TERTIARY weight, known as the case level.
\r
545 * The case level is used to distinguish large and small Japanese Kana
\r
546 * characters. Case level could also be used in other situations.
\r
547 * For example to distinguish certain Pinyin characters.
\r
548 * The default value is false, which means the case level is not generated.
\r
549 * The contents of the case level are affected by the case first
\r
550 * mode. A simple way to ignore accent differences in a string is to set
\r
551 * the strength to PRIMARY and enable case level.
\r
554 * See the section on
\r
555 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
\r
556 * case level</a> for more information.
\r
558 * @param flag true if case level sorting is required, false otherwise
\r
560 * @see #setCaseLevelDefault
\r
561 * @see #isCaseLevel
\r
563 public void setCaseLevel(boolean flag)
\r
565 m_isCaseLevel_ = flag;
\r
566 updateInternalState();
\r
571 * Sets this Collator's strength property. The strength property
\r
572 * determines the minimum level of difference considered significant
\r
573 * during comparison.
\r
575 * <p>See the Collator class description for an example of use.</p>
\r
576 * @param newStrength the new strength value.
\r
577 * @see #getStrength
\r
578 * @see #setStrengthDefault
\r
584 * @exception IllegalArgumentException If the new strength value is not one
\r
585 * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
\r
588 public void setStrength(int newStrength)
\r
590 super.setStrength(newStrength);
\r
591 updateInternalState();
\r
596 * Variable top is a two byte primary value which causes all the codepoints
\r
597 * with primary values that are less or equal than the variable top to be
\r
598 * shifted when alternate handling is set to SHIFTED.
\r
601 * Sets the variable top to a collation element value of a string supplied.
\r
603 * @param varTop one or more (if contraction) characters to which the
\r
604 * variable top should be set
\r
605 * @return a int value containing the value of the variable top in upper 16
\r
606 * bits. Lower 16 bits are undefined.
\r
607 * @exception IllegalArgumentException is thrown if varTop argument is not
\r
608 * a valid variable top element. A variable top element is
\r
611 * <li>it is a contraction that does not exist in the
\r
613 * <li>when the PRIMARY strength collation element for the
\r
614 * variable top has more than two bytes
\r
615 * <li>when the varTop argument is null or zero in length.
\r
617 * @see #getVariableTop
\r
618 * @see RuleBasedCollator#setAlternateHandlingShifted
\r
621 public int setVariableTop(String varTop)
\r
623 if (varTop == null || varTop.length() == 0) {
\r
624 throw new IllegalArgumentException(
\r
625 "Variable top argument string can not be null or zero in length.");
\r
627 if (m_srcUtilIter_ == null) {
\r
631 m_srcUtilColEIter_.setText(varTop);
\r
632 int ce = m_srcUtilColEIter_.next();
\r
634 // here we check if we have consumed all characters
\r
635 // you can put in either one character or a contraction
\r
636 // you shouldn't put more...
\r
637 if (m_srcUtilColEIter_.getOffset() != varTop.length()
\r
638 || ce == CollationElementIterator.NULLORDER) {
\r
639 throw new IllegalArgumentException(
\r
640 "Variable top argument string is a contraction that does not exist "
\r
641 + "in the Collation order");
\r
644 int nextCE = m_srcUtilColEIter_.next();
\r
646 if ((nextCE != CollationElementIterator.NULLORDER)
\r
647 && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
\r
648 throw new IllegalArgumentException(
\r
649 "Variable top argument string can only have a single collation "
\r
650 + "element that has less than or equal to two PRIMARY strength "
\r
654 m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
\r
656 return ce & CE_PRIMARY_MASK_;
\r
660 * Sets the variable top to a collation element value supplied.
\r
661 * Variable top is set to the upper 16 bits.
\r
662 * Lower 16 bits are ignored.
\r
663 * @param varTop Collation element value, as returned by setVariableTop or
\r
665 * @see #getVariableTop
\r
666 * @see #setVariableTop(String)
\r
669 public void setVariableTop(int varTop)
\r
671 m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
\r
675 * When numeric collation is turned on, this Collator generates a collation
\r
676 * key for the numeric value of substrings of digits. This is a way to get
\r
677 * '100' to sort AFTER '2'
\r
678 * @param flag true to turn numeric collation on and false to turn it off
\r
679 * @see #getNumericCollation
\r
680 * @see #setNumericCollationDefault
\r
683 public void setNumericCollation(boolean flag)
\r
685 // sort substrings of digits as numbers
\r
686 m_isNumericCollation_ = flag;
\r
687 updateInternalState();
\r
690 // public getters --------------------------------------------------------
\r
693 * Gets the collation rules for this RuleBasedCollator.
\r
694 * Equivalent to String getRules(RuleOption.FULL_RULES).
\r
695 * @return returns the collation rules
\r
696 * @see #getRules(boolean)
\r
699 public String getRules()
\r
705 * Returns current rules. The argument defines whether full rules
\r
706 * (UCA + tailored) rules are returned or just the tailoring.
\r
707 * @param fullrules true if the rules that defines the full set of
\r
708 * collation order is required, otherwise false for returning only
\r
709 * the tailored rules
\r
710 * @return the current rules that defines this Collator.
\r
714 public String getRules(boolean fullrules)
\r
719 // take the UCA rules and append real rules at the end
\r
720 return UCA_.m_rules_.concat(m_rules_);
\r
724 * Get an UnicodeSet that contains all the characters and sequences
\r
725 * tailored in this collator.
\r
726 * @return a pointer to a UnicodeSet object containing all the
\r
727 * code points and sequences that may sort differently than
\r
729 * @exception ParseException thrown when argument rules have an
\r
730 * invalid syntax. IOException
\r
733 public UnicodeSet getTailoredSet()
\r
736 CollationRuleParser src = new CollationRuleParser(getRules());
\r
737 return src.getTailoredSet();
\r
738 } catch(Exception e) {
\r
739 throw new IllegalStateException("A tailoring rule should not " +
\r
740 "have errors. Something is quite wrong!");
\r
744 private class contContext {
\r
745 RuleBasedCollator coll;
\r
746 UnicodeSet contractions;
\r
747 UnicodeSet expansions;
\r
748 UnicodeSet removedContractions;
\r
749 boolean addPrefixes;
\r
750 contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
\r
751 UnicodeSet removedContractions, boolean addPrefixes) {
\r
753 this.contractions = contractions;
\r
754 this.expansions = expansions;
\r
755 this.removedContractions = removedContractions;
\r
756 this.addPrefixes = addPrefixes;
\r
761 addSpecial(contContext c, StringBuffer buffer, int CE)
\r
763 StringBuffer b = new StringBuffer();
\r
764 int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
\r
765 int newCE = c.coll.m_contractionCE_[offset];
\r
766 // we might have a contraction that ends from previous level
\r
767 if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
\r
768 if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_
\r
769 && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_
\r
770 && c.addPrefixes) {
\r
771 addSpecial(c, buffer, newCE);
\r
773 if(buffer.length() > 1) {
\r
774 if(c.contractions != null) {
\r
775 c.contractions.add(buffer.toString());
\r
777 if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
778 c.expansions.add(buffer.toString());
\r
784 // check whether we're doing contraction or prefix
\r
785 if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
\r
786 while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
\r
787 b.delete(0, b.length());
\r
789 newCE = c.coll.m_contractionCE_[offset];
\r
790 b.insert(0, c.coll.m_contractionIndex_[offset]);
\r
791 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
\r
792 addSpecial(c, b, newCE);
\r
794 if(c.contractions != null) {
\r
795 c.contractions.add(b.toString());
\r
797 if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
798 c.expansions.add(b.toString());
\r
803 } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
\r
804 while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
\r
805 b.delete(0, b.length());
\r
807 newCE = c.coll.m_contractionCE_[offset];
\r
808 b.append(c.coll.m_contractionIndex_[offset]);
\r
809 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
\r
810 addSpecial(c, b, newCE);
\r
812 if(c.contractions != null) {
\r
813 c.contractions.add(b.toString());
\r
815 if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
816 c.expansions.add(b.toString());
\r
825 void processSpecials(contContext c)
\r
827 int internalBufferSize = 512;
\r
828 TrieIterator trieiterator
\r
829 = new TrieIterator(c.coll.m_trie_);
\r
830 RangeValueIterator.Element element = new RangeValueIterator.Element();
\r
831 while (trieiterator.next(element)) {
\r
832 int start = element.start;
\r
833 int limit = element.limit;
\r
834 int CE = element.value;
\r
835 StringBuffer contraction = new StringBuffer(internalBufferSize);
\r
837 if(isSpecial(CE)) {
\r
838 if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
\r
839 while(start < limit) {
\r
840 // if there are suppressed contractions, we don't
\r
841 // want to add them.
\r
842 if(c.removedContractions != null && c.removedContractions.contains(start)) {
\r
846 // we start our contraction from middle, since we don't know if it
\r
847 // will grow toward right or left
\r
848 contraction.append((char) start);
\r
849 addSpecial(c, contraction, CE);
\r
852 } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
853 while(start < limit) {
\r
854 c.expansions.add(start++);
\r
862 * Gets unicode sets containing contractions and/or expansions of a collator
\r
863 * @param contractions if not null, set to contain contractions
\r
864 * @param expansions if not null, set to contain expansions
\r
865 * @param addPrefixes add the prefix contextual elements to contractions
\r
866 * @throws Exception
\r
870 getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,
\r
871 boolean addPrefixes) throws Exception {
\r
872 if(contractions != null) {
\r
873 contractions.clear();
\r
875 if(expansions != null) {
\r
876 expansions.clear();
\r
878 String rules = getRules();
\r
880 CollationRuleParser src = new CollationRuleParser(rules);
\r
881 contContext c = new contContext(RuleBasedCollator.UCA_,
\r
882 contractions, expansions, src.m_removeSet_, addPrefixes);
\r
884 // Add the UCA contractions
\r
885 processSpecials(c);
\r
886 // This is collator specific. Add contractions from a collator
\r
888 c.removedContractions = null;
\r
889 processSpecials(c);
\r
890 } catch (Exception e) {
\r
897 * Get a Collation key for the argument String source from this
\r
898 * RuleBasedCollator.
\r
901 * General recommendation: <br>
\r
902 * If comparisons are to be done to the same String multiple times, it would
\r
903 * be more efficient to generate CollationKeys for the Strings and use
\r
904 * CollationKey.compareTo(CollationKey) for the comparisons.
\r
905 * If each String is compared only once, using the method
\r
906 * RuleBasedCollator.compare(String, String) will have a better performance.
\r
909 * See the class documentation for an explanation about CollationKeys.
\r
911 * @param source the text String to be transformed into a collation key.
\r
912 * @return the CollationKey for the given String based on this
\r
913 * RuleBasedCollator's collation rules. If the source String is
\r
914 * null, a null CollationKey is returned.
\r
915 * @see CollationKey
\r
916 * @see #compare(String, String)
\r
917 * @see #getRawCollationKey
\r
920 public CollationKey getCollationKey(String source) {
\r
921 if (source == null) {
\r
924 m_utilRawCollationKey_ = getRawCollationKey(source,
\r
925 m_utilRawCollationKey_);
\r
926 return new CollationKey(source, m_utilRawCollationKey_);
\r
930 * Gets the simpler form of a CollationKey for the String source following
\r
931 * the rules of this Collator and stores the result into the user provided
\r
933 * If key has an internal byte array of length that's too small for the
\r
934 * result, the internal byte array will be grown to the exact required
\r
936 * @param source the text String to be transformed into a RawCollationKey
\r
937 * @param key output RawCollationKey to store results
\r
938 * @return If key is null, a new instance of RawCollationKey will be
\r
939 * created and returned, otherwise the user provided key will be
\r
941 * @see #getCollationKey
\r
942 * @see #compare(String, String)
\r
943 * @see RawCollationKey
\r
946 public RawCollationKey getRawCollationKey(String source,
\r
947 RawCollationKey key)
\r
949 if (source == null) {
\r
952 int strength = getStrength();
\r
953 m_utilCompare0_ = m_isCaseLevel_;
\r
954 //m_utilCompare1_ = true;
\r
955 m_utilCompare2_ = strength >= SECONDARY;
\r
956 m_utilCompare3_ = strength >= TERTIARY;
\r
957 m_utilCompare4_ = strength >= QUATERNARY;
\r
958 m_utilCompare5_ = strength == IDENTICAL;
\r
960 m_utilBytesCount0_ = 0;
\r
961 m_utilBytesCount1_ = 0;
\r
962 m_utilBytesCount2_ = 0;
\r
963 m_utilBytesCount3_ = 0;
\r
964 m_utilBytesCount4_ = 0;
\r
965 //m_utilBytesCount5_ = 0;
\r
966 //m_utilCount0_ = 0;
\r
967 //m_utilCount1_ = 0;
\r
971 //m_utilCount5_ = 0;
\r
972 boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
\r
973 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
\r
974 // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
\r
976 int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
\r
977 byte hiragana4 = 0;
\r
978 if (m_isHiragana4_ && m_utilCompare4_) {
\r
979 // allocate one more space for hiragana, value for hiragana
\r
980 hiragana4 = (byte)commonBottom4;
\r
984 int bottomCount4 = 0xFF - commonBottom4;
\r
985 // If we need to normalize, we'll do it all at once at the beginning!
\r
986 if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)
\r
987 != Normalizer.YES) {
\r
988 // if it is identical strength, we have to normalize the string to
\r
989 // NFD so that it will be appended correctly to the end of the sort
\r
991 source = Normalizer.decompose(source, false);
\r
993 else if (getDecomposition() != NO_DECOMPOSITION
\r
994 && Normalizer.quickCheck(source, Normalizer.FCD,0)
\r
995 != Normalizer.YES) {
\r
996 // for the rest of the strength, if decomposition is on, FCD is
\r
997 // enough for us to work on.
\r
998 source = Normalizer.normalize(source,Normalizer.FCD);
\r
1000 getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
\r
1002 if (key == null) {
\r
1003 key = new RawCollationKey();
\r
1005 getSortKey(source, doFrench, commonBottom4, bottomCount4, key);
\r
1010 * Return true if an uppercase character is sorted before the corresponding lowercase character.
\r
1011 * See setCaseFirst(boolean) for details.
\r
1012 * @see #setUpperCaseFirst
\r
1013 * @see #setLowerCaseFirst
\r
1014 * @see #isLowerCaseFirst
\r
1015 * @see #setCaseFirstDefault
\r
1016 * @return true if upper cased characters are sorted before lower cased
\r
1017 * characters, false otherwise
\r
1020 public boolean isUpperCaseFirst()
\r
1022 return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
\r
1026 * Return true if a lowercase character is sorted before the corresponding uppercase character.
\r
1027 * See setCaseFirst(boolean) for details.
\r
1028 * @see #setUpperCaseFirst
\r
1029 * @see #setLowerCaseFirst
\r
1030 * @see #isUpperCaseFirst
\r
1031 * @see #setCaseFirstDefault
\r
1032 * @return true lower cased characters are sorted before upper cased
\r
1033 * characters, false otherwise
\r
1036 public boolean isLowerCaseFirst()
\r
1038 return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
\r
1042 * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
\r
1044 * If return value is true, then the alternate handling attribute for the
\r
1045 * Collator is SHIFTED. Otherwise if return value is false, then the
\r
1046 * alternate handling attribute for the Collator is NON_IGNORABLE
\r
1047 * See setAlternateHandlingShifted(boolean) for more details.
\r
1048 * @return true or false
\r
1049 * @see #setAlternateHandlingShifted(boolean)
\r
1050 * @see #setAlternateHandlingDefault
\r
1053 public boolean isAlternateHandlingShifted()
\r
1055 return m_isAlternateHandlingShifted_;
\r
1059 * Checks if case level is set to true.
\r
1060 * See setCaseLevel(boolean) for details.
\r
1061 * @return the case level mode
\r
1062 * @see #setCaseLevelDefault
\r
1063 * @see #isCaseLevel
\r
1064 * @see #setCaseLevel(boolean)
\r
1067 public boolean isCaseLevel()
\r
1069 return m_isCaseLevel_;
\r
1073 * Checks if French Collation is set to true.
\r
1074 * See setFrenchCollation(boolean) for details.
\r
1075 * @return true if French Collation is set to true, false otherwise
\r
1076 * @see #setFrenchCollation(boolean)
\r
1077 * @see #setFrenchCollationDefault
\r
1080 public boolean isFrenchCollation()
\r
1082 return m_isFrenchCollation_;
\r
1086 * Checks if the Hiragana Quaternary mode is set on.
\r
1087 * See setHiraganaQuaternary(boolean) for more details.
\r
1088 * @return flag true if Hiragana Quaternary mode is on, false otherwise
\r
1089 * @see #setHiraganaQuaternaryDefault
\r
1090 * @see #setHiraganaQuaternary(boolean)
\r
1093 public boolean isHiraganaQuaternary()
\r
1095 return m_isHiragana4_;
\r
1099 * Gets the variable top value of a Collator.
\r
1100 * Lower 16 bits are undefined and should be ignored.
\r
1101 * @return the variable top value of a Collator.
\r
1102 * @see #setVariableTop
\r
1105 public int getVariableTop()
\r
1107 return m_variableTopValue_ << 16;
\r
1111 * Method to retrieve the numeric collation value.
\r
1112 * When numeric collation is turned on, this Collator generates a collation
\r
1113 * key for the numeric value of substrings of digits. This is a way to get
\r
1114 * '100' to sort AFTER '2'
\r
1115 * @see #setNumericCollation
\r
1116 * @see #setNumericCollationDefault
\r
1117 * @return true if numeric collation is turned on, false otherwise
\r
1120 public boolean getNumericCollation()
\r
1122 return m_isNumericCollation_;
\r
1125 // public other methods -------------------------------------------------
\r
1128 * Compares the equality of two RuleBasedCollator objects.
\r
1129 * RuleBasedCollator objects are equal if they have the same collation
\r
1130 * rules and the same attributes.
\r
1131 * @param obj the RuleBasedCollator to be compared to.
\r
1132 * @return true if this RuleBasedCollator has exactly the same
\r
1133 * collation behaviour as obj, false otherwise.
\r
1136 public boolean equals(Object obj)
\r
1138 if (obj == null) {
\r
1139 return false; // super does class check
\r
1141 if (this == obj) {
\r
1144 if (getClass() != obj.getClass()) {
\r
1147 RuleBasedCollator other = (RuleBasedCollator)obj;
\r
1148 // all other non-transient information is also contained in rules.
\r
1149 if (getStrength() != other.getStrength()
\r
1150 || getDecomposition() != other.getDecomposition()
\r
1151 || other.m_caseFirst_ != m_caseFirst_
\r
1152 || other.m_caseSwitch_ != m_caseSwitch_
\r
1153 || other.m_isAlternateHandlingShifted_
\r
1154 != m_isAlternateHandlingShifted_
\r
1155 || other.m_isCaseLevel_ != m_isCaseLevel_
\r
1156 || other.m_isFrenchCollation_ != m_isFrenchCollation_
\r
1157 || other.m_isHiragana4_ != m_isHiragana4_) {
\r
1160 boolean rules = m_rules_ == other.m_rules_;
\r
1161 if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
\r
1162 rules = m_rules_.equals(other.m_rules_);
\r
1164 if (!rules || !ICUDebug.enabled("collation")) {
\r
1167 if (m_addition3_ != other.m_addition3_
\r
1168 || m_bottom3_ != other.m_bottom3_
\r
1169 || m_bottomCount3_ != other.m_bottomCount3_
\r
1170 || m_common3_ != other.m_common3_
\r
1171 || m_isSimple3_ != other.m_isSimple3_
\r
1172 || m_mask3_ != other.m_mask3_
\r
1173 || m_minContractionEnd_ != other.m_minContractionEnd_
\r
1174 || m_minUnsafe_ != other.m_minUnsafe_
\r
1175 || m_top3_ != other.m_top3_
\r
1176 || m_topCount3_ != other.m_topCount3_
\r
1177 || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
\r
1180 if (!m_trie_.equals(other.m_trie_)) {
\r
1181 // we should use the trie iterator here, but then this part is
\r
1182 // only used in the test.
\r
1183 for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)
\r
1185 int v = m_trie_.getCodePointValue(i);
\r
1186 int otherv = other.m_trie_.getCodePointValue(i);
\r
1187 if (v != otherv) {
\r
1188 int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
\r
1189 if (mask == (otherv & 0xff000000)) {
\r
1191 otherv &= 0xffffff;
\r
1192 if (mask == 0xf1000000) {
\r
1193 v -= (m_expansionOffset_ << 4);
\r
1194 otherv -= (other.m_expansionOffset_ << 4);
\r
1196 else if (mask == 0xf2000000) {
\r
1197 v -= m_contractionOffset_;
\r
1198 otherv -= other.m_contractionOffset_;
\r
1200 if (v == otherv) {
\r
1208 if (Arrays.equals(m_contractionCE_, other.m_contractionCE_)
\r
1209 && Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
\r
1210 && Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
\r
1211 && Arrays.equals(m_expansion_, other.m_expansion_)
\r
1212 && Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
\r
1213 // not comparing paddings
\r
1214 for (int i = 0; i < m_expansionEndCE_.length; i ++) {
\r
1215 if (m_expansionEndCEMaxSize_[i]
\r
1216 != other.m_expansionEndCEMaxSize_[i]) {
\r
1226 * Generates a unique hash code for this RuleBasedCollator.
\r
1227 * @return the unique hash code for this Collator
\r
1230 public int hashCode()
\r
1232 String rules = getRules();
\r
1233 if (rules == null) {
\r
1236 return rules.hashCode();
\r
1240 * Compares the source text String to the target text String according to
\r
1241 * the collation rules, strength and decomposition mode for this
\r
1242 * RuleBasedCollator.
\r
1243 * Returns an integer less than,
\r
1244 * equal to or greater than zero depending on whether the source String is
\r
1245 * less than, equal to or greater than the target String. See the Collator
\r
1246 * class description for an example of use.
\r
1249 * General recommendation: <br>
\r
1250 * If comparison are to be done to the same String multiple times, it would
\r
1251 * be more efficient to generate CollationKeys for the Strings and use
\r
1252 * CollationKey.compareTo(CollationKey) for the comparisons.
\r
1253 * If speed performance is critical and object instantiation is to be
\r
1254 * reduced, further optimization may be achieved by generating a simpler
\r
1255 * key of the form RawCollationKey and reusing this RawCollationKey
\r
1256 * object with the method RuleBasedCollator.getRawCollationKey. Internal
\r
1257 * byte representation can be directly accessed via RawCollationKey and
\r
1258 * stored for future use. Like CollationKey, RawCollationKey provides a
\r
1259 * method RawCollationKey.compareTo for key comparisons.
\r
1260 * If the each Strings are compared to only once, using the method
\r
1261 * RuleBasedCollator.compare(String, String) will have a better performance.
\r
1263 * @param source the source text String.
\r
1264 * @param target the target text String.
\r
1265 * @return Returns an integer value. Value is less than zero if source is
\r
1266 * less than target, value is zero if source and target are equal,
\r
1267 * value is greater than zero if source is greater than target.
\r
1268 * @see CollationKey
\r
1269 * @see #getCollationKey
\r
1272 public int compare(String source, String target)
\r
1274 if (source == target) {
\r
1278 // Find the length of any leading portion that is equal
\r
1279 int offset = getFirstUnmatchedOffset(source, target);
\r
1280 //return compareRegular(source, target, offset);
\r
1281 if(latinOneUse_) {
\r
1282 if ((offset < source.length()
\r
1283 && source.charAt(offset) > ENDOFLATINONERANGE_)
\r
1284 || (offset < target.length()
\r
1285 && target.charAt(offset) > ENDOFLATINONERANGE_)) {
\r
1286 // source or target start with non-latin-1
\r
1287 return compareRegular(source, target, offset);
\r
1289 return compareUseLatin1(source, target, offset);
\r
1292 return compareRegular(source, target, offset);
\r
1296 // package private inner interfaces --------------------------------------
\r
1299 * Attribute values to be used when setting the Collator options
\r
1301 static interface AttributeValue
\r
1304 * Indicates that the default attribute value will be used.
\r
1305 * See individual attribute for details on its default value.
\r
1307 static final int DEFAULT_ = -1;
\r
1309 * Primary collation strength
\r
1311 static final int PRIMARY_ = Collator.PRIMARY;
\r
1313 * Secondary collation strength
\r
1315 static final int SECONDARY_ = Collator.SECONDARY;
\r
1317 * Tertiary collation strength
\r
1319 static final int TERTIARY_ = Collator.TERTIARY;
\r
1321 * Default collation strength
\r
1323 static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
\r
1325 * Internal use for strength checks in Collation elements
\r
1327 static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
\r
1329 * Quaternary collation strength
\r
1331 static final int QUATERNARY_ = 3;
\r
1333 * Identical collation strength
\r
1335 static final int IDENTICAL_ = Collator.IDENTICAL;
\r
1337 * Internal use for strength checks
\r
1339 static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
\r
1341 * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
\r
1342 * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
\r
1344 static final int OFF_ = 16;
\r
1346 * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
\r
1347 * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
\r
1349 static final int ON_ = 17;
\r
1351 * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
\r
1353 static final int SHIFTED_ = 20;
\r
1355 * Valid for ALTERNATE_HANDLING. Alternate handling will be non
\r
1358 static final int NON_IGNORABLE_ = 21;
\r
1360 * Valid for CASE_FIRST - lower case sorts before upper case
\r
1362 static final int LOWER_FIRST_ = 24;
\r
1364 * Upper case sorts before lower case
\r
1366 static final int UPPER_FIRST_ = 25;
\r
1368 * Number of attribute values
\r
1370 static final int LIMIT_ = 29;
\r
1374 * Attributes that collation service understands. All the attributes can
\r
1375 * take DEFAULT value, as well as the values specific to each one.
\r
1377 static interface Attribute
\r
1380 * Attribute for direction of secondary weights - used in French.
\r
1381 * Acceptable values are ON, which results in secondary weights being
\r
1382 * considered backwards and OFF which treats secondary weights in the
\r
1383 * order they appear.
\r
1385 static final int FRENCH_COLLATION_ = 0;
\r
1387 * Attribute for handling variable elements. Acceptable values are
\r
1388 * NON_IGNORABLE (default) which treats all the codepoints with
\r
1389 * non-ignorable primary weights in the same way, and SHIFTED which
\r
1390 * causes codepoints with primary weights that are equal or below the
\r
1391 * variable top value to be ignored on primary level and moved to the
\r
1392 * quaternary level.
\r
1394 static final int ALTERNATE_HANDLING_ = 1;
\r
1396 * Controls the ordering of upper and lower case letters. Acceptable
\r
1397 * values are OFF (default), which orders upper and lower case letters
\r
1398 * in accordance to their tertiary weights, UPPER_FIRST which forces
\r
1399 * upper case letters to sort before lower case letters, and
\r
1400 * LOWER_FIRST which does the opposite.
\r
1402 static final int CASE_FIRST_ = 2;
\r
1404 * Controls whether an extra case level (positioned before the third
\r
1405 * level) is generated or not. Acceptable values are OFF (default),
\r
1406 * when case level is not generated, and ON which causes the case
\r
1407 * level to be generated. Contents of the case level are affected by
\r
1408 * the value of CASE_FIRST attribute. A simple way to ignore accent
\r
1409 * differences in a string is to set the strength to PRIMARY and
\r
1410 * enable case level.
\r
1412 static final int CASE_LEVEL_ = 3;
\r
1414 * Controls whether the normalization check and necessary
\r
1415 * normalizations are performed. When set to OFF (default) no
\r
1416 * normalization check is performed. The correctness of the result is
\r
1417 * guaranteed only if the input data is in so-called FCD form (see
\r
1418 * users manual for more info). When set to ON, an incremental check
\r
1419 * is performed to see whether the input data is in the FCD form. If
\r
1420 * the data is not in the FCD form, incremental NFD normalization is
\r
1423 static final int NORMALIZATION_MODE_ = 4;
\r
1425 * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
\r
1426 * QUATERNARY or IDENTICAL. The usual strength for most locales
\r
1427 * (except Japanese) is tertiary. Quaternary strength is useful when
\r
1428 * combined with shifted setting for alternate handling attribute and
\r
1429 * for JIS x 4061 collation, when it is used to distinguish between
\r
1430 * Katakana and Hiragana (this is achieved by setting the
\r
1431 * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is
\r
1432 * affected only by the number of non ignorable code points in the
\r
1433 * string. Identical strength is rarely useful, as it amounts to
\r
1434 * codepoints of the NFD form of the string.
\r
1436 static final int STRENGTH_ = 5;
\r
1438 * When turned on, this attribute positions Hiragana before all
\r
1439 * non-ignorables on quaternary level. This is a sneaky way to produce
\r
1442 static final int HIRAGANA_QUATERNARY_MODE_ = 6;
\r
1446 static final int LIMIT_ = 7;
\r
1450 * DataManipulate singleton
\r
1452 static class DataManipulate implements Trie.DataManipulate
\r
1454 // public methods ----------------------------------------------------
\r
1457 * Internal method called to parse a lead surrogate's ce for the offset
\r
1458 * to the next trail surrogate data.
\r
1459 * @param ce collation element of the lead surrogate
\r
1460 * @return data offset or 0 for the next trail surrogate
\r
1463 public final int getFoldingOffset(int ce)
\r
1465 if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
\r
1466 return (ce & 0xFFFFFF);
\r
1472 * Get singleton object
\r
1474 public static final DataManipulate getInstance()
\r
1476 if (m_instance_ == null) {
\r
1477 m_instance_ = new DataManipulate();
\r
1479 return m_instance_;
\r
1482 // private data member ----------------------------------------------
\r
1485 * Singleton instance
\r
1487 private static DataManipulate m_instance_;
\r
1489 // private constructor ----------------------------------------------
\r
1492 * private to prevent initialization
\r
1494 private DataManipulate()
\r
1502 static final class UCAConstants
\r
1504 int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
\r
1505 int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
\r
1506 int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
\r
1507 int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
\r
1508 int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
\r
1509 int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
\r
1510 int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
\r
1511 int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
\r
1512 int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
\r
1513 int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
\r
1514 int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
\r
1515 int FIRST_IMPLICIT_[] = new int[2];
\r
1516 int LAST_IMPLICIT_[] = new int[2];
\r
1517 int FIRST_TRAILING_[] = new int[2];
\r
1518 int LAST_TRAILING_[] = new int[2];
\r
1519 int PRIMARY_TOP_MIN_;
\r
1520 int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
\r
1521 int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
\r
1522 int PRIMARY_TRAILING_MIN_; // 0xE8000000
\r
1523 int PRIMARY_TRAILING_MAX_; // 0xF0000000
\r
1524 int PRIMARY_SPECIAL_MIN_; // 0xE8000000
\r
1525 int PRIMARY_SPECIAL_MAX_; // 0xF0000000
\r
1528 // package private data member -------------------------------------------
\r
1530 static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
\r
1531 static final byte BYTE_COMMON_ = (byte)0x05;
\r
1532 static final int COMMON_TOP_2_ = 0x86; // int for unsigness
\r
1533 static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
\r
1534 static final int COMMON_BOTTOM_3 = 0x05;
\r
1536 * Case strength mask
\r
1538 static final int CE_CASE_BIT_MASK_ = 0xC0;
\r
1539 static final int CE_TAG_SHIFT_ = 24;
\r
1540 static final int CE_TAG_MASK_ = 0x0F000000;
\r
1542 static final int CE_SPECIAL_FLAG_ = 0xF0000000;
\r
1544 * Lead surrogate that is tailored and doesn't start a contraction
\r
1546 static final int CE_SURROGATE_TAG_ = 5;
\r
1548 * Mask to get the primary strength of the collation element
\r
1550 static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
\r
1552 * Mask to get the secondary strength of the collation element
\r
1554 static final int CE_SECONDARY_MASK_ = 0xFF00;
\r
1556 * Mask to get the tertiary strength of the collation element
\r
1558 static final int CE_TERTIARY_MASK_ = 0xFF;
\r
1560 * Primary strength shift
\r
1562 static final int CE_PRIMARY_SHIFT_ = 16;
\r
1564 * Secondary strength shift
\r
1566 static final int CE_SECONDARY_SHIFT_ = 8;
\r
1568 * Continuation marker
\r
1570 static final int CE_CONTINUATION_MARKER_ = 0xC0;
\r
1573 * Size of collator raw data headers and options before the expansion
\r
1574 * data. This is used when expansion ces are to be retrieved. ICU4C uses
\r
1575 * the expansion offset starting from UCollator.UColHeader, hence ICU4J
\r
1576 * will have to minus that off to get the right expansion ce offset. In
\r
1579 int m_expansionOffset_;
\r
1581 * Size of collator raw data headers, options and expansions before
\r
1582 * contraction data. This is used when contraction ces are to be retrieved.
\r
1583 * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
\r
1584 * ICU4J will have to minus that off to get the right contraction ce
\r
1585 * offset. In number of chars.
\r
1587 int m_contractionOffset_;
\r
1589 * Flag indicator if Jamo is special
\r
1591 boolean m_isJamoSpecial_;
\r
1593 // Collator options ------------------------------------------------------
\r
1595 int m_defaultVariableTopValue_;
\r
1596 boolean m_defaultIsFrenchCollation_;
\r
1597 boolean m_defaultIsAlternateHandlingShifted_;
\r
1598 int m_defaultCaseFirst_;
\r
1599 boolean m_defaultIsCaseLevel_;
\r
1600 int m_defaultDecomposition_;
\r
1601 int m_defaultStrength_;
\r
1602 boolean m_defaultIsHiragana4_;
\r
1603 boolean m_defaultIsNumericCollation_;
\r
1606 * Value of the variable top
\r
1608 int m_variableTopValue_;
\r
1610 * Attribute for special Hiragana
\r
1612 boolean m_isHiragana4_;
\r
1614 * Case sorting customization
\r
1618 * Numeric collation option
\r
1620 boolean m_isNumericCollation_;
\r
1622 // end Collator options --------------------------------------------------
\r
/** Expansion table */
int m_expansion_[];
/** Contraction index table */
char m_contractionIndex_[];
/** Contraction CE table */
int m_contractionCE_[];
/**
 * Collation trie mapping code points to collation elements.
 * NOTE(review): declaration reconstructed — the original lines are
 * missing; type IntTrie inferred from the m_trie_.getCodePointValue(...)
 * calls in equals() and the IntTrie import. Confirm against upstream.
 */
IntTrie m_trie_;
/**
 * Table to store all collation elements that are the last element of an
 * expansion. This is for use in StringSearch.
 */
int m_expansionEndCE_[];
/**
 * Table to store the maximum size of any expansions that end with the
 * corresponding collation element in m_expansionEndCE_. For use in
 * StringSearch too
 */
byte m_expansionEndCEMaxSize_[];
/**
 * Heuristic table to store information on whether a char character is
 * considered "unsafe". "Unsafe" character are combining marks or those
 * belonging to some contraction sequence from the offset 1 onwards.
 * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
 * unsafe. If we have another contraction "ZA" with the one above, then
 * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
 * NOTE(review): declaration reconstructed — byte[] inferred from the
 * parallel m_contractionEnd_ declaration and the bit lookups in
 * isUnsafe(). Confirm against upstream.
 */
byte m_unsafe_[];
/**
 * Table to store information on whether a codepoint can occur as the last
 * character in a contraction
 */
byte m_contractionEnd_[];
/**
 * Original collation rules.
 * NOTE(review): declaration reconstructed — String inferred from the
 * assignment iUCA_.m_rules_ = (String)rb.getObject("UCARules") and
 * m_rules_.equals(...) in equals().
 */
String m_rules_;
/** The smallest "unsafe" codepoint */
char m_minUnsafe_;
/** The smallest codepoint that could be the end of a contraction */
char m_minContractionEnd_;
/** General version of the collator */
VersionInfo m_version_;
/** UCA version */
VersionInfo m_UCA_version_;
/** UCD version */
VersionInfo m_UCD_version_;
/** UnicodeData.txt property object */
static final RuleBasedCollator UCA_;
/** UCA constants read alongside the UCA data */
static final UCAConstants UCA_CONSTANTS_;
/** Table for UCA and builder use */
static final char UCA_CONTRACTIONS_[];
/** True once the static initializer has run (UCA_ may still be null). */
private static boolean UCA_INIT_COMPLETE;
/** Implicit generator */
static final ImplicitCEGenerator impCEGen_;

// * Implicit constants, superseded by impCEGen_:
// static final int IMPLICIT_BASE_BYTE_;
// static final int IMPLICIT_LIMIT_BYTE_;
// static final int IMPLICIT_4BYTE_BOUNDARY_;
// static final int LAST_MULTIPLIER_;
// static final int LAST2_MULTIPLIER_;
// static final int IMPLICIT_BASE_3BYTE_;
// static final int IMPLICIT_BASE_4BYTE_;
// static final int BYTES_TO_AVOID_ = 3;
// static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
// static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
// * Room for intervening, without expanding to 5 bytes
// static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
// static final int IMPLICIT_3BYTE_COUNT_ = 1;

static final byte SORT_LEVEL_TERMINATOR_ = 1;

// These are values from UCA required for
// implicit generation and supressing sort key compression
// they should regularly be in the UCA, but if one
// is running without UCA, it could be a problem
static final int maxRegularPrimary = 0xA0;
static final int minImplicitPrimary = 0xE0;
static final int maxImplicitPrimary = 0xE4;
1739 // block to initialise character property database
\r
1742 // take pains to let static class init succeed, otherwise the class itself won't exist and
\r
1743 // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
\r
1744 // we can't load the UCA data.
\r
1746 RuleBasedCollator iUCA_ = null;
\r
1747 UCAConstants iUCA_CONSTANTS_ = null;
\r
1748 char iUCA_CONTRACTIONS_[] = null;
\r
1749 ImplicitCEGenerator iimpCEGen_ = null;
\r
1752 // !!! note what's going on here...
\r
1753 // even though the static init of the class is not yet complete, we
\r
1754 // instantiate an instance of the class. So we'd better be sure that
\r
1755 // instantiation doesn't rely on the static initialization that's
\r
1756 // not complete yet!
\r
1757 iUCA_ = new RuleBasedCollator();
\r
1758 iUCA_CONSTANTS_ = new UCAConstants();
\r
1759 iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);
\r
1761 // called before doing canonical closure for the UCA.
\r
1762 iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
\r
1763 //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
\r
1765 ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
\r
1766 iUCA_.m_rules_ = (String)rb.getObject("UCARules");
\r
1768 catch (MissingResourceException ex)
\r
1772 catch (IOException e)
\r
1774 // e.printStackTrace();
\r
1775 // throw new MissingResourceException(e.getMessage(),"","");
\r
1779 UCA_CONSTANTS_ = iUCA_CONSTANTS_;
\r
1780 UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
\r
1781 impCEGen_ = iimpCEGen_;
\r
1783 UCA_INIT_COMPLETE = true;
\r
1787 private static void checkUCA() throws MissingResourceException {
\r
1788 if (UCA_INIT_COMPLETE && UCA_ == null) {
\r
1789 throw new MissingResourceException("Collator UCA data unavailable", "", "");
\r
1793 // package private constructors ------------------------------------------
\r
/**
 * <p>Private contructor for use by subclasses.
 * Public access to creating Collators is handled by the API
 * Collator.getInstance() or RuleBasedCollator(String rules).</p>
 * <p>This constructor constructs the UCA collator internally.</p>
 */
RuleBasedCollator()
{
    // NOTE(review): the checkUCA() call is reconstructed (two lines missing
    // in the extracted source) — it matches the checkUCA() contract above.
    checkUCA();
    initUtility(false);
}
1811 * Constructors a RuleBasedCollator from the argument locale.
\r
1812 * If no resource bundle is associated with the locale, UCA is used
\r
1816 RuleBasedCollator(ULocale locale)
\r
1819 ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
\r
1820 initUtility(false);
\r
1823 // Use keywords, if supplied for lookup
\r
1824 String collkey = locale.getKeywordValue("collation");
\r
1825 if(collkey == null) {
\r
1826 collkey = rb.getStringWithFallback("collations/default");
\r
1829 // collations/default will always give a string back
\r
1830 // keyword for the real collation data
\r
1831 // if "collations/collkey" will return null if collkey == null
\r
1832 ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
\r
1833 if (elements != null) {
\r
1834 // TODO: Determine actual & valid locale correctly
\r
1835 ULocale uloc = rb.getULocale();
\r
1836 setLocale(uloc, uloc);
\r
1838 m_rules_ = elements.getString("Sequence");
\r
1839 ByteBuffer buf = elements.get("%%CollationBin").getBinary();
\r
1842 // m_rules_ = (String)rules[1][1];
\r
1843 byte map[] = buf.array();
\r
1844 CollatorReader.initRBC(this, map);
\r
1846 BufferedInputStream input =
\r
1847 new BufferedInputStream(
\r
1848 new ByteArrayInputStream(map));
\r
1850 CollatorReader reader = new CollatorReader(input, false);
\r
1851 if (map.length > MIN_BINARY_DATA_SIZE_) {
\r
1852 reader.read(this, null);
\r
1855 reader.readHeader(this);
\r
1856 reader.readOptions(this);
\r
1857 // duplicating UCA_'s data
\r
1858 setWithUCATables();
\r
1861 // at this point, we have read in the collator
\r
1862 // now we need to check whether the binary image has
\r
1863 // the right UCA and other versions
\r
1864 if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||
\r
1865 !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
\r
1878 catch (Exception e) {
\r
1879 // e.printStackTrace();
\r
1880 // if failed use UCA.
\r
1886 // package private methods -----------------------------------------------
\r
1889 * Sets this collator to use the tables in UCA. Note options not taken
\r
1892 final void setWithUCATables()
\r
1894 m_contractionOffset_ = UCA_.m_contractionOffset_;
\r
1895 m_expansionOffset_ = UCA_.m_expansionOffset_;
\r
1896 m_expansion_ = UCA_.m_expansion_;
\r
1897 m_contractionIndex_ = UCA_.m_contractionIndex_;
\r
1898 m_contractionCE_ = UCA_.m_contractionCE_;
\r
1899 m_trie_ = UCA_.m_trie_;
\r
1900 m_expansionEndCE_ = UCA_.m_expansionEndCE_;
\r
1901 m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
\r
1902 m_unsafe_ = UCA_.m_unsafe_;
\r
1903 m_contractionEnd_ = UCA_.m_contractionEnd_;
\r
1904 m_minUnsafe_ = UCA_.m_minUnsafe_;
\r
1905 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
\r
1909 * Sets this collator to use the all options and tables in UCA.
\r
1911 final void setWithUCAData()
\r
1913 latinOneFailed_ = true;
\r
1915 m_addition3_ = UCA_.m_addition3_;
\r
1916 m_bottom3_ = UCA_.m_bottom3_;
\r
1917 m_bottomCount3_ = UCA_.m_bottomCount3_;
\r
1918 m_caseFirst_ = UCA_.m_caseFirst_;
\r
1919 m_caseSwitch_ = UCA_.m_caseSwitch_;
\r
1920 m_common3_ = UCA_.m_common3_;
\r
1921 m_contractionOffset_ = UCA_.m_contractionOffset_;
\r
1922 setDecomposition(UCA_.getDecomposition());
\r
1923 m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
\r
1924 m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
\r
1925 m_defaultIsAlternateHandlingShifted_
\r
1926 = UCA_.m_defaultIsAlternateHandlingShifted_;
\r
1927 m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
\r
1928 m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
\r
1929 m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
\r
1930 m_defaultStrength_ = UCA_.m_defaultStrength_;
\r
1931 m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
\r
1932 m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
\r
1933 m_expansionOffset_ = UCA_.m_expansionOffset_;
\r
1934 m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
\r
1935 m_isCaseLevel_ = UCA_.m_isCaseLevel_;
\r
1936 m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
\r
1937 m_isHiragana4_ = UCA_.m_isHiragana4_;
\r
1938 m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
\r
1939 m_isSimple3_ = UCA_.m_isSimple3_;
\r
1940 m_mask3_ = UCA_.m_mask3_;
\r
1941 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
\r
1942 m_minUnsafe_ = UCA_.m_minUnsafe_;
\r
1943 m_rules_ = UCA_.m_rules_;
\r
1944 setStrength(UCA_.getStrength());
\r
1945 m_top3_ = UCA_.m_top3_;
\r
1946 m_topCount3_ = UCA_.m_topCount3_;
\r
1947 m_variableTopValue_ = UCA_.m_variableTopValue_;
\r
1948 m_isNumericCollation_ = UCA_.m_isNumericCollation_;
\r
1949 setWithUCATables();
\r
1950 latinOneFailed_ = false;
\r
1954 * Test whether a char character is potentially "unsafe" for use as a
\r
1955 * collation starting point. "Unsafe" characters are combining marks or
\r
1956 * those belonging to some contraction sequence from the offset 1 onwards.
\r
1957 * E.g. if "ABC" is the only contraction, then 'B' and
\r
1958 * 'C' are considered unsafe. If we have another contraction "ZA" with
\r
1959 * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
\r
1960 * @param ch character to determin
\r
1961 * @return true if ch is unsafe, false otherwise
\r
1963 final boolean isUnsafe(char ch)
\r
1965 if (ch < m_minUnsafe_) {
\r
1969 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
\r
1970 if (UTF16.isLeadSurrogate(ch)
\r
1971 || UTF16.isTrailSurrogate(ch)) {
\r
1972 // Trail surrogate are always considered unsafe.
\r
1975 ch &= HEURISTIC_OVERFLOW_MASK_;
\r
1976 ch += HEURISTIC_OVERFLOW_OFFSET_;
\r
1978 int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
\r
1979 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
\r
1983 * Approximate determination if a char character is at a contraction end.
\r
1984 * Guaranteed to be true if a character is at the end of a contraction,
\r
1985 * otherwise it is not deterministic.
\r
1986 * @param ch character to be determined
\r
1988 final boolean isContractionEnd(char ch)
\r
1990 if (UTF16.isTrailSurrogate(ch)) {
\r
1994 if (ch < m_minContractionEnd_) {
\r
1998 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
\r
1999 ch &= HEURISTIC_OVERFLOW_MASK_;
\r
2000 ch += HEURISTIC_OVERFLOW_OFFSET_;
\r
2002 int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
\r
2003 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
\r
2007 * Retrieve the tag of a special ce
\r
2008 * @param ce ce to test
\r
2009 * @return tag of ce
\r
2011 static int getTag(int ce)
\r
2013 return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
\r
2017 * Checking if ce is special
\r
2018 * @param ce to check
\r
2019 * @return true if ce is special
\r
2021 static boolean isSpecial(int ce)
\r
2023 return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
\r
2027 * Checks if the argument ce is a continuation
\r
2028 * @param ce collation element to test
\r
2029 * @return true if ce is a continuation
\r
2031 static final boolean isContinuation(int ce)
\r
2033 return ce != CollationElementIterator.NULLORDER
\r
2034 && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
\r
2037 // private inner classes ------------------------------------------------
\r
2039 // private variables -----------------------------------------------------
\r
2042 * The smallest natural unsafe or contraction end char character before
\r
2044 * This is a combining mark.
\r
2046 private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
\r
2048 * Heuristic table table size. Size is 32 bytes, 1 bit for each
\r
2049 * latin 1 char, and some power of two for hashing the rest of the chars.
\r
2052 private static final char HEURISTIC_SIZE_ = 1056;
\r
2054 * Mask value down to "some power of two" - 1,
\r
2055 * number of bits, not num of bytes.
\r
2057 private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
\r
2059 * Unsafe character shift
\r
2061 private static final int HEURISTIC_SHIFT_ = 3;
\r
2063 * Unsafe character addition for character too large, it has to be folded
\r
2064 * then incremented.
\r
2066 private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
\r
2068 * Mask value to get offset in heuristic table.
\r
2070 private static final char HEURISTIC_MASK_ = 7;
\r
2072 private int m_caseSwitch_;
\r
2073 private int m_common3_;
\r
2074 private int m_mask3_;
\r
2076 * When switching case, we need to add or subtract different values.
\r
2078 private int m_addition3_;
\r
2080 * Upper range when compressing
\r
2082 private int m_top3_;
\r
2084 * Upper range when compressing
\r
2086 private int m_bottom3_;
\r
2087 private int m_topCount3_;
\r
2088 private int m_bottomCount3_;
\r
2090 * Case first constants
\r
2092 private static final int CASE_SWITCH_ = 0xC0;
\r
2093 private static final int NO_CASE_SWITCH_ = 0;
\r
2095 * Case level constants
\r
2097 private static final int CE_REMOVE_CASE_ = 0x3F;
\r
2098 private static final int CE_KEEP_CASE_ = 0xFF;
\r
2100 * Case strength mask
\r
2102 private static final int CE_CASE_MASK_3_ = 0xFF;
\r
2104 * Sortkey size factor. Values can be changed.
\r
2106 private static final double PROPORTION_2_ = 0.5;
\r
2107 private static final double PROPORTION_3_ = 0.667;
\r
2109 // These values come from the UCA ----------------------------------------
\r
2112 * This is an enum that lists magic special byte values from the
\r
2115 //private static final byte BYTE_ZERO_ = 0x0;
\r
2116 //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
\r
2117 //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
\r
2118 private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
\r
2119 /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
\r
2120 //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
\r
2121 static final byte CODAN_PLACEHOLDER = 0x27;
\r
2122 //private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C;
\r
2123 private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D;
\r
2124 private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
\r
2125 private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
\r
2126 private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
\r
2127 private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
\r
2128 private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
\r
2129 private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
\r
2130 private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
\r
2131 private static final int COMMON_BOTTOM_3_ = 0x05;
\r
2132 private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
\r
2133 private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
\r
2135 private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
\r
2136 private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
\r
2137 private static final int COMMON_2_ = COMMON_BOTTOM_2_;
\r
2138 private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
\r
2139 private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
\r
2140 //private static final int COMMON_4_ = (byte)0xFF;
\r
2145 * Minimum size required for the binary collation data in bytes.
\r
2146 * Size of UCA header + size of options to 4 bytes
\r
2148 //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
\r
2151 * If this collator is to generate only simple tertiaries for fast path
\r
2153 private boolean m_isSimple3_;
\r
2156 * French collation sorting flag
\r
2158 private boolean m_isFrenchCollation_;
\r
2160 * Flag indicating if shifted is requested for Quaternary alternate
\r
2161 * handling. If this is not true, the default for alternate handling will
\r
2162 * be non-ignorable.
\r
2164 private boolean m_isAlternateHandlingShifted_;
\r
2166 * Extra case level for sorting
\r
2168 private boolean m_isCaseLevel_;
\r
2170 private static final int SORT_BUFFER_INIT_SIZE_ = 128;
\r
2171 private static final int SORT_BUFFER_INIT_SIZE_1_ =
\r
2172 SORT_BUFFER_INIT_SIZE_ << 3;
\r
2173 private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
\r
2174 private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
\r
2175 private static final int SORT_BUFFER_INIT_SIZE_CASE_ =
\r
2176 SORT_BUFFER_INIT_SIZE_ >> 2;
\r
2177 private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
\r
2179 private static final int CE_CONTINUATION_TAG_ = 0xC0;
\r
2180 private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
\r
2182 private static final int LAST_BYTE_MASK_ = 0xFF;
\r
2184 //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
\r
2185 //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
\r
2187 private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;
\r
2188 private static final byte SORT_CASE_SHIFT_START_ = (byte)7;
\r
2193 private static final int CE_BUFFER_SIZE_ = 512;
\r
2195 // variables for Latin-1 processing
\r
2196 boolean latinOneUse_ = false;
\r
2197 boolean latinOneRegenTable_ = false;
\r
2198 boolean latinOneFailed_ = false;
\r
2200 int latinOneTableLen_ = 0;
\r
2201 int latinOneCEs_[] = null;
\r
2203 * Bunch of utility iterators
\r
2205 private StringUCharacterIterator m_srcUtilIter_;
\r
2206 private CollationElementIterator m_srcUtilColEIter_;
\r
2207 private StringUCharacterIterator m_tgtUtilIter_;
\r
2208 private CollationElementIterator m_tgtUtilColEIter_;
\r
2210 * Utility comparison flags
\r
2212 private boolean m_utilCompare0_;
\r
2213 //private boolean m_utilCompare1_;
\r
2214 private boolean m_utilCompare2_;
\r
2215 private boolean m_utilCompare3_;
\r
2216 private boolean m_utilCompare4_;
\r
2217 private boolean m_utilCompare5_;
\r
2219 * Utility byte buffer
\r
2221 private byte m_utilBytes0_[];
\r
2222 private byte m_utilBytes1_[];
\r
2223 private byte m_utilBytes2_[];
\r
2224 private byte m_utilBytes3_[];
\r
2225 private byte m_utilBytes4_[];
\r
2226 //private byte m_utilBytes5_[];
\r
2227 private RawCollationKey m_utilRawCollationKey_;
\r
2229 private int m_utilBytesCount0_;
\r
2230 private int m_utilBytesCount1_;
\r
2231 private int m_utilBytesCount2_;
\r
2232 private int m_utilBytesCount3_;
\r
2233 private int m_utilBytesCount4_;
\r
2234 //private int m_utilBytesCount5_;
\r
2235 //private int m_utilCount0_;
\r
2236 //private int m_utilCount1_;
\r
2237 private int m_utilCount2_;
\r
2238 private int m_utilCount3_;
\r
2239 private int m_utilCount4_;
\r
2240 //private int m_utilCount5_;
\r
2242 private int m_utilFrenchStart_;
\r
2243 private int m_utilFrenchEnd_;
\r
2246 * Preparing the CE buffers. will be filled during the primary phase
\r
2248 private int m_srcUtilCEBuffer_[];
\r
2249 private int m_tgtUtilCEBuffer_[];
\r
2250 private int m_srcUtilCEBufferSize_;
\r
2251 private int m_tgtUtilCEBufferSize_;
\r
2253 private int m_srcUtilContOffset_;
\r
2254 private int m_tgtUtilContOffset_;
\r
2256 private int m_srcUtilOffset_;
\r
2257 private int m_tgtUtilOffset_;
\r
2259 // private methods -------------------------------------------------------
\r
2261 private void init(String rules) throws Exception
\r
2264 CollationParsedRuleBuilder builder
\r
2265 = new CollationParsedRuleBuilder(rules);
\r
2266 builder.setRules(this);
\r
2269 initUtility(false);
\r
2272 private final int compareRegular(String source, String target, int offset) {
\r
2273 if (m_srcUtilIter_ == null) {
\r
2274 initUtility(true);
\r
2276 int strength = getStrength();
\r
2277 // setting up the collator parameters
\r
2278 m_utilCompare0_ = m_isCaseLevel_;
\r
2279 //m_utilCompare1_ = true;
\r
2280 m_utilCompare2_ = strength >= SECONDARY;
\r
2281 m_utilCompare3_ = strength >= TERTIARY;
\r
2282 m_utilCompare4_ = strength >= QUATERNARY;
\r
2283 m_utilCompare5_ = strength == IDENTICAL;
\r
2284 boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
\r
2285 boolean doShift4 = m_isAlternateHandlingShifted_ && m_utilCompare4_;
\r
2286 boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;
\r
2288 if (doHiragana4 && doShift4) {
\r
2289 String sourcesub = source.substring(offset);
\r
2290 String targetsub = target.substring(offset);
\r
2291 return compareBySortKeys(sourcesub, targetsub);
\r
2294 // This is the lowest primary value that will not be ignored if shifted
\r
2295 int lowestpvalue = m_isAlternateHandlingShifted_
\r
2296 ? m_variableTopValue_ << 16 : 0;
\r
2297 m_srcUtilCEBufferSize_ = 0;
\r
2298 m_tgtUtilCEBufferSize_ = 0;
\r
2299 int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
\r
2301 if (m_srcUtilCEBufferSize_ == -1
\r
2302 && m_tgtUtilCEBufferSize_ == -1) {
\r
2303 // since the cebuffer is cleared when we have determined that
\r
2304 // either source is greater than target or vice versa, the return
\r
2305 // result is the comparison result and not the hiragana result
\r
2309 int hiraganaresult = result;
\r
2311 if (m_utilCompare2_) {
\r
2312 result = doSecondaryCompare(doFrench);
\r
2313 if (result != 0) {
\r
2317 // doing the case bit
\r
2318 if (m_utilCompare0_) {
\r
2319 result = doCaseCompare();
\r
2320 if (result != 0) {
\r
2325 if (m_utilCompare3_) {
\r
2326 result = doTertiaryCompare();
\r
2327 if (result != 0) {
\r
2332 if (doShift4) { // checkQuad
\r
2333 result = doQuaternaryCompare(lowestpvalue);
\r
2334 if (result != 0) {
\r
2338 else if (doHiragana4 && hiraganaresult != 0) {
\r
2339 // If we're fine on quaternaries, we might be different
\r
2340 // on Hiragana. This, however, might fail us in shifted.
\r
2341 return hiraganaresult;
\r
2344 // For IDENTICAL comparisons, we use a bitwise character comparison
\r
2345 // as a tiebreaker if all else is equal.
\r
2346 // Getting here should be quite rare - strings are not identical -
\r
2347 // that is checked first, but compared == through all other checks.
\r
2348 if (m_utilCompare5_) {
\r
2349 return doIdenticalCompare(source, target, offset, true);
\r
2355 * Gets the 2 bytes of primary order and adds it to the primary byte array
\r
2356 * @param ce current ce
\r
2357 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2358 * a continuation ce
\r
2359 * @param doShift flag indicating if ce is to be shifted
\r
2360 * @param leadPrimary lead primary used for compression
\r
2361 * @param commonBottom4 common byte value for Quaternary
\r
2362 * @param bottomCount4 smallest byte value for Quaternary
\r
2363 * @return the new lead primary for compression
\r
2365 private final int doPrimaryBytes(int ce, boolean notIsContinuation,
\r
2366 boolean doShift, int leadPrimary,
\r
2367 int commonBottom4, int bottomCount4)
\r
2370 int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
\r
2371 int p1 = ce >>> 8; // comparison
\r
2373 if (m_utilCount4_ > 0) {
\r
2374 while (m_utilCount4_ > bottomCount4) {
\r
2375 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2376 (byte)(commonBottom4 + bottomCount4));
\r
2377 m_utilBytesCount4_ ++;
\r
2378 m_utilCount4_ -= bottomCount4;
\r
2380 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2381 (byte)(commonBottom4
\r
2382 + (m_utilCount4_ - 1)));
\r
2383 m_utilBytesCount4_ ++;
\r
2384 m_utilCount4_ = 0;
\r
2386 // dealing with a variable and we're treating them as shifted
\r
2387 // This is a shifted ignorable
\r
2389 // we need to check this since we could be in continuation
\r
2390 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2392 m_utilBytesCount4_ ++;
\r
2395 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2397 m_utilBytesCount4_ ++;
\r
2401 // Note: This code assumes that the table is well built
\r
2402 // i.e. not having 0 bytes where they are not supposed to be.
\r
2403 // Usually, we'll have non-zero primary1 & primary2, except
\r
2404 // in cases of LatinOne and friends, when primary2 will be
\r
2405 // regular and simple sortkey calc
\r
2406 if (p1 != CollationElementIterator.IGNORABLE) {
\r
2407 if (notIsContinuation) {
\r
2408 if (leadPrimary == p1) {
\r
2409 m_utilBytes1_ = append(m_utilBytes1_,
\r
2410 m_utilBytesCount1_, (byte)p2);
\r
2411 m_utilBytesCount1_ ++;
\r
2414 if (leadPrimary != 0) {
\r
2415 m_utilBytes1_ = append(m_utilBytes1_,
\r
2416 m_utilBytesCount1_,
\r
2417 ((p1 > leadPrimary)
\r
2418 ? BYTE_UNSHIFTED_MAX_
\r
2419 : BYTE_UNSHIFTED_MIN_));
\r
2420 m_utilBytesCount1_ ++;
\r
2422 if (p2 == CollationElementIterator.IGNORABLE) {
\r
2423 // one byter, not compressed
\r
2424 m_utilBytes1_ = append(m_utilBytes1_,
\r
2425 m_utilBytesCount1_,
\r
2427 m_utilBytesCount1_ ++;
\r
2430 else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_
\r
2431 || (p1 > maxRegularPrimary
\r
2432 //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]
\r
2434 && p1 < minImplicitPrimary
\r
2435 //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]
\r
2438 // not compressible
\r
2440 m_utilBytes1_ = append(m_utilBytes1_,
\r
2441 m_utilBytesCount1_,
\r
2443 m_utilBytesCount1_ ++;
\r
2444 m_utilBytes1_ = append(m_utilBytes1_,
\r
2445 m_utilBytesCount1_,
\r
2447 m_utilBytesCount1_ ++;
\r
2449 else { // compress
\r
2451 m_utilBytes1_ = append(m_utilBytes1_,
\r
2452 m_utilBytesCount1_,
\r
2454 m_utilBytesCount1_ ++;
\r
2455 m_utilBytes1_ = append(m_utilBytes1_,
\r
2456 m_utilBytesCount1_, (byte)p2);
\r
2457 m_utilBytesCount1_ ++;
\r
2462 // continuation, add primary to the key, no compression
\r
2463 m_utilBytes1_ = append(m_utilBytes1_,
\r
2464 m_utilBytesCount1_, (byte)p1);
\r
2465 m_utilBytesCount1_ ++;
\r
2466 if (p2 != CollationElementIterator.IGNORABLE) {
\r
2467 m_utilBytes1_ = append(m_utilBytes1_,
\r
2468 m_utilBytesCount1_, (byte)p2);
\r
2470 m_utilBytesCount1_ ++;
\r
2475 return leadPrimary;
\r
2479 * Gets the secondary byte and adds it to the secondary byte array
\r
2480 * @param ce current ce
\r
2481 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2482 * a continuation ce
\r
2483 * @param doFrench flag indicator if french sort is to be performed
\r
2485 private final void doSecondaryBytes(int ce, boolean notIsContinuation,
\r
2488 int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
\r
2491 // This is compression code.
\r
2492 if (s == COMMON_2_ && notIsContinuation) {
\r
2496 if (m_utilCount2_ > 0) {
\r
2497 if (s > COMMON_2_) { // not necessary for 4th level.
\r
2498 while (m_utilCount2_ > TOP_COUNT_2_) {
\r
2499 m_utilBytes2_ = append(m_utilBytes2_,
\r
2500 m_utilBytesCount2_,
\r
2501 (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
\r
2502 m_utilBytesCount2_ ++;
\r
2503 m_utilCount2_ -= TOP_COUNT_2_;
\r
2505 m_utilBytes2_ = append(m_utilBytes2_,
\r
2506 m_utilBytesCount2_,
\r
2507 (byte)(COMMON_TOP_2_
\r
2508 - (m_utilCount2_ - 1)));
\r
2509 m_utilBytesCount2_ ++;
\r
2512 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2513 m_utilBytes2_ = append(m_utilBytes2_,
\r
2514 m_utilBytesCount2_,
\r
2515 (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
\r
2516 m_utilBytesCount2_ ++;
\r
2517 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2519 m_utilBytes2_ = append(m_utilBytes2_,
\r
2520 m_utilBytesCount2_,
\r
2521 (byte)(COMMON_BOTTOM_2_
\r
2522 + (m_utilCount2_ - 1)));
\r
2523 m_utilBytesCount2_ ++;
\r
2525 m_utilCount2_ = 0;
\r
2527 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2529 m_utilBytesCount2_ ++;
\r
2533 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2535 m_utilBytesCount2_ ++;
\r
2536 // Do the special handling for French secondaries
\r
2537 // We need to get continuation elements and do intermediate
\r
2539 // abc1c2c3de with french secondaries need to be edc1c2c3ba
\r
2541 if (notIsContinuation) {
\r
2542 if (m_utilFrenchStart_ != -1) {
\r
2543 // reverse secondaries from frenchStartPtr up to
\r
2545 reverseBuffer(m_utilBytes2_);
\r
2546 m_utilFrenchStart_ = -1;
\r
2550 if (m_utilFrenchStart_ == -1) {
\r
2551 m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
\r
2553 m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
\r
2560 * Reverse the argument buffer
\r
2561 * @param buffer to reverse
\r
2563 private void reverseBuffer(byte buffer[])
\r
2565 int start = m_utilFrenchStart_;
\r
2566 int end = m_utilFrenchEnd_;
\r
2567 while (start < end) {
\r
2568 byte b = buffer[start];
\r
2569 buffer[start ++] = buffer[end];
\r
2570 buffer[end --] = b;
\r
2575 * Insert the case shifting byte if required
\r
2576 * @param caseshift value
\r
2577 * @return new caseshift value
\r
2579 private final int doCaseShift(int caseshift)
\r
2581 if (caseshift == 0) {
\r
2582 m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
\r
2583 SORT_CASE_BYTE_START_);
\r
2584 m_utilBytesCount0_ ++;
\r
2585 caseshift = SORT_CASE_SHIFT_START_;
\r
2591 * Performs the casing sort
\r
2592 * @param tertiary byte in ints for easy comparison
\r
2593 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2594 * a continuation ce
\r
2595 * @param caseshift
\r
2596 * @return the new value of case shift
\r
2598 private final int doCaseBytes(int tertiary, boolean notIsContinuation,
\r
2601 caseshift = doCaseShift(caseshift);
\r
2603 if (notIsContinuation && tertiary != 0) {
\r
2604 byte casebits = (byte)(tertiary & 0xC0);
\r
2605 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
\r
2606 if (casebits == 0) {
\r
2607 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2608 |= (1 << (-- caseshift));
\r
2612 caseshift = doCaseShift(caseshift - 1);
\r
2613 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2614 |= ((casebits >> 6) & 1) << (-- caseshift);
\r
2618 if (casebits != 0) {
\r
2619 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2620 |= 1 << (-- caseshift);
\r
2622 caseshift = doCaseShift(caseshift);
\r
2623 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2624 |= ((casebits >> 7) & 1) << (-- caseshift);
\r
2636 * Gets the tertiary byte and adds it to the tertiary byte array
\r
2637 * @param tertiary byte in int for easy comparison
\r
2638 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2639 * a continuation ce
\r
2641 private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)
\r
2643 if (tertiary != 0) {
\r
2644 // This is compression code.
\r
2645 // sequence size check is included in the if clause
\r
2646 if (tertiary == m_common3_ && notIsContinuation) {
\r
2650 int common3 = m_common3_ & LAST_BYTE_MASK_;
\r
2651 if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
\r
2652 tertiary += m_addition3_;
\r
2654 else if (tertiary <= common3
\r
2655 && m_common3_ == COMMON_UPPER_FIRST_3_) {
\r
2656 tertiary -= m_addition3_;
\r
2658 if (m_utilCount3_ > 0) {
\r
2659 if (tertiary > common3) {
\r
2660 while (m_utilCount3_ > m_topCount3_) {
\r
2661 m_utilBytes3_ = append(m_utilBytes3_,
\r
2662 m_utilBytesCount3_,
\r
2663 (byte)(m_top3_ - m_topCount3_));
\r
2664 m_utilBytesCount3_ ++;
\r
2665 m_utilCount3_ -= m_topCount3_;
\r
2667 m_utilBytes3_ = append(m_utilBytes3_,
\r
2668 m_utilBytesCount3_,
\r
2670 - (m_utilCount3_ - 1)));
\r
2671 m_utilBytesCount3_ ++;
\r
2674 while (m_utilCount3_ > m_bottomCount3_) {
\r
2675 m_utilBytes3_ = append(m_utilBytes3_,
\r
2676 m_utilBytesCount3_,
\r
2677 (byte)(m_bottom3_ + m_bottomCount3_));
\r
2678 m_utilBytesCount3_ ++;
\r
2679 m_utilCount3_ -= m_bottomCount3_;
\r
2681 m_utilBytes3_ = append(m_utilBytes3_,
\r
2682 m_utilBytesCount3_,
\r
2684 + (m_utilCount3_ - 1)));
\r
2685 m_utilBytesCount3_ ++;
\r
2687 m_utilCount3_ = 0;
\r
2689 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
2691 m_utilBytesCount3_ ++;
\r
2697 * Gets the Quaternary byte and adds it to the Quaternary byte array
\r
2698 * @param isCodePointHiragana flag indicator if the previous codepoint
\r
2699 * we dealt with was Hiragana
\r
2700 * @param commonBottom4 smallest common Quaternary byte
\r
2701 * @param bottomCount4 smallest Quaternary byte
\r
2702 * @param hiragana4 hiragana Quaternary byte
\r
2704 private final void doQuaternaryBytes(boolean isCodePointHiragana,
\r
2705 int commonBottom4, int bottomCount4,
\r
2708 if (isCodePointHiragana) { // This was Hiragana, need to note it
\r
2709 if (m_utilCount4_ > 0) { // Close this part
\r
2710 while (m_utilCount4_ > bottomCount4) {
\r
2711 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2712 (byte)(commonBottom4
\r
2714 m_utilBytesCount4_ ++;
\r
2715 m_utilCount4_ -= bottomCount4;
\r
2717 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2718 (byte)(commonBottom4
\r
2719 + (m_utilCount4_ - 1)));
\r
2720 m_utilBytesCount4_ ++;
\r
2721 m_utilCount4_ = 0;
\r
2723 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2724 hiragana4); // Add the Hiragana
\r
2725 m_utilBytesCount4_ ++;
\r
2727 else { // This wasn't Hiragana, so we can continue adding stuff
\r
2733 * Iterates through the argument string for all ces.
\r
2734 * Split the ces into their relevant primaries, secondaries etc.
\r
2735 * @param source normalized string
\r
2736 * @param doFrench flag indicator if special handling of French has to be
\r
2738 * @param hiragana4 offset for Hiragana quaternary
\r
2739 * @param commonBottom4 smallest common quaternary byte
\r
2740 * @param bottomCount4 smallest quaternary byte
\r
2742 private final void getSortKeyBytes(String source, boolean doFrench,
\r
2743 byte hiragana4, int commonBottom4,
\r
2747 if (m_srcUtilIter_ == null) {
\r
2748 initUtility(true);
\r
2750 int backupDecomposition = getDecomposition();
\r
2751 setDecomposition(NO_DECOMPOSITION); // have to revert to backup later
\r
2752 m_srcUtilIter_.setText(source);
\r
2753 m_srcUtilColEIter_.setText(m_srcUtilIter_);
\r
2754 m_utilFrenchStart_ = -1;
\r
2755 m_utilFrenchEnd_ = -1;
\r
2757 // scriptorder not implemented yet
\r
2758 // const uint8_t *scriptOrder = coll->scriptOrder;
\r
2760 boolean doShift = false;
\r
2761 boolean notIsContinuation = false;
\r
2763 int leadPrimary = 0; // int for easier comparison
\r
2764 int caseShift = 0;
\r
2767 int ce = m_srcUtilColEIter_.next();
\r
2768 if (ce == CollationElementIterator.NULLORDER) {
\r
2772 if (ce == CollationElementIterator.IGNORABLE) {
\r
2776 notIsContinuation = !isContinuation(ce);
\r
2779 * if (notIsContinuation) {
\r
2780 if (scriptOrder != NULL) {
\r
2781 primary1 = scriptOrder[primary1];
\r
2784 boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
\r
2785 // actually we can just check that the first byte is 0
\r
2786 // generation stuffs the order left first
\r
2787 boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)
\r
2788 <= m_variableTopValue_;
\r
2789 doShift = (m_isAlternateHandlingShifted_
\r
2790 && ((notIsContinuation && isSmallerThanVariableTop
\r
2791 && !isPrimaryByteIgnorable) // primary byte not 0
\r
2792 || (!notIsContinuation && doShift))
\r
2793 || (doShift && isPrimaryByteIgnorable));
\r
2794 if (doShift && isPrimaryByteIgnorable) {
\r
2795 // amendment to the UCA says that primary ignorables and other
\r
2796 // ignorables should be removed if following a shifted code
\r
2798 // if we were shifted and we got an ignorable code point
\r
2799 // we should just completely ignore it
\r
2802 leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,
\r
2803 leadPrimary, commonBottom4,
\r
2808 if (m_utilCompare2_) {
\r
2809 doSecondaryBytes(ce, notIsContinuation, doFrench);
\r
2812 int t = ce & LAST_BYTE_MASK_;
\r
2813 if (!notIsContinuation) {
\r
2814 t = ce & CE_REMOVE_CONTINUATION_MASK_;
\r
2817 if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
\r
2818 // do the case level if we need to do it. We don't want to calculate
\r
2819 // case level for primary ignorables if we have only primary strength and case level
\r
2820 // otherwise we would break well formedness of CEs
\r
2821 caseShift = doCaseBytes(t, notIsContinuation, caseShift);
\r
2823 else if (notIsContinuation) {
\r
2824 t ^= m_caseSwitch_;
\r
2829 if (m_utilCompare3_) {
\r
2830 doTertiaryBytes(t, notIsContinuation);
\r
2833 if (m_utilCompare4_ && notIsContinuation) { // compare quad
\r
2834 doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,
\r
2835 commonBottom4, bottomCount4, hiragana4);
\r
2838 setDecomposition(backupDecomposition); // reverts to original
\r
2839 if (m_utilFrenchStart_ != -1) {
\r
2840 // one last round of checks
\r
2841 reverseBuffer(m_utilBytes2_);
\r
2846 * From the individual strength byte results the final compact sortkey
\r
2847 * will be calculated.
\r
2848 * @param source text string
\r
2849 * @param doFrench flag indicating that special handling of French has to
\r
2851 * @param commonBottom4 smallest common quaternary byte
\r
2852 * @param bottomCount4 smallest quaternary byte
\r
2853 * @param key output RawCollationKey to store results, key cannot be null
\r
2855 private final void getSortKey(String source, boolean doFrench,
\r
2856 int commonBottom4,
\r
2858 RawCollationKey key)
\r
2860 // we have done all the CE's, now let's put them together to form
\r
2862 if (m_utilCompare2_) {
\r
2863 doSecondary(doFrench);
\r
2865 // adding case level should be independent of secondary level
\r
2866 if (m_utilCompare0_) {
\r
2869 if (m_utilCompare3_) {
\r
2871 if (m_utilCompare4_) {
\r
2872 doQuaternary(commonBottom4, bottomCount4);
\r
2873 if (m_utilCompare5_) {
\r
2874 doIdentical(source);
\r
2879 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);
\r
2880 m_utilBytesCount1_ ++;
\r
2882 key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
\r
2886 * Packs the French bytes
\r
2888 private final void doFrench()
\r
2890 for (int i = 0; i < m_utilBytesCount2_; i ++) {
\r
2891 byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
\r
2892 // This is compression code.
\r
2893 if (s == COMMON_2_) {
\r
2897 if (m_utilCount2_ > 0) {
\r
2898 // getting the unsigned value
\r
2899 if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
\r
2900 // not necessary for 4th level.
\r
2901 while (m_utilCount2_ > TOP_COUNT_2_) {
\r
2902 m_utilBytes1_ = append(m_utilBytes1_,
\r
2903 m_utilBytesCount1_,
\r
2904 (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
\r
2905 m_utilBytesCount1_ ++;
\r
2906 m_utilCount2_ -= TOP_COUNT_2_;
\r
2908 m_utilBytes1_ = append(m_utilBytes1_,
\r
2909 m_utilBytesCount1_,
\r
2910 (byte)(COMMON_TOP_2_
\r
2911 - (m_utilCount2_ - 1)));
\r
2912 m_utilBytesCount1_ ++;
\r
2915 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2916 m_utilBytes1_ = append(m_utilBytes1_,
\r
2917 m_utilBytesCount1_,
\r
2918 (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
\r
2919 m_utilBytesCount1_ ++;
\r
2920 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2922 m_utilBytes1_ = append(m_utilBytes1_,
\r
2923 m_utilBytesCount1_,
\r
2924 (byte)(COMMON_BOTTOM_2_
\r
2925 + (m_utilCount2_ - 1)));
\r
2926 m_utilBytesCount1_ ++;
\r
2928 m_utilCount2_ = 0;
\r
2930 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
\r
2931 m_utilBytesCount1_ ++;
\r
2934 if (m_utilCount2_ > 0) {
\r
2935 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2936 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
2937 (byte)(COMMON_BOTTOM_2_
\r
2938 + BOTTOM_COUNT_2_));
\r
2939 m_utilBytesCount1_ ++;
\r
2940 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2942 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
2943 (byte)(COMMON_BOTTOM_2_
\r
2944 + (m_utilCount2_ - 1)));
\r
2945 m_utilBytesCount1_ ++;
\r
2950 * Compacts the secondary bytes and stores them into the primary array
\r
2951 * @param doFrench flag indicator that French has to be handled specially
\r
2953 private final void doSecondary(boolean doFrench)
\r
2955 if (m_utilCount2_ > 0) {
\r
2956 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2957 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2958 (byte)(COMMON_BOTTOM_2_
\r
2959 + BOTTOM_COUNT_2_));
\r
2960 m_utilBytesCount2_ ++;
\r
2961 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2963 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2964 (byte)(COMMON_BOTTOM_2_ +
\r
2965 (m_utilCount2_ - 1)));
\r
2966 m_utilBytesCount2_ ++;
\r
2969 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
2970 SORT_LEVEL_TERMINATOR_);
\r
2971 m_utilBytesCount1_ ++;
\r
2973 if (doFrench) { // do the reverse copy
\r
2977 if (m_utilBytes1_.length <= m_utilBytesCount1_
\r
2978 + m_utilBytesCount2_) {
\r
2979 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
2980 m_utilBytesCount2_);
\r
2982 System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
\r
2983 m_utilBytesCount1_, m_utilBytesCount2_);
\r
2984 m_utilBytesCount1_ += m_utilBytesCount2_;
\r
2989 * Increase buffer size
\r
2990 * @param buffer array of bytes
\r
2991 * @param size of the byte array
\r
2992 * @param incrementsize size to increase
\r
2993 * @return the new buffer
\r
2995 private static final byte[] increase(byte buffer[], int size,
\r
2996 int incrementsize)
\r
2998 byte result[] = new byte[buffer.length + incrementsize];
\r
2999 System.arraycopy(buffer, 0, result, 0, size);
\r
3004 * Increase buffer size
\r
3005 * @param buffer array of ints
\r
3006 * @param size of the byte array
\r
3007 * @param incrementsize size to increase
\r
3008 * @return the new buffer
\r
3010 private static final int[] increase(int buffer[], int size,
\r
3011 int incrementsize)
\r
3013 int result[] = new int[buffer.length + incrementsize];
\r
3014 System.arraycopy(buffer, 0, result, 0, size);
\r
3019 * Compacts the case bytes and stores them into the primary array
\r
3021 private final void doCase()
\r
3023 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3024 SORT_LEVEL_TERMINATOR_);
\r
3025 m_utilBytesCount1_ ++;
\r
3026 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
\r
3027 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3028 m_utilBytesCount0_);
\r
3030 System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
\r
3031 m_utilBytesCount0_);
\r
3032 m_utilBytesCount1_ += m_utilBytesCount0_;
\r
3036 * Compacts the tertiary bytes and stores them into the primary array
\r
3038 private final void doTertiary()
\r
3040 if (m_utilCount3_ > 0) {
\r
3041 if (m_common3_ != COMMON_BOTTOM_3_) {
\r
3042 while (m_utilCount3_ >= m_topCount3_) {
\r
3043 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3044 (byte)(m_top3_ - m_topCount3_));
\r
3045 m_utilBytesCount3_ ++;
\r
3046 m_utilCount3_ -= m_topCount3_;
\r
3048 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3049 (byte)(m_top3_ - m_utilCount3_));
\r
3050 m_utilBytesCount3_ ++;
\r
3053 while (m_utilCount3_ > m_bottomCount3_) {
\r
3054 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3056 + m_bottomCount3_));
\r
3057 m_utilBytesCount3_ ++;
\r
3058 m_utilCount3_ -= m_bottomCount3_;
\r
3060 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3062 + (m_utilCount3_ - 1)));
\r
3063 m_utilBytesCount3_ ++;
\r
3066 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3067 SORT_LEVEL_TERMINATOR_);
\r
3068 m_utilBytesCount1_ ++;
\r
3069 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
\r
3070 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3071 m_utilBytesCount3_);
\r
3073 System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
\r
3074 m_utilBytesCount3_);
\r
3075 m_utilBytesCount1_ += m_utilBytesCount3_;
\r
3079 * Compacts the quaternary bytes and stores them into the primary array
\r
3081 private final void doQuaternary(int commonbottom4, int bottomcount4)
\r
3083 if (m_utilCount4_ > 0) {
\r
3084 while (m_utilCount4_ > bottomcount4) {
\r
3085 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
3086 (byte)(commonbottom4 + bottomcount4));
\r
3087 m_utilBytesCount4_ ++;
\r
3088 m_utilCount4_ -= bottomcount4;
\r
3090 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
3091 (byte)(commonbottom4
\r
3092 + (m_utilCount4_ - 1)));
\r
3093 m_utilBytesCount4_ ++;
\r
3095 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3096 SORT_LEVEL_TERMINATOR_);
\r
3097 m_utilBytesCount1_ ++;
\r
3098 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
\r
3099 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3100 m_utilBytesCount4_);
\r
3102 System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
\r
3103 m_utilBytesCount4_);
\r
3104 m_utilBytesCount1_ += m_utilBytesCount4_;
\r
3108 * Deals with the identical sort.
\r
3109 * Appends the BOCSU version of the source string to the ends of the
\r
3111 * @param source text string
\r
3113 private final void doIdentical(String source)
\r
3115 int isize = BOCU.getCompressionLength(source);
\r
3116 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3117 SORT_LEVEL_TERMINATOR_);
\r
3118 m_utilBytesCount1_ ++;
\r
3119 if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
\r
3120 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3123 m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
\r
3124 m_utilBytesCount1_);
\r
// NOTE(review): extraction-damaged region -- braces, the local declarations
// (result/schar/tchar) and several branch bodies are missing from this text.
// Code left byte-identical; comments annotate the visible logic only.
3128 * Gets the offset of the first unmatched characters in source and target.
3129 * This method returns the offset of the start of a contraction or a
3130 * combining sequence, if the first difference is in the middle of such a
3132 * @param source string
3133 * @param target string
3134 * @return offset of the first unmatched characters in source and target.
3136 private final int getFirstUnmatchedOffset(String source, String target)
3139 int slength = source.length();
3140 int tlength = target.length();
3141 int minlength = slength;
3142 if (minlength > tlength) {
3143 minlength = tlength;
// advance past the identical prefix of the two strings
3145 while (result < minlength
3146 && source.charAt(result) == target.charAt(result)) {
3150 // There is an identical portion at the beginning of the two
3151 // strings. If the identical portion ends within a contraction or a
3152 // combining character sequence, back up to the start of that
3156 if (result < minlength) {
3157 schar = source.charAt(result); // first differing chars
3158 tchar = target.charAt(result);
// one string is a prefix of the other: inspect the last matched char
3161 schar = source.charAt(minlength - 1);
3162 if (isUnsafe(schar)) {
3165 else if (slength == tlength) {
3168 else if (slength < tlength) {
3169 tchar = target.charAt(result);
3172 schar = source.charAt(result);
3175 if (isUnsafe(schar) || isUnsafe(tchar))
3177 // We are stopped in the middle of a contraction or combining
3179 // Look backwards for the part of the string for the start of
3181 // It doesn't matter which string we scan, since they are the
3182 // same in this region.
// back up until the previous character is safe (or start of string)
3186 while (result > 0 && isUnsafe(source.charAt(result)));
\r
3193 * Appending an byte to an array of bytes and increases it if we run out of
\r
3195 * @param array of byte arrays
\r
3196 * @param appendindex index in the byte array to append
\r
3197 * @param value to append
\r
3198 * @return array if array size can accomodate the new value, otherwise
\r
3199 * a bigger array will be created and returned
\r
3201 private static final byte[] append(byte array[], int appendindex,
\r
3205 array[appendindex] = value;
\r
3207 catch (ArrayIndexOutOfBoundsException e) {
\r
3208 array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
\r
3209 array[appendindex] = value;
\r
3215 * This is a trick string compare function that goes in and uses sortkeys
\r
3216 * to compare. It is used when compare gets in trouble and needs to bail
\r
3218 * @param source text string
\r
3219 * @param target text string
\r
3221 private final int compareBySortKeys(String source, String target)
\r
3224 m_utilRawCollationKey_ = getRawCollationKey(source,
\r
3225 m_utilRawCollationKey_);
\r
3226 // this method is very seldom called
\r
3227 RawCollationKey targetkey = getRawCollationKey(target, null);
\r
3228 return m_utilRawCollationKey_.compareTo(targetkey);
\r
// NOTE(review): extraction-damaged region -- the do-loop headers, local
// declarations and several return/else branches are missing from this text.
// Code left byte-identical; comments annotate the visible logic only.
3232 * Performs the primary comparisons, and fills up the CE buffer at the
3234 * The return value toggles between the comparison result and the hiragana
3235 * result. If either the source is greater than target or vice versa, the
3236 * return result is the comparison result, ie 1 or -1, furthermore the
3237 * cebuffers will be cleared when that happens. If the primary comparisons
3238 * are equal, we'll have to continue with secondary comparison. In this case
3239 * the cebuffer will not be cleared and the return result will be the
3240 * hiragana result.
3241 * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
3243 * @param lowestpvalue the lowest primary value that will not be ignored if
3244 * alternate handling is shifted
3245 * @param source text string
3246 * @param target text string
3247 * @param textoffset offset in text to start the comparison
3248 * @return comparion result if a primary difference is found, otherwise
3251 private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
3252 String source, String target,
3256 // Preparing the context objects for iterating over strings
3257 m_srcUtilIter_.setText(source);
3258 m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
3259 m_tgtUtilIter_.setText(target);
3260 m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);
3262 // Non shifted primary processing is quite simple
3263 if (!m_isAlternateHandlingShifted_) {
3264 int hiraganaresult = 0;
3267 // We fetch CEs until we hit a non ignorable primary or end.
// every fetched CE is also recorded in the src CE buffer for later levels
3269 sorder = m_srcUtilColEIter_.next();
3270 m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
3271 m_srcUtilCEBufferSize_, sorder);
3272 m_srcUtilCEBufferSize_ ++;
3273 sorder &= CE_PRIMARY_MASK_;
3274 } while (sorder == CollationElementIterator.IGNORABLE);
// same for the target side
3278 torder = m_tgtUtilColEIter_.next();
3279 m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
3280 m_tgtUtilCEBufferSize_, torder);
3281 m_tgtUtilCEBufferSize_ ++;
3282 torder &= CE_PRIMARY_MASK_;
3283 } while (torder == CollationElementIterator.IGNORABLE);
3285 // if both primaries are the same
3286 if (sorder == torder) {
3287 // and there are no more CEs, we advance to the next level
3288 // see if we are at the end of either string
3289 if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
3290 == CollationElementIterator.NULLORDER) {
3291 if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
3292 != CollationElementIterator.NULLORDER) {
3297 else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
3298 == CollationElementIterator.NULLORDER) {
// record the first source/target hiragana mismatch as a quaternary tiebreaker
3301 if (doHiragana4 && hiraganaresult == 0
3302 && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
3303 m_tgtUtilColEIter_.m_isCodePointHiragana_) {
3304 if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
3305 hiraganaresult = -1;
3308 hiraganaresult = 1;
3313 // if two primaries are different, we are done
3314 return endPrimaryCompare(sorder, torder);
3317 // no primary difference... do the rest from the buffers
3318 return hiraganaresult;
3320 else { // shifted - do a slightly more complicated processing :)
3322 int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
3323 lowestpvalue, true);
3324 int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
3325 lowestpvalue, false);
3326 if (sorder == torder) {
3327 if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
3328 == CollationElementIterator.NULLORDER) {
3336 return endPrimaryCompare(sorder, torder);
3338 } // no primary difference... do the rest from the buffers
\r
3344 * This is used only for primary strength when we know that sorder is
\r
3345 * already different from torder.
\r
3346 * Compares sorder and torder, returns -1 if sorder is less than torder.
\r
3347 * Clears the cebuffer at the same time.
\r
3348 * @param sorder source strength order
\r
3349 * @param torder target strength order
\r
3350 * @return the comparison result of sorder and torder
\r
3352 private final int endPrimaryCompare(int sorder, int torder)
\r
3354 // if we reach here, the ce offset accessed is the last ce
\r
3355 // appended to the buffer
\r
3356 boolean isSourceNullOrder = (m_srcUtilCEBuffer_[
\r
3357 m_srcUtilCEBufferSize_ - 1]
\r
3358 == CollationElementIterator.NULLORDER);
\r
3359 boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[
\r
3360 m_tgtUtilCEBufferSize_ - 1]
\r
3361 == CollationElementIterator.NULLORDER);
\r
3362 m_srcUtilCEBufferSize_ = -1;
\r
3363 m_tgtUtilCEBufferSize_ = -1;
\r
3364 if (isSourceNullOrder) {
\r
3367 if (isTargetNullOrder) {
\r
3370 // getting rid of the sign
\r
3371 sorder >>>= CE_PRIMARY_SHIFT_;
\r
3372 torder >>>= CE_PRIMARY_SHIFT_;
\r
3373 if (sorder < torder) {
\r
// NOTE(review): extraction-damaged region -- the fetch loop's structure
// (loop header, "shifted" state transitions, break/continue statements and
// several else branches) is missing from this text. Code left byte-identical;
// comments annotate only what the visible lines establish.
3380 * Calculates the next primary shifted value and fills up cebuffer with the
3381 * next non-ignorable ce.
3382 * @param coleiter collation element iterator
3383 * @param doHiragana4 flag indicator if hiragana quaternary is to be
3385 * @param lowestpvalue lowest primary shifted value that will not be
3387 * @return result next modified ce
3389 private final int getPrimaryShiftedCompareCE(
3390 CollationElementIterator coleiter,
3391 int lowestpvalue, boolean isSrc)
3394 boolean shifted = false;
3395 int result = CollationElementIterator.IGNORABLE;
// work on the side selected by isSrc; written back at the end
3396 int cebuffer[] = m_srcUtilCEBuffer_;
3397 int cebuffersize = m_srcUtilCEBufferSize_;
3399 cebuffer = m_tgtUtilCEBuffer_;
3400 cebuffersize = m_tgtUtilCEBufferSize_;
3403 result = coleiter.next();
3404 if (result == CollationElementIterator.NULLORDER) {
3405 cebuffer = append(cebuffer, cebuffersize, result);
3409 else if (result == CollationElementIterator.IGNORABLE
3411 && (result & CE_PRIMARY_MASK_)
3412 == CollationElementIterator.IGNORABLE)) {
3413 // UCA amendment - ignore ignorables that follow shifted code
3417 else if (isContinuation(result)) {
3418 if ((result & CE_PRIMARY_MASK_)
3419 != CollationElementIterator.IGNORABLE) {
3420 // There is primary value
// keep only the primary bytes but remember it was a continuation
3422 result = (result & CE_PRIMARY_MASK_)
3423 | CE_CONTINUATION_MARKER_;
3424 // preserve interesting continuation
3425 cebuffer = append(cebuffer, cebuffersize, result);
3430 cebuffer = append(cebuffer, cebuffersize, result);
3435 else { // Just lower level values
3437 cebuffer = append(cebuffer, cebuffersize, result);
// regular CE: unsigned-compare its primary against the variable top
3443 if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
3444 lowestpvalue) > 0) {
3445 cebuffer = append(cebuffer, cebuffersize, result);
3450 if ((result & CE_PRIMARY_MASK_) != 0) {
3452 result &= CE_PRIMARY_MASK_;
3453 cebuffer = append(cebuffer, cebuffersize, result);
3458 cebuffer = append(cebuffer, cebuffersize, result);
// store the (possibly grown) buffer and its size back on the right side
3467 m_srcUtilCEBuffer_ = cebuffer;
3468 m_srcUtilCEBufferSize_ = cebuffersize;
3471 m_tgtUtilCEBuffer_ = cebuffer;
3472 m_tgtUtilCEBufferSize_ = cebuffersize;
3474 result &= CE_PRIMARY_MASK_;
\r
3479 * Appending an int to an array of ints and increases it if we run out of
\r
3481 * @param array of int arrays
\r
3482 * @param appendindex index at which value will be appended
\r
3483 * @param value to append
\r
3484 * @return array if size is not increased, otherwise a new array will be
\r
3487 private static final int[] append(int array[], int appendindex, int value)
\r
3489 if (appendindex + 1 >= array.length) {
\r
3490 array = increase(array, appendindex, CE_BUFFER_SIZE_);
\r
3492 array[appendindex] = value;
\r
// NOTE(review): extraction-damaged region -- the outer loop scaffolding,
// offset declarations and the return statements inside the equality branch
// are missing from this text. Code left byte-identical; comments annotate
// only what the visible lines establish.
3497 * Does secondary strength comparison based on the collected ces.
3498 * @param doFrench flag indicates if French ordering is to be done
3499 * @return the secondary strength comparison result
3501 private final int doSecondaryCompare(boolean doFrench)
3503 // now, we're gonna reexamine collected CEs
3504 if (!doFrench) { // normal
// forward scan: take the next non-ignorable secondary weight on each side
3508 int sorder = CollationElementIterator.IGNORABLE;
3509 while (sorder == CollationElementIterator.IGNORABLE) {
3510 sorder = m_srcUtilCEBuffer_[soffset ++]
3511 & CE_SECONDARY_MASK_;
3513 int torder = CollationElementIterator.IGNORABLE;
3514 while (torder == CollationElementIterator.IGNORABLE) {
3515 torder = m_tgtUtilCEBuffer_[toffset ++]
3516 & CE_SECONDARY_MASK_;
3519 if (sorder == torder) {
// equal secondaries: check whether either CE stream has ended
3520 if (m_srcUtilCEBuffer_[soffset - 1]
3521 == CollationElementIterator.NULLORDER) {
3522 if (m_tgtUtilCEBuffer_[toffset - 1]
3523 != CollationElementIterator.NULLORDER) {
3528 else if (m_tgtUtilCEBuffer_[toffset - 1]
3529 == CollationElementIterator.NULLORDER) {
// unequal secondaries: an ended side sorts first, else compare weights
3534 if (m_srcUtilCEBuffer_[soffset - 1] ==
3535 CollationElementIterator.NULLORDER) {
3538 if (m_tgtUtilCEBuffer_[toffset - 1] ==
3539 CollationElementIterator.NULLORDER) {
3542 return (sorder < torder) ? -1 : 1;
3546 else { // do the French
// French secondaries are scanned backwards from the end of the CE buffers
3547 m_srcUtilContOffset_ = 0;
3548 m_tgtUtilContOffset_ = 0;
3549 m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
3550 m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
3552 int sorder = getSecondaryFrenchCE(true);
3553 int torder = getSecondaryFrenchCE(false);
3554 if (sorder == torder) {
3555 if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
3556 || (m_srcUtilOffset_ >= 0
3557 && m_srcUtilCEBuffer_[m_srcUtilOffset_]
3558 == CollationElementIterator.NULLORDER)) {
3563 return (sorder < torder) ? -1 : 1;
\r
// NOTE(review): extraction-damaged region -- the isSrc branch header, the
// loop's full condition and several brace/advance lines of the continuation
// walk are missing from this text. Code left byte-identical; comments
// annotate only what the visible lines establish.
3571 * Calculates the next secondary french CE.
3572 * @param isSrc flag indicator if we are calculating the src ces
3573 * @return result next modified ce
3575 private final int getSecondaryFrenchCE(boolean isSrc)
3577 int result = CollationElementIterator.IGNORABLE;
// work on the side selected by isSrc; offsets are written back at the end
3578 int offset = m_srcUtilOffset_;
3579 int continuationoffset = m_srcUtilContOffset_;
3580 int cebuffer[] = m_srcUtilCEBuffer_;
3582 offset = m_tgtUtilOffset_;
3583 continuationoffset = m_tgtUtilContOffset_;
3584 cebuffer = m_tgtUtilCEBuffer_;
3587 while (result == CollationElementIterator.IGNORABLE
3589 if (continuationoffset == 0) {
// not inside a continuation: walk backwards over any continuation run
3590 result = cebuffer[offset];
3591 while (isContinuation(cebuffer[offset --])){
3593 // after this, sorder is at the start of continuation,
3594 // and offset points before that
3595 if (isContinuation(cebuffer[offset + 1])) {
3596 // save offset for later
3597 continuationoffset = offset;
// inside a continuation run: consume it forwards
3602 result = cebuffer[offset ++];
3603 if (!isContinuation(result)) {
3604 // we have finished with this continuation
3605 offset = continuationoffset;
3606 // reset the pointer to before continuation
3607 continuationoffset = 0;
3611 result &= CE_SECONDARY_MASK_; // remove continuation bit
// persist the scan state on the correct side
3614 m_srcUtilOffset_ = offset;
3615 m_srcUtilContOffset_ = continuationoffset;
3618 m_tgtUtilOffset_ = offset;
3619 m_tgtUtilContOffset_ = continuationoffset;
\r
// NOTE(review): extraction-damaged region -- the outer loop scaffolding,
// offset declarations, else branches and the return statements are missing
// from this text. Code left byte-identical; comments annotate only what the
// visible lines establish.
3625 * Does case strength comparison based on the collected ces.
3626 * @return the case strength comparison result
3628 private final int doCaseCompare()
3633 int sorder = CollationElementIterator.IGNORABLE;
3634 int torder = CollationElementIterator.IGNORABLE;
// find the next CE on the source side that carries a case weight
3635 while ((sorder & CE_REMOVE_CASE_)
3636 == CollationElementIterator.IGNORABLE) {
3637 sorder = m_srcUtilCEBuffer_[soffset ++];
3638 if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3639 // primary ignorables should not be considered on the case level when the strength is primary
3640 // otherwise, the CEs stop being well-formed
3641 sorder &= CE_CASE_MASK_3_;
3642 sorder ^= m_caseSwitch_;
3645 sorder = CollationElementIterator.IGNORABLE;
// same scan on the target side
3649 while ((torder & CE_REMOVE_CASE_)
3650 == CollationElementIterator.IGNORABLE) {
3651 torder = m_tgtUtilCEBuffer_[toffset ++];
3652 if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
3653 // primary ignorables should not be considered on the case level when the strength is primary
3654 // otherwise, the CEs stop being well-formed
3655 torder &= CE_CASE_MASK_3_;
3656 torder ^= m_caseSwitch_;
3659 torder = CollationElementIterator.IGNORABLE;
// reduce both to the bare case bits before comparing
3663 sorder &= CE_CASE_BIT_MASK_;
3664 torder &= CE_CASE_BIT_MASK_;
3665 if (sorder == torder) {
3666 // checking end of strings
3667 if (m_srcUtilCEBuffer_[soffset - 1]
3668 == CollationElementIterator.NULLORDER) {
3669 if (m_tgtUtilCEBuffer_[toffset - 1]
3670 != CollationElementIterator.NULLORDER) {
3675 else if (m_tgtUtilCEBuffer_[toffset - 1]
3676 == CollationElementIterator.NULLORDER) {
3681 if (m_srcUtilCEBuffer_[soffset - 1]
3682 == CollationElementIterator.NULLORDER) {
// NOTE(review): indexing m_tgtUtilCEBuffer_ with soffset below looks like a
// bug -- every parallel end-of-stream check here and in doTertiaryCompare
// uses toffset for the target buffer. Confirm against the reference and fix.
3685 if (m_tgtUtilCEBuffer_[soffset - 1]
3686 == CollationElementIterator.NULLORDER) {
3689 return (sorder < torder) ? -1 : 1;
\r
// NOTE(review): extraction-damaged region -- the outer loop scaffolding,
// offset declarations, else branches and return statements are missing from
// this text. Code left byte-identical; comments annotate only what the
// visible lines establish.
3696 * Does tertiary strength comparison based on the collected ces.
3697 * @return the tertiary strength comparison result
3699 private final int doTertiaryCompare()
3704 int sorder = CollationElementIterator.IGNORABLE;
3705 int torder = CollationElementIterator.IGNORABLE;
// find the next CE on the source side with a non-ignorable tertiary weight
3706 while ((sorder & CE_REMOVE_CASE_)
3707 == CollationElementIterator.IGNORABLE) {
3708 sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
3709 if (!isContinuation(sorder)) {
3710 sorder ^= m_caseSwitch_;
3713 sorder &= CE_REMOVE_CASE_;
// same scan on the target side
3717 while ((torder & CE_REMOVE_CASE_)
3718 == CollationElementIterator.IGNORABLE) {
3719 torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
3720 if (!isContinuation(torder)) {
3721 torder ^= m_caseSwitch_;
3724 torder &= CE_REMOVE_CASE_;
3728 if (sorder == torder) {
// equal tertiaries: check whether either CE stream has ended
3729 if (m_srcUtilCEBuffer_[soffset - 1]
3730 == CollationElementIterator.NULLORDER) {
3731 if (m_tgtUtilCEBuffer_[toffset - 1]
3732 != CollationElementIterator.NULLORDER) {
3737 else if (m_tgtUtilCEBuffer_[toffset - 1]
3738 == CollationElementIterator.NULLORDER) {
// unequal tertiaries: an ended side sorts first, else compare weights
3743 if (m_srcUtilCEBuffer_[soffset - 1] ==
3744 CollationElementIterator.NULLORDER) {
3747 if (m_tgtUtilCEBuffer_[toffset - 1] ==
3748 CollationElementIterator.NULLORDER) {
3751 return (sorder < torder) ? -1 : 1;
\r
// NOTE(review): extraction-damaged region -- the outer loop scaffolding,
// offset declarations, the continuation/shift bookkeeping branches and the
// return statements are missing from this text. Code left byte-identical;
// comments annotate only what the visible lines establish.
3758 * Does quaternary strength comparison based on the collected ces.
3759 * @param lowestpvalue the lowest primary value that will not be ignored if
3760 * alternate handling is shifted
3761 * @return the quaternary strength comparison result
3763 private final int doQuaternaryCompare(int lowestpvalue)
3765 boolean sShifted = true;
3766 boolean tShifted = true;
3770 int sorder = CollationElementIterator.IGNORABLE;
3771 int torder = CollationElementIterator.IGNORABLE;
// source side: skip ignorables and continuations of non-shifted CEs
3772 while (sorder == CollationElementIterator.IGNORABLE
3773 || (isContinuation(sorder) && !sShifted)) {
3774 sorder = m_srcUtilCEBuffer_[soffset ++];
3775 if (isContinuation(sorder)) {
// a CE above the variable top (or primary-ignorable) gets the max weight
3780 else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
3781 || (sorder & CE_PRIMARY_MASK_)
3782 == CollationElementIterator.IGNORABLE) {
3783 // non continuation
3784 sorder = CE_PRIMARY_MASK_;
3791 sorder >>>= CE_PRIMARY_SHIFT_;
// same scan on the target side
3792 while (torder == CollationElementIterator.IGNORABLE
3793 || (isContinuation(torder) && !tShifted)) {
3794 torder = m_tgtUtilCEBuffer_[toffset ++];
3795 if (isContinuation(torder)) {
3800 else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
3801 || (torder & CE_PRIMARY_MASK_)
3802 == CollationElementIterator.IGNORABLE) {
3803 // non continuation
3804 torder = CE_PRIMARY_MASK_;
3811 torder >>>= CE_PRIMARY_SHIFT_;
3813 if (sorder == torder) {
// equal quaternaries: check whether either CE stream has ended
3814 if (m_srcUtilCEBuffer_[soffset - 1]
3815 == CollationElementIterator.NULLORDER) {
3816 if (m_tgtUtilCEBuffer_[toffset - 1]
3817 != CollationElementIterator.NULLORDER) {
3822 else if (m_tgtUtilCEBuffer_[toffset - 1]
3823 == CollationElementIterator.NULLORDER) {
// unequal quaternaries: an ended side sorts first, else compare weights
3828 if (m_srcUtilCEBuffer_[soffset - 1] ==
3829 CollationElementIterator.NULLORDER) {
3832 if (m_tgtUtilCEBuffer_[toffset - 1] ==
3833 CollationElementIterator.NULLORDER) {
3836 return (sorder < torder) ? -1 : 1;
\r
3843 * Internal function. Does byte level string compare. Used by strcoll if
\r
3844 * strength == identical and strings are otherwise equal. This is a rare
\r
3845 * case. Comparison must be done on NFD normalized strings. FCD is not good
\r
3847 * @param source text
\r
3848 * @param target text
\r
3849 * @param offset of the first difference in the text strings
\r
3850 * @param normalize flag indicating if we are to normalize the text before
\r
3852 * @return 1 if source is greater than target, -1 less than and 0 if equals
\r
3854 private static final int doIdenticalCompare(String source, String target,
\r
3855 int offset, boolean normalize)
\r
3859 if (Normalizer.quickCheck(source, Normalizer.NFD,0)
\r
3860 != Normalizer.YES) {
\r
3861 source = Normalizer.decompose(source, false);
\r
3864 if (Normalizer.quickCheck(target, Normalizer.NFD,0)
\r
3865 != Normalizer.YES) {
\r
3866 target = Normalizer.decompose(target, false);
\r
3871 return doStringCompare(source, target, offset);
\r
3875 * Compares string for their codepoint order.
\r
3876 * This comparison handles surrogate characters and place them after the
\r
3877 * all non surrogate characters.
\r
3878 * @param source text
\r
3879 * @param target text
\r
3880 * @param offset start offset for comparison
\r
3881 * @return 1 if source is greater than target, -1 less than and 0 if equals
\r
3883 private static final int doStringCompare(String source,
\r
3887 // compare identical prefixes - they do not need to be fixed up
\r
3890 int slength = source.length();
\r
3891 int tlength = target.length();
\r
3892 int minlength = Math.min(slength, tlength);
\r
3893 while (offset < minlength) {
\r
3894 schar = source.charAt(offset);
\r
3895 tchar = target.charAt(offset ++);
\r
3896 if (schar != tchar) {
\r
3901 if (schar == tchar && offset == minlength) {
\r
3902 if (slength > minlength) {
\r
3905 if (tlength > minlength) {
\r
3911 // if both values are in or above the surrogate range, Fix them up.
\r
3912 if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
\r
3913 && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
3914 schar = fixupUTF16(schar);
\r
3915 tchar = fixupUTF16(tchar);
\r
3918 // now c1 and c2 are in UTF-32-compatible order
\r
3919 return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
\r
3923 * Rotate surrogates to the top to get code point order
\r
3925 private static final char fixupUTF16(char ch)
\r
3927 if (ch >= 0xe000) {
\r
3937 * Resets the internal case data members and compression values.
\r
3939 private void updateInternalState()
\r
3941 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
\r
3942 m_caseSwitch_ = CASE_SWITCH_;
\r
3945 m_caseSwitch_ = NO_CASE_SWITCH_;
\r
3948 if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
\r
3949 m_mask3_ = CE_REMOVE_CASE_;
\r
3950 m_common3_ = COMMON_NORMAL_3_;
\r
3951 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
\r
3952 m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
\r
3953 m_bottom3_ = COMMON_BOTTOM_3_;
\r
3956 m_mask3_ = CE_KEEP_CASE_;
\r
3957 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
\r
3958 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
\r
3959 m_common3_ = COMMON_UPPER_FIRST_3_;
\r
3960 m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
\r
3961 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
\r
3963 m_common3_ = COMMON_NORMAL_3_;
\r
3964 m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
\r
3965 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
\r
3969 // Set the compression values
\r
3970 int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
\r
3971 // we multilply double with int, but need only int
\r
3972 m_topCount3_ = (int)(PROPORTION_3_ * total3);
\r
3973 m_bottomCount3_ = total3 - m_topCount3_;
\r
3975 if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
\r
3976 && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
\r
3977 m_isSimple3_ = true;
\r
3980 m_isSimple3_ = false;
\r
3982 if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
\r
3983 && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
\r
3984 if(latinOneCEs_ == null || latinOneRegenTable_) {
\r
3985 if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
\r
3986 latinOneUse_ = true;
\r
3988 latinOneUse_ = false;
\r
3989 latinOneFailed_ = true;
\r
3991 latinOneRegenTable_ = false;
\r
3992 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
\r
3993 latinOneUse_ = true;
\r
3996 latinOneUse_ = false;
\r
4002 * Initializes the RuleBasedCollator
\r
4004 private final void init()
\r
4006 for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;
\r
4007 m_minUnsafe_ ++) {
\r
4008 // Find the smallest unsafe char.
\r
4009 if (isUnsafe(m_minUnsafe_)) {
\r
4014 for (m_minContractionEnd_ = 0;
\r
4015 m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;
\r
4016 m_minContractionEnd_ ++) {
\r
4017 // Find the smallest contraction-ending char.
\r
4018 if (isContractionEnd(m_minContractionEnd_)) {
\r
4022 latinOneFailed_ = true;
\r
4023 setStrength(m_defaultStrength_);
\r
4024 setDecomposition(m_defaultDecomposition_);
\r
4025 m_variableTopValue_ = m_defaultVariableTopValue_;
\r
4026 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
\r
4027 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
\r
4028 m_isCaseLevel_ = m_defaultIsCaseLevel_;
\r
4029 m_caseFirst_ = m_defaultCaseFirst_;
\r
4030 m_isHiragana4_ = m_defaultIsHiragana4_;
\r
4031 m_isNumericCollation_ = m_defaultIsNumericCollation_;
\r
4032 latinOneFailed_ = false;
\r
4033 updateInternalState();
\r
4037 * Initializes utility iterators and byte buffer used by compare
\r
4039 private final void initUtility(boolean allocate) {
\r
4041 if (m_srcUtilIter_ == null) {
\r
4042 m_srcUtilIter_ = new StringUCharacterIterator();
\r
4043 m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this);
\r
4044 m_tgtUtilIter_ = new StringUCharacterIterator();
\r
4045 m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this);
\r
4046 m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
\r
4047 m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
\r
4048 m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
\r
4049 m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
\r
4050 m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
\r
4051 m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
\r
4052 m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
\r
4055 m_srcUtilIter_ = null;
\r
4056 m_srcUtilColEIter_ = null;
\r
4057 m_tgtUtilIter_ = null;
\r
4058 m_tgtUtilColEIter_ = null;
\r
4059 m_utilBytes0_ = null;
\r
4060 m_utilBytes1_ = null;
\r
4061 m_utilBytes2_ = null;
\r
4062 m_utilBytes3_ = null;
\r
4063 m_utilBytes4_ = null;
\r
4064 m_srcUtilCEBuffer_ = null;
\r
4065 m_tgtUtilCEBuffer_ = null;
\r
4069 // Consts for Latin-1 special processing
// last code point handled by the latin-1 fast path
4070 private static final int ENDOFLATINONERANGE_ = 0xFF;
// table length; the extra 50 slots hold contraction results -- see the
// contractionOffset bookkeeping in setUpLatinOne()
4071 private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50);
// sentinel CE written by addLatinOneEntry() when a character's weights do
// not fit the packed tables (presumably forces fallback to the full
// compare -- confirm against the latin-1 compare loop)
4072 private static final int BAIL_OUT_CE_ = 0xFF000000;
\r
4075 * Generate latin-1 tables
\r
4078 private class shiftValues {
\r
4079 int primShift = 24;
\r
4080 int secShift = 24;
\r
4081 int terShift = 24;
\r
4084 private final void
\r
4085 addLatinOneEntry(char ch, int CE, shiftValues sh) {
\r
4086 int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
\r
4087 boolean reverseSecondary = false;
\r
4088 if(!isContinuation(CE)) {
\r
4089 tertiary = ((CE & m_mask3_));
\r
4090 tertiary ^= m_caseSwitch_;
\r
4091 reverseSecondary = true;
\r
4093 tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));
\r
4094 tertiary &= CE_REMOVE_CASE_;
\r
4095 reverseSecondary = false;
\r
4098 secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
\r
4099 primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
\r
4100 primary1 = (CE >>> 8);
\r
4102 if(primary1 != 0) {
\r
4103 latinOneCEs_[ch] |= (primary1 << sh.primShift);
\r
4104 sh.primShift -= 8;
\r
4106 if(primary2 != 0) {
\r
4107 if(sh.primShift < 0) {
\r
4108 latinOneCEs_[ch] = BAIL_OUT_CE_;
\r
4109 latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
\r
4110 latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
\r
4113 latinOneCEs_[ch] |= (primary2 << sh.primShift);
\r
4114 sh.primShift -= 8;
\r
4116 if(secondary != 0) {
\r
4117 if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary
\r
4118 latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary
\r
4119 latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);
\r
4120 } else { // normal case
\r
4121 latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);
\r
4125 if(tertiary != 0) {
\r
4126 latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);
\r
4131 private final void
\r
4132 resizeLatinOneTable(int newSize) {
\r
4133 int newTable[] = new int[3*newSize];
\r
4134 int sizeToCopy = ((newSize<latinOneTableLen_)?newSize:latinOneTableLen_);
\r
4135 //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
\r
4136 System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
\r
4137 System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
\r
4138 System.arraycopy(latinOneCEs_, 2*latinOneTableLen_, newTable, 2*newSize, sizeToCopy);
\r
4139 latinOneTableLen_ = newSize;
\r
4140 latinOneCEs_ = newTable;
\r
// Builds the latin-1 fast-path tables consumed by compareUseLatin1(): for
// every char in [0, ENDOFLATINONERANGE_] the CE bytes are packed into the
// three strength planes of latinOneCEs_ via addLatinOneEntry(). Expansions
// are flattened with a CollationElementIterator; contraction CEs are stuffed
// into the spare region of the table above ENDOFLATINONERANGE_ (tracked by
// contractionOffset) and the char's slots point into that region. Entries the
// fast path cannot represent get BAIL_OUT_CE_; unsupported tags set
// latinOneFailed_.
// NOTE(review): elided extract — `break`s, `return`s, `else` branches and
// closing braces fall on lines not shown here.
4143 private final boolean setUpLatinOne() {

4144 if(latinOneCEs_ == null || m_reallocLatinOneCEs_) {

4145 latinOneCEs_ = new int[3*LATINONETABLELEN_];

4146 latinOneTableLen_ = LATINONETABLELEN_;

4147 m_reallocLatinOneCEs_ = false;

// A reused table must be cleared before repopulation.
4149 Arrays.fill(latinOneCEs_, 0);

4151 if(m_ContInfo_ == null) {

4152 m_ContInfo_ = new ContractionInfo();

4155 //StringBuffer sCh = new StringBuffer();

4156 //CollationElementIterator it = getCollationElementIterator(sCh.toString());

// Iterator reused below to flatten expansion/digit CEs one char at a time.
4157 CollationElementIterator it = getCollationElementIterator("");

// Per-strength bit-shift cursors used when packing CE bytes into table ints.
4159 shiftValues s = new shiftValues();

// Contraction data is stuffed right after the latin-1 range of the table.
4161 char contractionOffset = ENDOFLATINONERANGE_+1;

4163 for(ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {

// Each char starts with all three planes empty (top byte filled first).
4164 s.primShift = 24; s.secShift = 24; s.terShift = 24;

4166 CE = m_trie_.getLatin1LinearValue(ch);

4168 CE = m_trie_.getLeadValue(ch);

// Fall back to the UCA trie when the tailoring has no mapping for ch.
4169 if(CE == CollationElementIterator.CE_NOT_FOUND_) {

4170 CE = UCA_.m_trie_.getLeadValue(ch);

// Plain (non-special) CEs are packed directly.
4173 if(!isSpecial(CE)) {

4174 addLatinOneEntry(ch, CE, s);

4176 switch (RuleBasedCollator.getTag(CE)) {

4177 case CollationElementIterator.CE_EXPANSION_TAG_:

4178 case CollationElementIterator.CE_DIGIT_TAG_:

4179 //sCh.delete(0, sCh.length());

4181 //it.setText(sCh.toString());

4182 it.setText(UCharacter.toString(ch));

4183 while((CE = it.next()) != CollationElementIterator.NULLORDER) {

// Out of room in some plane: mark all three planes of ch as bail-out.
4184 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {

4185 latinOneCEs_[ch] = BAIL_OUT_CE_;

4186 latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;

4187 latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;

4190 addLatinOneEntry(ch, CE, s);

4193 case CollationElementIterator.CE_CONTRACTION_TAG_:

4194 // here is the trick

4195 // F2 is contraction. We do something very similar to contractions

4196 // but have two indices, one in the real contraction table and the

4197 // other to where we stuffed things. This hopes that we don't have

4198 // many contractions (this should work for latin-1 tables).

// Bits 12..23 of the CE must be free so the latin-1 offset fits below.
4200 if((CE & 0x00FFF000) != 0) {

4201 latinOneFailed_ = true;

4205 int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; //getContractionOffset(CE)]

4207 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

// All three planes of the contraction's first char point at the stuffed CEs.
4209 latinOneCEs_[ch] = CE;

4210 latinOneCEs_[latinOneTableLen_+ch] = CE;

4211 latinOneCEs_[2*latinOneTableLen_+ch] = CE;

4213 // We're going to jump into contraction table, pick the elements

4216 //CE = *(contractionCEs + (UCharOffset - contractionIndex));

4217 CE = m_contractionCE_[UCharOffset];

// Contraction results that are themselves expansions get flattened inline.
4220 == CollationElementIterator.CE_EXPANSION_TAG_) {

4221 int i; /* general counter */

4222 //uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to expansion table */

4223 int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE);

4224 int size = CE & 0xF; // getExpansionCount(CE);

4225 //CE = *CEOffset++;

4226 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */

4227 for(i = 0; i<size; i++) {

4228 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {

4229 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;

4230 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;

4231 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;

4234 addLatinOneEntry(contractionOffset, m_expansion_[offset+i], s);

4236 } else { /* else, we do */

// Zero-terminated expansion form (count nibble was 0).
4237 while(m_expansion_[offset] != 0) {

4238 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {

4239 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;

4240 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;

4241 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;

4244 addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);

4247 contractionOffset++;

4248 } else if(!isSpecial(CE)) {

4249 addLatinOneEntry(contractionOffset++, CE, s);

// Any other special inside a contraction: mark the slot as bail-out.
4251 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;

4252 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;

4253 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;

4254 contractionOffset++;

4257 s.primShift = 24; s.secShift = 24; s.terShift = 24;

4258 if(contractionOffset == latinOneTableLen_) { // we need to reallocate

4259 resizeLatinOneTable(2*latinOneTableLen_);

// 0xFFFF terminates the contraction's codepoint list.
4261 } while(m_contractionIndex_[UCharOffset] != 0xFFFF);

4264 case CollationElementIterator.CE_SPEC_PROC_TAG_:

4266 // 0xB7 is a precontext character defined in UCA5.1, a special

4267 // handle is implemented in order to save LatinOne table for

4270 addLatinOneEntry(ch, CE, s);

// Any other special tag cannot be represented in the fast table.
4273 latinOneFailed_ = true;

4279 latinOneFailed_ = true;

// Trim the spare region down to what the contractions actually used.
4285 if(contractionOffset < latinOneTableLen_) {

4286 resizeLatinOneTable(contractionOffset);

// Cursor state shared between compareUseLatin1() and getLatinOneContraction():
// the caller stores its current string position in m_ContInfo_.index before
// the call and reads the (possibly advanced) position back afterwards.
// NOTE(review): elided extract — the class body (the `index` field) and its
// closing brace fall on lines not shown here.
4291 private class ContractionInfo {

// Lazily created in setUpLatinOne(). Shared mutable scratch state — this
// presumably makes the latin-1 fast path non-reentrant; verify with callers.
4295 ContractionInfo m_ContInfo_;

// Resolves a contraction CE from the latin-1 fast table: advances
// m_ContInfo_.index through s while its codepoints match the contraction's
// codepoint list, returning the CE of the requested strength plane
// (strength 0/1/2 selects a plane via strength*latinOneTableLen_).
// Returns BAIL_OUT_CE_ when a non-latin-1 continuation is met.
// NOTE(review): elided extract — the return type/modifiers on the line above
// this signature, the loop braces and several branches are not shown here.
4298 getLatinOneContraction(int strength, int CE, String s) {

4299 //int strength, int CE, String s, Integer ind) {

4300 int len = s.length();

4301 //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);

// Low 12 bits of CE: offset into the real contraction table.
4302 int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;

// Bits 12..23: offset of the stuffed CEs inside the latin-1 table
// (inserted by setUpLatinOne).
4304 int latinOneOffset = (CE & 0x00FFF000) >>> 12;

4305 char schar = 0, tchar = 0;

// NOTE(review): the next three lines are remnants of commented-out C code
// whose /* opener falls on an elided line — they are not live Java.
4310 if(s[*index] == 0) { // end of string

4311 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);

4313 schar = s[*index];

// End of string: return the CE at the contraction's base slot.
4317 if(m_ContInfo_.index == len) {

4318 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);

4320 schar = s.charAt(m_ContInfo_.index);

4324 while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */

// Matched a continuation char: consume it and return its stuffed CE.
4328 if (schar == tchar) {

4329 m_ContInfo_.index++;

4330 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);

// A non-latin-1 continuation cannot be resolved by the fast path.
4334 if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {

4335 return BAIL_OUT_CE_;

4337 // skip completely ignorables

4338 int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);

4339 if(isZeroCE == 0) { // we have to ignore completely ignorables

4340 m_ContInfo_.index++;

// No continuation matched: fall back to the contraction's base CE.
4344 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);

4351 * This is a fast strcoll, geared towards text in Latin-1.

4352 * It supports contractions of size two, French secondaries

4353 * and case switching. You can use it with strengths primary

4354 * to tertiary. It does not support shifted and case level.

4355 * It relies on the table built by setUpLatinOne(). If it

4356 * doesn't understand something, it will go to the regular

* comparison path via compareRegular().
* NOTE(review): elided extract — the /** opener and closing delimiter of
* this doc comment, the method's return type/modifiers, and many returns,
* breaks and closing braces fall on lines not shown here.
4360 compareUseLatin1(String source, String target, int startOffset)

4362 int sLen = source.length();

4363 int tLen = target.length();

4365 int strength = getStrength();

4367 int sIndex = startOffset, tIndex = startOffset;

4368 char sChar = 0, tChar = 0;

4369 int sOrder=0, tOrder=0;

4371 boolean endOfSource = false;

4373 //uint32_t *elements = coll->latinOneCEs;

4375 boolean haveContractions = false; // if we have contractions in our string

4376 // we cannot do French secondary

// Plane base of the secondary table; advanced to tertiary further down.
4378 int offset = latinOneTableLen_;

4380 // Do the primary level

4383 while(sOrder==0) { // this loop skips primary ignorables

4384 // sOrder=getNextlatinOneCE(source);

4385 if(sIndex==sLen) {

4386 endOfSource = true;

4389 sChar=source.charAt(sIndex++); //[sIndex++];

4391 if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out

4392 //fprintf(stderr, "R");

4393 return compareRegular(source, target, startOffset);

4395 sOrder = latinOneCEs_[sChar];

4396 if(isSpecial(sOrder)) { // if we got a special

4397 // specials can basically be either contractions or bail-out signs. If we get anything

4398 // else, we'll bail out anyway

4399 if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {

// Hand the cursor to getLatinOneContraction via shared m_ContInfo_.
4400 m_ContInfo_.index = sIndex;

4401 sOrder = getLatinOneContraction(0, sOrder, source);

4402 sIndex = m_ContInfo_.index;

4403 haveContractions = true; // if there are contractions, we cannot do French secondary

4404 // However, if there are contractions in the table, but we always use just one char,

4405 // we might be able to do French. This should be checked out.

4407 if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {

4408 //fprintf(stderr, "S");

4409 return compareRegular(source, target, startOffset);

4414 while(tOrder==0) { // this loop skips primary ignorables

4415 // tOrder=getNextlatinOneCE(target);

4416 if(tIndex==tLen) {

4423 tChar=target.charAt(tIndex++); //[tIndex++];

4424 if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out

4425 //fprintf(stderr, "R");

4426 return compareRegular(source, target, startOffset);

4428 tOrder = latinOneCEs_[tChar];

4429 if(isSpecial(tOrder)) {

4430 // Handling specials, see the comments for source

4431 if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {

4432 m_ContInfo_.index = tIndex;

4433 tOrder = getLatinOneContraction(0, tOrder, target);

4434 tIndex = m_ContInfo_.index;

4435 haveContractions = true;

4437 if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {

4438 //fprintf(stderr, "S");

4439 return compareRegular(source, target, startOffset);

4443 if(endOfSource) { // source is finished, but target is not, say the result.

4447 if(sOrder == tOrder) { // if we have same CEs, we continue the loop

4448 sOrder = 0; tOrder = 0;

4451 // compare current top bytes

4452 if(((sOrder^tOrder)&0xFF000000)!=0) {

4453 // top bytes differ, return difference

// Unsigned shift keeps the comparison correct for CEs with the high bit set.
4454 if(sOrder >>> 8 < tOrder >>> 8) {

4459 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);

4460 // since we must return enum value

4463 // top bytes match, continue with following bytes

4469 // after primary loop, we definitely know the sizes of strings,

4470 // so we set it and use simpler loop for secondaries and tertiaries

4471 //sLen = sIndex; tLen = tIndex;

4472 if(strength >= SECONDARY) {

4473 // adjust the table beginning

4474 //latinOneCEs_ += coll->latinOneTableLen;

4475 endOfSource = false;

4477 if(!m_isFrenchCollation_) { // non French

4478 // This loop is a simplified copy of primary loop

4479 // at this point we know that whole strings are latin-1, so we don't

4480 // check for that. We also know that we only have contractions as

4482 //sIndex = 0; tIndex = 0;

4483 sIndex = startOffset; tIndex = startOffset;

4486 while(sOrder==0) {

4487 if(sIndex==sLen) {

4488 endOfSource = true;

4491 sChar=source.charAt(sIndex++); //[sIndex++];

// `offset` selects the secondary plane here.
4492 sOrder = latinOneCEs_[offset+sChar];

4493 if(isSpecial(sOrder)) {

4494 m_ContInfo_.index = sIndex;

4495 sOrder = getLatinOneContraction(1, sOrder, source);

4496 sIndex = m_ContInfo_.index;

4500 while(tOrder==0) {

4501 if(tIndex==tLen) {

4508 tChar=target.charAt(tIndex++); //[tIndex++];

4509 tOrder = latinOneCEs_[offset+tChar];

4510 if(isSpecial(tOrder)) {

4511 m_ContInfo_.index = tIndex;

4512 tOrder = getLatinOneContraction(1, tOrder, target);

4513 tIndex = m_ContInfo_.index;

4520 if(sOrder == tOrder) {

4521 sOrder = 0; tOrder = 0;

4524 // see primary loop for comments on this

4525 if(((sOrder^tOrder)&0xFF000000)!=0) {

4526 if(sOrder >>> 8 < tOrder >>> 8) {

4536 } else { // French

4537 if(haveContractions) { // if we have contractions, we have to bail out

4538 // since we don't really know how to handle them here

4539 return compareRegular(source, target, startOffset);

4541 // For French, we go backwards

4542 sIndex = sLen; tIndex = tLen;

4545 while(sOrder==0) {

4546 if(sIndex==startOffset) {

4547 endOfSource = true;

// French secondary scans the string right-to-left.
4550 sChar=source.charAt(--sIndex); //[--sIndex];

4551 sOrder = latinOneCEs_[offset+sChar];

4552 // don't even look for contractions

4555 while(tOrder==0) {

4556 if(tIndex==startOffset) {

4563 tChar=target.charAt(--tIndex); //[--tIndex];

4564 tOrder = latinOneCEs_[offset+tChar];

4565 // don't even look for contractions

4571 if(sOrder == tOrder) {

4572 sOrder = 0; tOrder = 0;

4575 // see the primary loop for comments

4576 if(((sOrder^tOrder)&0xFF000000)!=0) {

4577 if(sOrder >>> 8 < tOrder >>> 8) {

4590 if(strength >= TERTIARY) {

4591 // tertiary loop is the same as secondary (except no French)

// Advance the plane base from the secondary to the tertiary table.
4592 offset += latinOneTableLen_;

4593 //sIndex = 0; tIndex = 0;

4594 sIndex = startOffset; tIndex = startOffset;

4595 endOfSource = false;

4597 while(sOrder==0) {

4598 if(sIndex==sLen) {

4599 endOfSource = true;

4602 sChar=source.charAt(sIndex++); //[sIndex++];

4603 sOrder = latinOneCEs_[offset+sChar];

4604 if(isSpecial(sOrder)) {

4605 m_ContInfo_.index = sIndex;

4606 sOrder = getLatinOneContraction(2, sOrder, source);

4607 sIndex = m_ContInfo_.index;

4610 while(tOrder==0) {

4611 if(tIndex==tLen) {

4613 return 0; // if both strings are at the end, they are equal

4618 tChar=target.charAt(tIndex++); //[tIndex++];

4619 tOrder = latinOneCEs_[offset+tChar];

4620 if(isSpecial(tOrder)) {

4621 m_ContInfo_.index = tIndex;

4622 tOrder = getLatinOneContraction(2, tOrder, target);

4623 tIndex = m_ContInfo_.index;

4629 if(sOrder == tOrder) {

4630 sOrder = 0; tOrder = 0;

4633 if(((sOrder^tOrder)&0xff000000)!=0) {

4634 if(sOrder >>> 8 < tOrder >>> 8) {

4648 * Get the version of this collator object.

4649 * @return the version object associated with this collator

4652 public VersionInfo getVersion() {

4653 /* RunTime version */

4654 int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();

4655 /* Builder version*/

4656 int bdVersion = m_version_.getMajor();

4658 /* Charset Version. Need to get the version from cnv files

4659 * makeconv should populate cnv files with version and

4660 * an api has to be provided in ucnv.h to obtain this version

// Placeholder until a charset-version API exists (see comment above).
4662 int csVersion = 0;

4664 /* combine the version info: runtime in bits 15..11, builder in bits
   10..6, charset in bits 5..0, truncated to 16 bits */

4665 int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF;

4667 /* Tailoring rules */

/* result bytes: combined-high, combined-low, tailoring minor, UCA major */
4668 return VersionInfo.getInstance(cmbVersion>>8,

4669 cmbVersion & 0xFF,

4670 m_version_.getMinor(),

4671 UCA_.m_UCA_version_.getMajor());

4673 // versionInfo[0] = (uint8_t)(cmbVersion>>8);

4674 // versionInfo[1] = (uint8_t)cmbVersion;

4675 // versionInfo[2] = coll->image->version[1];

4676 // versionInfo[3] = coll->UCA->image->UCAVersion[0];

4680 * Get the UCA version of this collator object.

4681 * @return the version object associated with this collator

4684 public VersionInfo getUCAVersion() {

// Delegates to the version stored on the shared UCA collator instance.
4685 return UCA_.m_UCA_version_;

// When true, setUpLatinOne() allocates a fresh latinOneCEs_ array instead of
// reusing the existing one; the flag is checked and cleared there.
4688 private transient boolean m_reallocLatinOneCEs_;
