2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.IOException;
\r
10 import java.nio.ByteBuffer;
\r
11 import java.text.CharacterIterator;
\r
12 import java.text.ParseException;
\r
13 import java.util.Arrays;
\r
14 import java.util.MissingResourceException;
\r
16 import com.ibm.icu.impl.BOCU;
\r
17 import com.ibm.icu.impl.ICUDebug;
\r
18 import com.ibm.icu.impl.ICUResourceBundle;
\r
19 import com.ibm.icu.impl.ImplicitCEGenerator;
\r
20 import com.ibm.icu.impl.IntTrie;
\r
21 import com.ibm.icu.impl.StringUCharacterIterator;
\r
22 import com.ibm.icu.impl.Trie;
\r
23 import com.ibm.icu.impl.TrieIterator;
\r
24 import com.ibm.icu.impl.Utility;
\r
25 import com.ibm.icu.lang.UCharacter;
\r
26 import com.ibm.icu.util.RangeValueIterator;
\r
27 import com.ibm.icu.util.ULocale;
\r
28 import com.ibm.icu.util.UResourceBundle;
\r
29 import com.ibm.icu.util.VersionInfo;
\r
32 * <p>RuleBasedCollator is a concrete subclass of Collator. It allows
\r
33 * customization of the Collator via user-specified rule sets.
\r
34 * RuleBasedCollator is designed to be fully compliant with the <a
\r
35 * href="http://www.unicode.org/unicode/reports/tr10/">Unicode
\r
36 * Collation Algorithm (UCA)</a> and conforms to ISO 14651.</p>
\r
38 * <p>Users are strongly encouraged to read <a
\r
39 * href="http://www.icu-project.org/userguide/Collate_Intro.html">
\r
40 * the user's guide</a> for more information about the collation
\r
41 * service before using this class.</p>
\r
43 * <p>Create a RuleBasedCollator from a locale by calling the
\r
44 * getInstance(Locale) factory method in the base class Collator.
\r
45 * Collator.getInstance(Locale) creates a RuleBasedCollator object
\r
46 * based on the collation rules defined by the argument locale. If a
\r
47 * customized collation ordering or attributes is required, use the
\r
48 * RuleBasedCollator(String) constructor with the appropriate
\r
49 * rules. The customized RuleBasedCollator will base its ordering on
\r
50 * UCA, while re-adjusting the attributes and orders of the characters
\r
51 * in the specified rule accordingly.</p>
\r
53 * <p>RuleBasedCollator provides correct collation orders for most
\r
54 * locales supported in ICU. If specific data for a locale is not
\r
55 * available, the order eventually falls back to the <a
\r
56 * href="http://www.unicode.org/unicode/reports/tr10/">UCA collation
\r
59 * <p>For information about the collation rule syntax and details
\r
60 * about customization, please refer to the
\r
61 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
\r
62 * Collation customization</a> section of the user's guide.</p>
\r
64 * <p><strong>Note</strong> that there are some differences between
\r
65 * the Collation rule syntax used in Java and ICU4J:
\r
68 * <li>According to the JDK documentation:
\r
71 * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule
\r
72 * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a
\r
73 * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the
\r
74 * range \U0EC0-\U0EC4 precedes a Lao consonant of the range
\r
75 * \U0E81-\U0EAE then the
\r
76 * vowel is placed after the consonant for collation purposes.
\r
79 * If a rule is without the modifier '!', the Thai/Lao vowel-consonant
\r
80 * swapping is not turned on.
\r
84 * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao
\r
85 * vowel-consonant swapping, since the UCA clearly states that it has to be
\r
86 * supported to ensure a correct sorting order. If a '!' is encountered, it is
\r
89 * <li>As mentioned in the documentation of the base class Collator,
\r
90 * compatibility decomposition mode is not supported.
\r
93 * <strong>Examples</strong>
\r
96 * Creating Customized RuleBasedCollators:
\r
99 * String simple = "& a < b < c < d";
\r
100 * RuleBasedCollator simpleCollator = new RuleBasedCollator(simple);
\r
102 * String norwegian = "& a , A < b , B < c , C < d , D < e , E "
\r
103 * + "< f , F < g , G < h , H < i , I < j , "
\r
104 * + "J < k , K < l , L < m , M < n , N < "
\r
105 * + "o , O < p , P < q , Q < r , R < s , S < "
\r
106 * + "t , T < u , U < v , V < w , W < x , X "
\r
107 * + "< y , Y < z , Z < \u00E5 = a\u030A "
\r
108 * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 "
\r
109 * + ", \u00C6 < \u00F8 , \u00D8";
\r
110 * RuleBasedCollator norwegianCollator = new RuleBasedCollator(norwegian);
\r
114 * Concatenating rules to combine <code>Collator</code>s:
\r
117 * // Create an en_US Collator object
\r
118 * RuleBasedCollator en_USCollator = (RuleBasedCollator)
\r
119 * Collator.getInstance(new Locale("en", "US", ""));
\r
120 * // Create a da_DK Collator object
\r
121 * RuleBasedCollator da_DKCollator = (RuleBasedCollator)
\r
122 * Collator.getInstance(new Locale("da", "DK", ""));
\r
123 * // Combine the two
\r
124 * // First, get the collation rules from en_USCollator
\r
125 * String en_USRules = en_USCollator.getRules();
\r
126 * // Second, get the collation rules from da_DKCollator
\r
127 * String da_DKRules = da_DKCollator.getRules();
\r
128 * RuleBasedCollator newCollator =
\r
129 * new RuleBasedCollator(en_USRules + da_DKRules);
\r
130 * // newCollator has the combined rules
\r
134 * Making changes to an existing RuleBasedCollator to create a new
\r
135 * <code>Collator</code> object, by appending changes to the existing rule:
\r
138 * // Create a new Collator object with additional rules
\r
139 * String addRules = "& C < ch, cH, Ch, CH";
\r
140 * RuleBasedCollator myCollator =
\r
141 * new RuleBasedCollator(en_USCollator.getRules() + addRules);
\r
142 * // myCollator contains the new rules
\r
146 * How to change the order of non-spacing accents:
\r
149 * // old rule with main accents
\r
150 * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 "
\r
151 * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 "
\r
152 * + "; \u0306 ; \u0307 ; \u0309 ; \u030A "
\r
153 * + "; \u030B ; \u030C ; \u030D ; \u030E "
\r
154 * + "; \u030F ; \u0310 ; \u0311 ; \u0312 "
\r
155 * + "< a , A ; ae, AE ; \u00e6 , \u00c6 "
\r
156 * + "< b , B < c, C < e, E & C < d , D";
\r
157 * // change the order of accent characters
\r
158 * String addOn = "& \u0300 ; \u0308 ; \u0302";
\r
159 * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
\r
163 * Putting in a new primary ordering before the default setting,
\r
164 * e.g. sort English characters before or after Japanese characters in the Japanese
\r
165 * <code>Collator</code>:
\r
168 * // get en_US Collator rules
\r
169 * RuleBasedCollator en_USCollator
\r
170 * = (RuleBasedCollator)Collator.getInstance(Locale.US);
\r
171 * // add a few Japanese characters to sort before English characters
\r
172 * // suppose the last character before the first base letter 'a' in
\r
173 * // the English collation rule is \u2212
\r
174 * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, "
\r
176 * RuleBasedCollator myJapaneseCollator
\r
177 * = new RuleBasedCollator(en_USCollator.getRules() + jaString);
\r
182 * This class is not subclassable
\r
184 * @author Syn Wee Quek
\r
187 public final class RuleBasedCollator extends Collator
\r
189 // public constructors ---------------------------------------------------
\r
193 * Constructor that takes the argument rules for
\r
194 * customization. The collator will be based on UCA,
\r
195 * with the attributes and re-ordering of the characters specified in the
\r
198 * <p>See the user guide's section on
\r
199 * <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
\r
200 * Collation Customization</a> for details on the rule syntax.
\r
202 * @param rules the collation rules to build the collation table from.
\r
203 * @exception ParseException and IOException thrown. ParseException thrown
\r
204 * when argument rules have an invalid syntax. IOException
\r
205 * thrown when an error occurred while reading internal data.
\r
208 public RuleBasedCollator(String rules) throws Exception
\r
211 if (rules == null) {
\r
212 throw new IllegalArgumentException(
\r
213 "Collation rules can not be null");
\r
218 // public methods --------------------------------------------------------
\r
221 * Clones the RuleBasedCollator
\r
222 * @return a new instance of this RuleBasedCollator object
\r
225 public Object clone() throws CloneNotSupportedException
\r
227 RuleBasedCollator result = (RuleBasedCollator)super.clone();
\r
228 if (latinOneCEs_ != null) {
\r
229 result.m_reallocLatinOneCEs_ = true;
\r
230 result.m_ContInfo_ = new ContractionInfo();
\r
233 // since all collation data in the RuleBasedCollator do not change
\r
234 // we can safely assign the result.fields to this collator
\r
235 result.initUtility(false); // let the new clone have their own util
\r
241 * Return a CollationElementIterator for the given String.
\r
242 * @see CollationElementIterator
\r
245 public CollationElementIterator getCollationElementIterator(String source)
\r
247 return new CollationElementIterator(source, this);
\r
251 * Return a CollationElementIterator for the given CharacterIterator.
\r
252 * The source iterator's integrity will be preserved since a new copy
\r
253 * will be created for use.
\r
254 * @see CollationElementIterator
\r
257 public CollationElementIterator getCollationElementIterator(
\r
258 CharacterIterator source)
\r
260 CharacterIterator newsource = (CharacterIterator)source.clone();
\r
261 return new CollationElementIterator(newsource, this);
\r
265 * Return a CollationElementIterator for the given UCharacterIterator.
\r
266 * The source iterator's integrity will be preserved since a new copy
\r
267 * will be created for use.
\r
268 * @see CollationElementIterator
\r
271 public CollationElementIterator getCollationElementIterator(
\r
272 UCharacterIterator source)
\r
274 return new CollationElementIterator(source, this);
\r
277 // public setters --------------------------------------------------------
\r
280 * Sets the Hiragana Quaternary mode to be on or off.
\r
281 * When the Hiragana Quaternary mode is turned on, the collator
\r
282 * positions Hiragana characters before all non-ignorable characters in
\r
283 * QUATERNARY strength. This is to produce a correct JIS collation order,
\r
284 * distinguishing between Katakana and Hiragana characters.
\r
285 * @param flag true if Hiragana Quaternary mode is to be on, false
\r
287 * @see #setHiraganaQuaternaryDefault
\r
288 * @see #isHiraganaQuaternary
\r
291 public void setHiraganaQuaternary(boolean flag)
\r
293 m_isHiragana4_ = flag;
\r
294 updateInternalState();
\r
298 * Sets the Hiragana Quaternary mode to the initial mode set during
\r
299 * construction of the RuleBasedCollator.
\r
300 * See setHiraganaQuaternary(boolean) for more details.
\r
301 * @see #setHiraganaQuaternary(boolean)
\r
302 * @see #isHiraganaQuaternary
\r
305 public void setHiraganaQuaternaryDefault()
\r
307 m_isHiragana4_ = m_defaultIsHiragana4_;
\r
308 updateInternalState();
\r
312 * Sets whether uppercase characters sort before lowercase
\r
313 * characters or vice versa, in strength TERTIARY. The default
\r
314 * mode is false, and so lowercase characters sort before uppercase
\r
316 * If true, sort upper case characters first.
\r
317 * @param upperfirst true to sort uppercase characters before
\r
318 * lowercase characters, false to sort lowercase
\r
319 * characters before uppercase characters
\r
320 * @see #isLowerCaseFirst
\r
321 * @see #isUpperCaseFirst
\r
322 * @see #setLowerCaseFirst
\r
323 * @see #setCaseFirstDefault
\r
326 public void setUpperCaseFirst(boolean upperfirst)
\r
329 if(m_caseFirst_ != AttributeValue.UPPER_FIRST_) {
\r
330 latinOneRegenTable_ = true;
\r
332 m_caseFirst_ = AttributeValue.UPPER_FIRST_;
\r
335 if(m_caseFirst_ != AttributeValue.OFF_) {
\r
336 latinOneRegenTable_ = true;
\r
338 m_caseFirst_ = AttributeValue.OFF_;
\r
340 updateInternalState();
\r
344 * Sets the orders of lower cased characters to sort before upper cased
\r
345 * characters, in strength TERTIARY. The default
\r
347 * If true is set, the RuleBasedCollator will sort lower cased characters
\r
348 * before the upper cased ones.
\r
349 * Otherwise, if false is set, the RuleBasedCollator will ignore case
\r
351 * @param lowerfirst true for sorting lower cased characters before
\r
352 * upper cased characters, false to ignore case
\r
354 * @see #isLowerCaseFirst
\r
355 * @see #isUpperCaseFirst
\r
356 * @see #setUpperCaseFirst
\r
357 * @see #setCaseFirstDefault
\r
360 public void setLowerCaseFirst(boolean lowerfirst)
\r
363 if(m_caseFirst_ != AttributeValue.LOWER_FIRST_) {
\r
364 latinOneRegenTable_ = true;
\r
366 m_caseFirst_ = AttributeValue.LOWER_FIRST_;
\r
369 if(m_caseFirst_ != AttributeValue.OFF_) {
\r
370 latinOneRegenTable_ = true;
\r
372 m_caseFirst_ = AttributeValue.OFF_;
\r
374 updateInternalState();
\r
378 * Sets the case first mode to the initial mode set during
\r
379 * construction of the RuleBasedCollator.
\r
380 * See setUpperCaseFirst(boolean) and setLowerCaseFirst(boolean) for more
\r
382 * @see #isLowerCaseFirst
\r
383 * @see #isUpperCaseFirst
\r
384 * @see #setLowerCaseFirst(boolean)
\r
385 * @see #setUpperCaseFirst(boolean)
\r
388 public final void setCaseFirstDefault()
\r
390 if(m_caseFirst_ != m_defaultCaseFirst_) {
\r
391 latinOneRegenTable_ = true;
\r
393 m_caseFirst_ = m_defaultCaseFirst_;
\r
394 updateInternalState();
\r
398 * Sets the alternate handling mode to the initial mode set during
\r
399 * construction of the RuleBasedCollator.
\r
400 * See setAlternateHandlingShifted(boolean) for more details.
\r
401 * @see #setAlternateHandlingShifted(boolean)
\r
402 * @see #isAlternateHandlingShifted()
\r
405 public void setAlternateHandlingDefault()
\r
407 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
\r
408 updateInternalState();
\r
412 * Sets the case level mode to the initial mode set during
\r
413 * construction of the RuleBasedCollator.
\r
414 * See setCaseLevel(boolean) for more details.
\r
415 * @see #setCaseLevel(boolean)
\r
416 * @see #isCaseLevel
\r
419 public void setCaseLevelDefault()
\r
421 m_isCaseLevel_ = m_defaultIsCaseLevel_;
\r
422 updateInternalState();
\r
426 * Sets the decomposition mode to the initial mode set during construction
\r
427 * of the RuleBasedCollator.
\r
428 * See setDecomposition(int) for more details.
\r
429 * @see #getDecomposition
\r
430 * @see #setDecomposition(int)
\r
433 public void setDecompositionDefault()
\r
435 setDecomposition(m_defaultDecomposition_);
\r
436 updateInternalState();
\r
440 * Sets the French collation mode to the initial mode set during
\r
441 * construction of the RuleBasedCollator.
\r
442 * See setFrenchCollation(boolean) for more details.
\r
443 * @see #isFrenchCollation
\r
444 * @see #setFrenchCollation(boolean)
\r
447 public void setFrenchCollationDefault()
\r
449 if(m_isFrenchCollation_ != m_defaultIsFrenchCollation_) {
\r
450 latinOneRegenTable_ = true;
\r
452 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
\r
453 updateInternalState();
\r
457 * Sets the collation strength to the initial mode set during the
\r
458 * construction of the RuleBasedCollator.
\r
459 * See setStrength(int) for more details.
\r
460 * @see #setStrength(int)
\r
461 * @see #getStrength
\r
464 public void setStrengthDefault()
\r
466 setStrength(m_defaultStrength_);
\r
467 updateInternalState();
\r
471 * Method to set numeric collation to its default value.
\r
472 * When numeric collation is turned on, this Collator generates a collation
\r
473 * key for the numeric value of substrings of digits. This is a way to get
\r
474 * '100' to sort AFTER '2'
\r
475 * @see #getNumericCollation
\r
476 * @see #setNumericCollation
\r
479 public void setNumericCollationDefault()
\r
481 setNumericCollation(m_defaultIsNumericCollation_);
\r
482 updateInternalState();
\r
486 * Sets the mode for the direction of SECONDARY weights to be used in
\r
487 * French collation.
\r
488 * The default value is false, which treats SECONDARY weights in the order
\r
490 * If set to true, the SECONDARY weights will be sorted backwards.
\r
491 * See the section on
\r
492 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
\r
493 * French collation</a> for more information.
\r
494 * @param flag true to set the French collation on, false to set it off
\r
496 * @see #isFrenchCollation
\r
497 * @see #setFrenchCollationDefault
\r
499 public void setFrenchCollation(boolean flag)
\r
501 if(m_isFrenchCollation_ != flag) {
\r
502 latinOneRegenTable_ = true;
\r
504 m_isFrenchCollation_ = flag;
\r
505 updateInternalState();
\r
509 * Sets the alternate handling for QUATERNARY strength to be either
\r
510 * shifted or non-ignorable.
\r
511 * See the UCA definition on
\r
512 * <a href="http://www.unicode.org/unicode/reports/tr10/#Variable_Weighting">
\r
513 * Alternate Weighting</a>.
\r
514 * This attribute will only be effective when QUATERNARY strength is set.
\r
515 * The default value for this mode is false, corresponding to the
\r
516 * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the
\r
517 * RuleBasedCollator will treat all the codepoints with non-ignorable
\r
518 * primary weights in the same way.
\r
519 * If the mode is set to true, the behaviour corresponds to SHIFTED defined
\r
520 * in UCA, this causes codepoints with PRIMARY orders that are equal or
\r
521 * below the variable top value to be ignored in PRIMARY order and
\r
522 * moved to the QUATERNARY order.
\r
523 * @param shifted true if SHIFTED behaviour for alternate handling is
\r
524 * desired, false for the NON_IGNORABLE behaviour.
\r
525 * @see #isAlternateHandlingShifted
\r
526 * @see #setAlternateHandlingDefault
\r
529 public void setAlternateHandlingShifted(boolean shifted)
\r
531 m_isAlternateHandlingShifted_ = shifted;
\r
532 updateInternalState();
\r
537 * When case level is set to true, an additional weight is formed
\r
538 * between the SECONDARY and TERTIARY weight, known as the case level.
\r
539 * The case level is used to distinguish large and small Japanese Kana
\r
540 * characters. Case level could also be used in other situations.
\r
541 * For example to distinguish certain Pinyin characters.
\r
542 * The default value is false, which means the case level is not generated.
\r
543 * The contents of the case level are affected by the case first
\r
544 * mode. A simple way to ignore accent differences in a string is to set
\r
545 * the strength to PRIMARY and enable case level.
\r
548 * See the section on
\r
549 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
\r
550 * case level</a> for more information.
\r
552 * @param flag true if case level sorting is required, false otherwise
\r
554 * @see #setCaseLevelDefault
\r
555 * @see #isCaseLevel
\r
557 public void setCaseLevel(boolean flag)
\r
559 m_isCaseLevel_ = flag;
\r
560 updateInternalState();
\r
565 * Sets this Collator's strength property. The strength property
\r
566 * determines the minimum level of difference considered significant
\r
567 * during comparison.
\r
569 * <p>See the Collator class description for an example of use.</p>
\r
570 * @param newStrength the new strength value.
\r
571 * @see #getStrength
\r
572 * @see #setStrengthDefault
\r
578 * @exception IllegalArgumentException If the new strength value is not one
\r
579 * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL.
\r
582 public void setStrength(int newStrength)
\r
584 super.setStrength(newStrength);
\r
585 updateInternalState();
\r
590 * Variable top is a two byte primary value which causes all the codepoints
\r
591 * with primary values that are less than or equal to the variable top to be
\r
592 * shifted when alternate handling is set to SHIFTED.
\r
595 * Sets the variable top to a collation element value of a string supplied.
\r
597 * @param varTop one or more (if contraction) characters to which the
\r
598 * variable top should be set
\r
599 * @return an int value containing the value of the variable top in upper 16
\r
600 * bits. Lower 16 bits are undefined.
\r
601 * @exception IllegalArgumentException is thrown if varTop argument is not
\r
602 * a valid variable top element. A variable top element is
\r
605 * <li>it is a contraction that does not exist in the
\r
607 * <li>when the PRIMARY strength collation element for the
\r
608 * variable top has more than two bytes
\r
609 * <li>when the varTop argument is null or zero in length.
\r
611 * @see #getVariableTop
\r
612 * @see RuleBasedCollator#setAlternateHandlingShifted
\r
615 public int setVariableTop(String varTop)
\r
617 if (varTop == null || varTop.length() == 0) {
\r
618 throw new IllegalArgumentException(
\r
619 "Variable top argument string can not be null or zero in length.");
\r
621 if (m_srcUtilIter_ == null) {
\r
625 m_srcUtilColEIter_.setText(varTop);
\r
626 int ce = m_srcUtilColEIter_.next();
\r
628 // here we check if we have consumed all characters
\r
629 // you can put in either one character or a contraction
\r
630 // you shouldn't put more...
\r
631 if (m_srcUtilColEIter_.getOffset() != varTop.length()
\r
632 || ce == CollationElementIterator.NULLORDER) {
\r
633 throw new IllegalArgumentException(
\r
634 "Variable top argument string is a contraction that does not exist "
\r
635 + "in the Collation order");
\r
638 int nextCE = m_srcUtilColEIter_.next();
\r
640 if ((nextCE != CollationElementIterator.NULLORDER)
\r
641 && (!isContinuation(nextCE) || (nextCE & CE_PRIMARY_MASK_) != 0)) {
\r
642 throw new IllegalArgumentException(
\r
643 "Variable top argument string can only have a single collation "
\r
644 + "element that has less than or equal to two PRIMARY strength "
\r
648 m_variableTopValue_ = (ce & CE_PRIMARY_MASK_) >> 16;
\r
650 return ce & CE_PRIMARY_MASK_;
\r
654 * Sets the variable top to a collation element value supplied.
\r
655 * Variable top is set to the upper 16 bits.
\r
656 * Lower 16 bits are ignored.
\r
657 * @param varTop Collation element value, as returned by setVariableTop or
\r
659 * @see #getVariableTop
\r
660 * @see #setVariableTop(String)
\r
663 public void setVariableTop(int varTop)
\r
665 m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
\r
669 * When numeric collation is turned on, this Collator generates a collation
\r
670 * key for the numeric value of substrings of digits. This is a way to get
\r
671 * '100' to sort AFTER '2'
\r
672 * @param flag true to turn numeric collation on and false to turn it off
\r
673 * @see #getNumericCollation
\r
674 * @see #setNumericCollationDefault
\r
677 public void setNumericCollation(boolean flag)
\r
679 // sort substrings of digits as numbers
\r
680 m_isNumericCollation_ = flag;
\r
681 updateInternalState();
\r
684 // public getters --------------------------------------------------------
\r
687 * Gets the collation rules for this RuleBasedCollator.
\r
688 * Equivalent to String getRules(RuleOption.FULL_RULES).
\r
689 * @return returns the collation rules
\r
690 * @see #getRules(boolean)
\r
693 public String getRules()
\r
699 * Returns current rules. The argument defines whether full rules
\r
700 * (UCA + tailored) rules are returned or just the tailoring.
\r
701 * @param fullrules true if the rules that define the full set of
\r
702 * collation order is required, otherwise false for returning only
\r
703 * the tailored rules
\r
704 * @return the current rules that define this Collator.
\r
708 public String getRules(boolean fullrules)
\r
713 // take the UCA rules and append real rules at the end
\r
714 return UCA_.m_rules_.concat(m_rules_);
\r
718 * Get a UnicodeSet that contains all the characters and sequences
\r
719 * tailored in this collator.
\r
720 * @return a pointer to a UnicodeSet object containing all the
\r
721 * code points and sequences that may sort differently than
\r
725 public UnicodeSet getTailoredSet()
\r
728 CollationRuleParser src = new CollationRuleParser(getRules());
\r
729 return src.getTailoredSet();
\r
730 } catch(Exception e) {
\r
731 throw new IllegalStateException("A tailoring rule should not " +
\r
732 "have errors. Something is quite wrong!");
\r
736 private class contContext {
\r
737 RuleBasedCollator coll;
\r
738 UnicodeSet contractions;
\r
739 UnicodeSet expansions;
\r
740 UnicodeSet removedContractions;
\r
741 boolean addPrefixes;
\r
742 contContext(RuleBasedCollator coll, UnicodeSet contractions, UnicodeSet expansions,
\r
743 UnicodeSet removedContractions, boolean addPrefixes) {
\r
745 this.contractions = contractions;
\r
746 this.expansions = expansions;
\r
747 this.removedContractions = removedContractions;
\r
748 this.addPrefixes = addPrefixes;
\r
753 addSpecial(contContext c, StringBuilder buffer, int CE)
\r
755 StringBuilder b = new StringBuilder();
\r
756 int offset = (CE & 0xFFFFFF) - c.coll.m_contractionOffset_;
\r
757 int newCE = c.coll.m_contractionCE_[offset];
\r
758 // we might have a contraction that ends from previous level
\r
759 if(newCE != CollationElementIterator.CE_NOT_FOUND_) {
\r
760 if(isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_
\r
761 && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_
\r
762 && c.addPrefixes) {
\r
763 addSpecial(c, buffer, newCE);
\r
765 if(buffer.length() > 1) {
\r
766 if(c.contractions != null) {
\r
767 c.contractions.add(buffer.toString());
\r
769 if(c.expansions != null && isSpecial(CE) && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
770 c.expansions.add(buffer.toString());
\r
776 // check whether we're doing contraction or prefix
\r
777 if(getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) {
\r
778 while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
\r
779 b.delete(0, b.length());
\r
781 newCE = c.coll.m_contractionCE_[offset];
\r
782 b.insert(0, c.coll.m_contractionIndex_[offset]);
\r
783 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
\r
784 addSpecial(c, b, newCE);
\r
786 if(c.contractions != null) {
\r
787 c.contractions.add(b.toString());
\r
789 if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
790 c.expansions.add(b.toString());
\r
795 } else if(getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_) {
\r
796 while(c.coll.m_contractionIndex_[offset] != 0xFFFF) {
\r
797 b.delete(0, b.length());
\r
799 newCE = c.coll.m_contractionCE_[offset];
\r
800 b.append(c.coll.m_contractionIndex_[offset]);
\r
801 if(isSpecial(newCE) && (getTag(newCE) == CollationElementIterator.CE_CONTRACTION_TAG_ || getTag(newCE) == CollationElementIterator.CE_SPEC_PROC_TAG_)) {
\r
802 addSpecial(c, b, newCE);
\r
804 if(c.contractions != null) {
\r
805 c.contractions.add(b.toString());
\r
807 if(c.expansions != null && isSpecial(newCE) && getTag(newCE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
808 c.expansions.add(b.toString());
\r
817 void processSpecials(contContext c)
\r
819 int internalBufferSize = 512;
\r
820 TrieIterator trieiterator
\r
821 = new TrieIterator(c.coll.m_trie_);
\r
822 RangeValueIterator.Element element = new RangeValueIterator.Element();
\r
823 while (trieiterator.next(element)) {
\r
824 int start = element.start;
\r
825 int limit = element.limit;
\r
826 int CE = element.value;
\r
827 StringBuilder contraction = new StringBuilder(internalBufferSize);
\r
829 if(isSpecial(CE)) {
\r
830 if(((getTag(CE) == CollationElementIterator.CE_SPEC_PROC_TAG_ && c.addPrefixes) || getTag(CE) == CollationElementIterator.CE_CONTRACTION_TAG_)) {
\r
831 while(start < limit) {
\r
832 // if there are suppressed contractions, we don't
\r
833 // want to add them.
\r
834 if(c.removedContractions != null && c.removedContractions.contains(start)) {
\r
838 // we start our contraction from middle, since we don't know if it
\r
839 // will grow toward right or left
\r
840 contraction.append((char) start);
\r
841 addSpecial(c, contraction, CE);
\r
844 } else if(c.expansions != null && getTag(CE) == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
845 while(start < limit) {
\r
846 c.expansions.add(start++);
\r
854 * Gets unicode sets containing contractions and/or expansions of a collator
\r
855 * @param contractions if not null, set to contain contractions
\r
856 * @param expansions if not null, set to contain expansions
\r
857 * @param addPrefixes add the prefix contextual elements to contractions
\r
858 * @throws Exception Throws an exception if any error occurs.
\r
862 getContractionsAndExpansions(UnicodeSet contractions, UnicodeSet expansions,
\r
863 boolean addPrefixes) throws Exception {
\r
864 if(contractions != null) {
\r
865 contractions.clear();
\r
867 if(expansions != null) {
\r
868 expansions.clear();
\r
870 String rules = getRules();
\r
872 CollationRuleParser src = new CollationRuleParser(rules);
\r
873 contContext c = new contContext(RuleBasedCollator.UCA_,
\r
874 contractions, expansions, src.m_removeSet_, addPrefixes);
\r
876 // Add the UCA contractions
\r
877 processSpecials(c);
\r
878 // This is collator specific. Add contractions from a collator
\r
880 c.removedContractions = null;
\r
881 processSpecials(c);
\r
882 } catch (Exception e) {
\r
889 * Get a Collation key for the argument String source from this
\r
890 * RuleBasedCollator.
\r
893 * General recommendation: <br>
\r
894 * If comparisons are to be done on the same String multiple times, it would
\r
895 * be more efficient to generate CollationKeys for the Strings and use
\r
896 * CollationKey.compareTo(CollationKey) for the comparisons.
\r
897 * If each String is compared only once, using the method
\r
898 * RuleBasedCollator.compare(String, String) will have a better performance.
\r
901 * See the class documentation for an explanation about CollationKeys.
\r
903 * @param source the text String to be transformed into a collation key.
\r
904 * @return the CollationKey for the given String based on this
\r
905 * RuleBasedCollator's collation rules. If the source String is
\r
906 * null, a null CollationKey is returned.
\r
907 * @see CollationKey
\r
908 * @see #compare(String, String)
\r
909 * @see #getRawCollationKey
\r
912 public CollationKey getCollationKey(String source) {
\r
913 if (source == null) {
\r
916 m_utilRawCollationKey_ = getRawCollationKey(source,
\r
917 m_utilRawCollationKey_);
\r
918 return new CollationKey(source, m_utilRawCollationKey_);
\r
922 * Gets the simpler form of a CollationKey for the String source following
\r
923 * the rules of this Collator and stores the result into the user provided
\r
925 * If key has an internal byte array of length that's too small for the
\r
926 * result, the internal byte array will be grown to the exact required
\r
928 * @param source the text String to be transformed into a RawCollationKey
\r
929 * @param key output RawCollationKey to store results
\r
930 * @return If key is null, a new instance of RawCollationKey will be
\r
931 * created and returned, otherwise the user provided key will be
\r
933 * @see #getCollationKey
\r
934 * @see #compare(String, String)
\r
935 * @see RawCollationKey
\r
938 public RawCollationKey getRawCollationKey(String source,
\r
939 RawCollationKey key)
\r
941 if (source == null) {
\r
944 int strength = getStrength();
\r
945 m_utilCompare0_ = m_isCaseLevel_;
\r
946 //m_utilCompare1_ = true;
\r
947 m_utilCompare2_ = strength >= SECONDARY;
\r
948 m_utilCompare3_ = strength >= TERTIARY;
\r
949 m_utilCompare4_ = strength >= QUATERNARY;
\r
950 m_utilCompare5_ = strength == IDENTICAL;
\r
952 m_utilBytesCount0_ = 0;
\r
953 m_utilBytesCount1_ = 0;
\r
954 m_utilBytesCount2_ = 0;
\r
955 m_utilBytesCount3_ = 0;
\r
956 m_utilBytesCount4_ = 0;
\r
957 //m_utilBytesCount5_ = 0;
\r
958 //m_utilCount0_ = 0;
\r
959 //m_utilCount1_ = 0;
\r
963 //m_utilCount5_ = 0;
\r
964 boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
\r
965 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted.
\r
966 // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so
\r
968 int commonBottom4 = ((m_variableTopValue_ >>> 8) + 1) & LAST_BYTE_MASK_;
\r
969 byte hiragana4 = 0;
\r
970 if (m_isHiragana4_ && m_utilCompare4_) {
\r
971 // allocate one more space for hiragana, value for hiragana
\r
972 hiragana4 = (byte)commonBottom4;
\r
976 int bottomCount4 = 0xFF - commonBottom4;
\r
977 // If we need to normalize, we'll do it all at once at the beginning!
\r
978 if (m_utilCompare5_ && Normalizer.quickCheck(source, Normalizer.NFD,0)
\r
979 != Normalizer.YES) {
\r
980 // if it is identical strength, we have to normalize the string to
\r
981 // NFD so that it will be appended correctly to the end of the sort
\r
983 source = Normalizer.decompose(source, false);
\r
985 else if (getDecomposition() != NO_DECOMPOSITION
\r
986 && Normalizer.quickCheck(source, Normalizer.FCD,0)
\r
987 != Normalizer.YES) {
\r
988 // for the rest of the strength, if decomposition is on, FCD is
\r
989 // enough for us to work on.
\r
990 source = Normalizer.normalize(source,Normalizer.FCD);
\r
992 getSortKeyBytes(source, doFrench, hiragana4, commonBottom4,
\r
995 key = new RawCollationKey();
\r
997 getSortKey(source, doFrench, commonBottom4, bottomCount4, key);
\r
1002 * Return true if an uppercase character is sorted before the corresponding lowercase character.
\r
1003 * See setCaseFirst(boolean) for details.
\r
1004 * @see #setUpperCaseFirst
\r
1005 * @see #setLowerCaseFirst
\r
1006 * @see #isLowerCaseFirst
\r
1007 * @see #setCaseFirstDefault
\r
1008 * @return true if upper cased characters are sorted before lower cased
\r
1009 * characters, false otherwise
\r
1012 public boolean isUpperCaseFirst()
\r
1014 return (m_caseFirst_ == AttributeValue.UPPER_FIRST_);
\r
1018 * Return true if a lowercase character is sorted before the corresponding uppercase character.
\r
1019 * See setCaseFirst(boolean) for details.
\r
1020 * @see #setUpperCaseFirst
\r
1021 * @see #setLowerCaseFirst
\r
1022 * @see #isUpperCaseFirst
\r
1023 * @see #setCaseFirstDefault
\r
1024 * @return true lower cased characters are sorted before upper cased
\r
1025 * characters, false otherwise
\r
1028 public boolean isLowerCaseFirst()
\r
1030 return (m_caseFirst_ == AttributeValue.LOWER_FIRST_);
\r
1034 * Checks if the alternate handling behaviour is the UCA defined SHIFTED or
\r
1036 * If return value is true, then the alternate handling attribute for the
\r
1037 * Collator is SHIFTED. Otherwise if return value is false, then the
\r
1038 * alternate handling attribute for the Collator is NON_IGNORABLE
\r
1039 * See setAlternateHandlingShifted(boolean) for more details.
\r
1040 * @return true or false
\r
1041 * @see #setAlternateHandlingShifted(boolean)
\r
1042 * @see #setAlternateHandlingDefault
\r
1045 public boolean isAlternateHandlingShifted()
\r
1047 return m_isAlternateHandlingShifted_;
\r
1051 * Checks if case level is set to true.
\r
1052 * See setCaseLevel(boolean) for details.
\r
1053 * @return the case level mode
\r
1054 * @see #setCaseLevelDefault
\r
1055 * @see #isCaseLevel
\r
1056 * @see #setCaseLevel(boolean)
\r
1059 public boolean isCaseLevel()
\r
1061 return m_isCaseLevel_;
\r
1065 * Checks if French Collation is set to true.
\r
1066 * See setFrenchCollation(boolean) for details.
\r
1067 * @return true if French Collation is set to true, false otherwise
\r
1068 * @see #setFrenchCollation(boolean)
\r
1069 * @see #setFrenchCollationDefault
\r
1072 public boolean isFrenchCollation()
\r
1074 return m_isFrenchCollation_;
\r
1078 * Checks if the Hiragana Quaternary mode is set on.
\r
1079 * See setHiraganaQuaternary(boolean) for more details.
\r
1080 * @return flag true if Hiragana Quaternary mode is on, false otherwise
\r
1081 * @see #setHiraganaQuaternaryDefault
\r
1082 * @see #setHiraganaQuaternary(boolean)
\r
1085 public boolean isHiraganaQuaternary()
\r
1087 return m_isHiragana4_;
\r
1091 * Gets the variable top value of a Collator.
\r
1092 * Lower 16 bits are undefined and should be ignored.
\r
1093 * @return the variable top value of a Collator.
\r
1094 * @see #setVariableTop
\r
1097 public int getVariableTop()
\r
1099 return m_variableTopValue_ << 16;
\r
1103 * Method to retrieve the numeric collation value.
\r
1104 * When numeric collation is turned on, this Collator generates a collation
\r
1105 * key for the numeric value of substrings of digits. This is a way to get
\r
1106 * '100' to sort AFTER '2'
\r
1107 * @see #setNumericCollation
\r
1108 * @see #setNumericCollationDefault
\r
1109 * @return true if numeric collation is turned on, false otherwise
\r
1112 public boolean getNumericCollation()
\r
1114 return m_isNumericCollation_;
\r
1117 // public other methods -------------------------------------------------
\r
1120 * Compares the equality of two RuleBasedCollator objects.
\r
1121 * RuleBasedCollator objects are equal if they have the same collation
\r
1122 * rules and the same attributes.
\r
1123 * @param obj the RuleBasedCollator to be compared to.
\r
1124 * @return true if this RuleBasedCollator has exactly the same
\r
1125 * collation behaviour as obj, false otherwise.
\r
1128 public boolean equals(Object obj)
\r
1130 if (obj == null) {
\r
1131 return false; // super does class check
\r
1133 if (this == obj) {
\r
1136 if (getClass() != obj.getClass()) {
\r
1139 RuleBasedCollator other = (RuleBasedCollator)obj;
\r
1140 // all other non-transient information is also contained in rules.
\r
1141 if (getStrength() != other.getStrength()
\r
1142 || getDecomposition() != other.getDecomposition()
\r
1143 || other.m_caseFirst_ != m_caseFirst_
\r
1144 || other.m_caseSwitch_ != m_caseSwitch_
\r
1145 || other.m_isAlternateHandlingShifted_
\r
1146 != m_isAlternateHandlingShifted_
\r
1147 || other.m_isCaseLevel_ != m_isCaseLevel_
\r
1148 || other.m_isFrenchCollation_ != m_isFrenchCollation_
\r
1149 || other.m_isHiragana4_ != m_isHiragana4_) {
\r
1152 boolean rules = m_rules_ == other.m_rules_;
\r
1153 if (!rules && (m_rules_ != null && other.m_rules_ != null)) {
\r
1154 rules = m_rules_.equals(other.m_rules_);
\r
1156 if (!rules || !ICUDebug.enabled("collation")) {
\r
1159 if (m_addition3_ != other.m_addition3_
\r
1160 || m_bottom3_ != other.m_bottom3_
\r
1161 || m_bottomCount3_ != other.m_bottomCount3_
\r
1162 || m_common3_ != other.m_common3_
\r
1163 || m_isSimple3_ != other.m_isSimple3_
\r
1164 || m_mask3_ != other.m_mask3_
\r
1165 || m_minContractionEnd_ != other.m_minContractionEnd_
\r
1166 || m_minUnsafe_ != other.m_minUnsafe_
\r
1167 || m_top3_ != other.m_top3_
\r
1168 || m_topCount3_ != other.m_topCount3_
\r
1169 || !Arrays.equals(m_unsafe_, other.m_unsafe_)) {
\r
1172 if (!m_trie_.equals(other.m_trie_)) {
\r
1173 // we should use the trie iterator here, but then this part is
\r
1174 // only used in the test.
\r
1175 for (int i = UCharacter.MAX_VALUE; i >= UCharacter.MIN_VALUE; i --)
\r
1177 int v = m_trie_.getCodePointValue(i);
\r
1178 int otherv = other.m_trie_.getCodePointValue(i);
\r
1179 if (v != otherv) {
\r
1180 int mask = v & (CE_TAG_MASK_ | CE_SPECIAL_FLAG_);
\r
1181 if (mask == (otherv & 0xff000000)) {
\r
1183 otherv &= 0xffffff;
\r
1184 if (mask == 0xf1000000) {
\r
1185 v -= (m_expansionOffset_ << 4);
\r
1186 otherv -= (other.m_expansionOffset_ << 4);
\r
1188 else if (mask == 0xf2000000) {
\r
1189 v -= m_contractionOffset_;
\r
1190 otherv -= other.m_contractionOffset_;
\r
1192 if (v == otherv) {
\r
1200 if (!Arrays.equals(m_contractionCE_, other.m_contractionCE_)
\r
1201 || !Arrays.equals(m_contractionEnd_, other.m_contractionEnd_)
\r
1202 || !Arrays.equals(m_contractionIndex_, other.m_contractionIndex_)
\r
1203 || !Arrays.equals(m_expansion_, other.m_expansion_)
\r
1204 || !Arrays.equals(m_expansionEndCE_, other.m_expansionEndCE_)) {
\r
1207 // not comparing paddings
\r
1208 for (int i = 0; i < m_expansionEndCE_.length; i++) {
\r
1209 if (m_expansionEndCEMaxSize_[i] != other.m_expansionEndCEMaxSize_[i]) {
\r
1217 * Generates a hash code for this RuleBasedCollator, derived from its rules.
\r
1218 * @return the hash code for this Collator
\r
1221 public int hashCode()
\r
1223 String rules = getRules();
\r
1224 if (rules == null) {
\r
1227 return rules.hashCode();
\r
1231 * Compares the source text String to the target text String according to
\r
1232 * the collation rules, strength and decomposition mode for this
\r
1233 * RuleBasedCollator.
\r
1234 * Returns an integer less than,
\r
1235 * equal to or greater than zero depending on whether the source String is
\r
1236 * less than, equal to or greater than the target String. See the Collator
\r
1237 * class description for an example of use.
\r
1240 * General recommendation: <br>
\r
1241 * If comparison are to be done to the same String multiple times, it would
\r
1242 * be more efficient to generate CollationKeys for the Strings and use
\r
1243 * CollationKey.compareTo(CollationKey) for the comparisons.
\r
1244 * If speed performance is critical and object instantiation is to be
\r
1245 * reduced, further optimization may be achieved by generating a simpler
\r
1246 * key of the form RawCollationKey and reusing this RawCollationKey
\r
1247 * object with the method RuleBasedCollator.getRawCollationKey. Internal
\r
1248 * byte representation can be directly accessed via RawCollationKey and
\r
1249 * stored for future use. Like CollationKey, RawCollationKey provides a
\r
1250 * method RawCollationKey.compareTo for key comparisons.
\r
1251 * If the each Strings are compared to only once, using the method
\r
1252 * RuleBasedCollator.compare(String, String) will have a better performance.
\r
1254 * @param source the source text String.
\r
1255 * @param target the target text String.
\r
1256 * @return Returns an integer value. Value is less than zero if source is
\r
1257 * less than target, value is zero if source and target are equal,
\r
1258 * value is greater than zero if source is greater than target.
\r
1259 * @see CollationKey
\r
1260 * @see #getCollationKey
\r
1263 public int compare(String source, String target)
\r
1265 if (source == target) {
\r
1269 // Find the length of any leading portion that is equal
\r
1270 int offset = getFirstUnmatchedOffset(source, target);
\r
1271 //return compareRegular(source, target, offset);
\r
1272 if(latinOneUse_) {
\r
1273 if ((offset < source.length()
\r
1274 && source.charAt(offset) > ENDOFLATINONERANGE_)
\r
1275 || (offset < target.length()
\r
1276 && target.charAt(offset) > ENDOFLATINONERANGE_)) {
\r
1277 // source or target start with non-latin-1
\r
1278 return compareRegular(source, target, offset);
\r
1280 return compareUseLatin1(source, target, offset);
\r
1283 return compareRegular(source, target, offset);
\r
1287 // package private inner interfaces --------------------------------------
\r
1290 * Attribute values to be used when setting the Collator options
\r
1292 static interface AttributeValue
\r
1295 * Indicates that the default attribute value will be used.
\r
1296 * See individual attribute for details on its default value.
\r
1298 static final int DEFAULT_ = -1;
\r
1300 * Primary collation strength
\r
1302 static final int PRIMARY_ = Collator.PRIMARY;
\r
1304 * Secondary collation strength
\r
1306 static final int SECONDARY_ = Collator.SECONDARY;
\r
1308 * Tertiary collation strength
\r
1310 static final int TERTIARY_ = Collator.TERTIARY;
\r
1312 * Default collation strength
\r
1314 static final int DEFAULT_STRENGTH_ = Collator.TERTIARY;
\r
1316 * Internal use for strength checks in Collation elements
\r
1318 static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1;
\r
1320 * Quaternary collation strength
\r
1322 static final int QUATERNARY_ = 3;
\r
1324 * Identical collation strength
\r
1326 static final int IDENTICAL_ = Collator.IDENTICAL;
\r
1328 * Internal use for strength checks
\r
1330 static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1;
\r
1332 * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL,
\r
1333 * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
\r
1335 static final int OFF_ = 16;
\r
1337 * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL,
\r
1338 * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE
\r
1340 static final int ON_ = 17;
\r
1342 * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted
\r
1344 static final int SHIFTED_ = 20;
\r
1346 * Valid for ALTERNATE_HANDLING. Alternate handling will be non
\r
1349 static final int NON_IGNORABLE_ = 21;
\r
1351 * Valid for CASE_FIRST - lower case sorts before upper case
\r
1353 static final int LOWER_FIRST_ = 24;
\r
1355 * Upper case sorts before lower case
\r
1357 static final int UPPER_FIRST_ = 25;
\r
1359 * Number of attribute values
\r
1361 static final int LIMIT_ = 29;
\r
1365 * Attributes that collation service understands. All the attributes can
\r
1366 * take DEFAULT value, as well as the values specific to each one.
\r
1368 static interface Attribute
\r
1371 * Attribute for direction of secondary weights - used in French.
\r
1372 * Acceptable values are ON, which results in secondary weights being
\r
1373 * considered backwards and OFF which treats secondary weights in the
\r
1374 * order they appear.
\r
1376 static final int FRENCH_COLLATION_ = 0;
\r
1378 * Attribute for handling variable elements. Acceptable values are
\r
1379 * NON_IGNORABLE (default) which treats all the codepoints with
\r
1380 * non-ignorable primary weights in the same way, and SHIFTED which
\r
1381 * causes codepoints with primary weights that are equal or below the
\r
1382 * variable top value to be ignored on primary level and moved to the
\r
1383 * quaternary level.
\r
1385 static final int ALTERNATE_HANDLING_ = 1;
\r
1387 * Controls the ordering of upper and lower case letters. Acceptable
\r
1388 * values are OFF (default), which orders upper and lower case letters
\r
1389 * in accordance to their tertiary weights, UPPER_FIRST which forces
\r
1390 * upper case letters to sort before lower case letters, and
\r
1391 * LOWER_FIRST which does the opposite.
\r
1393 static final int CASE_FIRST_ = 2;
\r
1395 * Controls whether an extra case level (positioned before the third
\r
1396 * level) is generated or not. Acceptable values are OFF (default),
\r
1397 * when case level is not generated, and ON which causes the case
\r
1398 * level to be generated. Contents of the case level are affected by
\r
1399 * the value of CASE_FIRST attribute. A simple way to ignore accent
\r
1400 * differences in a string is to set the strength to PRIMARY and
\r
1401 * enable case level.
\r
1403 static final int CASE_LEVEL_ = 3;
\r
1405 * Controls whether the normalization check and necessary
\r
1406 * normalizations are performed. When set to OFF (default) no
\r
1407 * normalization check is performed. The correctness of the result is
\r
1408 * guaranteed only if the input data is in so-called FCD form (see
\r
1409 * users manual for more info). When set to ON, an incremental check
\r
1410 * is performed to see whether the input data is in the FCD form. If
\r
1411 * the data is not in the FCD form, incremental NFD normalization is
\r
1414 static final int NORMALIZATION_MODE_ = 4;
\r
1416 * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY,
\r
1417 * QUATERNARY or IDENTICAL. The usual strength for most locales
\r
1418 * (except Japanese) is tertiary. Quaternary strength is useful when
\r
1419 * combined with shifted setting for alternate handling attribute and
\r
1420 * for JIS x 4061 collation, when it is used to distinguish between
\r
1421 * Katakana and Hiragana (this is achieved by setting the
\r
1422 * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is
\r
1423 * affected only by the number of non ignorable code points in the
\r
1424 * string. Identical strength is rarely useful, as it amounts to
\r
1425 * codepoints of the NFD form of the string.
\r
1427 static final int STRENGTH_ = 5;
\r
1429 * When turned on, this attribute positions Hiragana before all
\r
1430 * non-ignorables on quaternary level. This is a sneaky way to produce
\r
1433 static final int HIRAGANA_QUATERNARY_MODE_ = 6;
\r
1437 static final int LIMIT_ = 7;
\r
1441 * DataManipulate singleton
\r
1443 static class DataManipulate implements Trie.DataManipulate
\r
1445 // public methods ----------------------------------------------------
\r
1448 * Internal method called to parse a lead surrogate's ce for the offset
\r
1449 * to the next trail surrogate data.
\r
1450 * @param ce collation element of the lead surrogate
\r
1451 * @return data offset or 0 for the next trail surrogate
\r
1454 public final int getFoldingOffset(int ce)
\r
1456 if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) {
\r
1457 return (ce & 0xFFFFFF);
\r
1463 * Get singleton object
\r
1465 public static final DataManipulate getInstance()
\r
1467 if (m_instance_ == null) {
\r
1468 m_instance_ = new DataManipulate();
\r
1470 return m_instance_;
\r
1473 // private data member ----------------------------------------------
\r
1476 * Singleton instance
\r
1478 private static DataManipulate m_instance_;
\r
1480 // private constructor ----------------------------------------------
\r
1483 * private to prevent initialization
\r
1485 private DataManipulate()
\r
1493 static final class UCAConstants
\r
1495 int FIRST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
\r
1496 int LAST_TERTIARY_IGNORABLE_[] = new int[2]; // 0x00000000
\r
1497 int FIRST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x00008705
\r
1498 int FIRST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000000
\r
1499 int LAST_SECONDARY_IGNORABLE_[] = new int[2]; // 0x00000500
\r
1500 int LAST_PRIMARY_IGNORABLE_[] = new int[2]; // 0x0000DD05
\r
1501 int FIRST_VARIABLE_[] = new int[2]; // 0x05070505
\r
1502 int LAST_VARIABLE_[] = new int[2]; // 0x13CF0505
\r
1503 int FIRST_NON_VARIABLE_[] = new int[2]; // 0x16200505
\r
1504 int LAST_NON_VARIABLE_[] = new int[2]; // 0x767C0505
\r
1505 int RESET_TOP_VALUE_[] = new int[2]; // 0x9F000303
\r
1506 int FIRST_IMPLICIT_[] = new int[2];
\r
1507 int LAST_IMPLICIT_[] = new int[2];
\r
1508 int FIRST_TRAILING_[] = new int[2];
\r
1509 int LAST_TRAILING_[] = new int[2];
\r
1510 int PRIMARY_TOP_MIN_;
\r
1511 int PRIMARY_IMPLICIT_MIN_; // 0xE8000000
\r
1512 int PRIMARY_IMPLICIT_MAX_; // 0xF0000000
\r
1513 int PRIMARY_TRAILING_MIN_; // 0xE8000000
\r
1514 int PRIMARY_TRAILING_MAX_; // 0xF0000000
\r
1515 int PRIMARY_SPECIAL_MIN_; // 0xE8000000
\r
1516 int PRIMARY_SPECIAL_MAX_; // 0xF0000000
\r
1519 // package private data member -------------------------------------------
\r
1521 static final byte BYTE_FIRST_TAILORED_ = (byte)0x04;
\r
1522 static final byte BYTE_COMMON_ = (byte)0x05;
\r
1523 static final int COMMON_TOP_2_ = 0x86; // int for unsigness
\r
1524 static final int COMMON_BOTTOM_2_ = BYTE_COMMON_;
\r
1525 static final int COMMON_BOTTOM_3 = 0x05;
\r
1527 * Case strength mask
\r
1529 static final int CE_CASE_BIT_MASK_ = 0xC0;
\r
1530 static final int CE_TAG_SHIFT_ = 24;
\r
1531 static final int CE_TAG_MASK_ = 0x0F000000;
\r
1533 static final int CE_SPECIAL_FLAG_ = 0xF0000000;
\r
1535 * Lead surrogate that is tailored and doesn't start a contraction
\r
1537 static final int CE_SURROGATE_TAG_ = 5;
\r
1539 * Mask to get the primary strength of the collation element
\r
1541 static final int CE_PRIMARY_MASK_ = 0xFFFF0000;
\r
1543 * Mask to get the secondary strength of the collation element
\r
1545 static final int CE_SECONDARY_MASK_ = 0xFF00;
\r
1547 * Mask to get the tertiary strength of the collation element
\r
1549 static final int CE_TERTIARY_MASK_ = 0xFF;
\r
1551 * Primary strength shift
\r
1553 static final int CE_PRIMARY_SHIFT_ = 16;
\r
1555 * Secondary strength shift
\r
1557 static final int CE_SECONDARY_SHIFT_ = 8;
\r
1559 * Continuation marker
\r
1561 static final int CE_CONTINUATION_MARKER_ = 0xC0;
\r
1564 * Size of collator raw data headers and options before the expansion
\r
1565 * data. This is used when expansion ces are to be retrieved. ICU4C uses
\r
1566 * the expansion offset starting from UCollator.UColHeader, hence ICU4J
\r
1567 * will have to minus that off to get the right expansion ce offset. In
\r
1570 int m_expansionOffset_;
\r
1572 * Size of collator raw data headers, options and expansions before
\r
1573 * contraction data. This is used when contraction ces are to be retrieved.
\r
1574 * ICU4C uses contraction offset starting from UCollator.UColHeader, hence
\r
1575 * ICU4J will have to minus that off to get the right contraction ce
\r
1576 * offset. In number of chars.
\r
1578 int m_contractionOffset_;
\r
1580 * Flag indicator if Jamo is special
\r
1582 boolean m_isJamoSpecial_;
\r
1584 // Collator options ------------------------------------------------------
\r
1586 int m_defaultVariableTopValue_;
\r
1587 boolean m_defaultIsFrenchCollation_;
\r
1588 boolean m_defaultIsAlternateHandlingShifted_;
\r
1589 int m_defaultCaseFirst_;
\r
1590 boolean m_defaultIsCaseLevel_;
\r
1591 int m_defaultDecomposition_;
\r
1592 int m_defaultStrength_;
\r
1593 boolean m_defaultIsHiragana4_;
\r
1594 boolean m_defaultIsNumericCollation_;
\r
1597 * Value of the variable top
\r
1599 int m_variableTopValue_;
\r
1601 * Attribute for special Hiragana
\r
1603 boolean m_isHiragana4_;
\r
1605 * Case sorting customization
\r
1609 * Numeric collation option
\r
1611 boolean m_isNumericCollation_;
\r
1613 // end Collator options --------------------------------------------------
\r
1618 int m_expansion_[];
\r
1620 * Contraction index table
\r
1622 char m_contractionIndex_[];
\r
1624 * Contraction CE table
\r
1626 int m_contractionCE_[];
\r
1632 * Table to store all collation elements that are the last element of an
\r
1633 * expansion. This is for use in StringSearch.
\r
1635 int m_expansionEndCE_[];
\r
1637 * Table to store the maximum size of any expansions that end with the
\r
1638 * corresponding collation element in m_expansionEndCE_. For use in
\r
1639 * StringSearch too
\r
1641 byte m_expansionEndCEMaxSize_[];
\r
1643 * Heuristic table to store information on whether a char character is
\r
1644 * considered "unsafe". "Unsafe" character are combining marks or those
\r
1645 * belonging to some contraction sequence from the offset 1 onwards.
\r
1646 * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered
\r
1647 * unsafe. If we have another contraction "ZA" with the one above, then
\r
1648 * 'A', 'B', 'C' are "unsafe" but 'Z' is not.
\r
1652 * Table to store information on whether a codepoint can occur as the last
\r
1653 * character in a contraction
\r
1655 byte m_contractionEnd_[];
\r
1657 * Original collation rules
\r
1661 * The smallest "unsafe" codepoint
\r
1663 char m_minUnsafe_;
\r
1665 * The smallest codepoint that could be the end of a contraction
\r
1667 char m_minContractionEnd_;
\r
1669 * General version of the collator
\r
1671 VersionInfo m_version_;
\r
1675 VersionInfo m_UCA_version_;
\r
1679 VersionInfo m_UCD_version_;
\r
1682 * UnicodeData.txt property object
\r
1684 static final RuleBasedCollator UCA_;
\r
1688 static final UCAConstants UCA_CONSTANTS_;
\r
1690 * Table for UCA and builder use
\r
1692 static final char UCA_CONTRACTIONS_[];
\r
1694 private static boolean UCA_INIT_COMPLETE;
\r
1697 * Implicit generator
\r
1699 static final ImplicitCEGenerator impCEGen_;
\r
1701 // * Implicit constants
\r
1703 // static final int IMPLICIT_BASE_BYTE_;
\r
1704 // static final int IMPLICIT_LIMIT_BYTE_;
\r
1705 // static final int IMPLICIT_4BYTE_BOUNDARY_;
\r
1706 // static final int LAST_MULTIPLIER_;
\r
1707 // static final int LAST2_MULTIPLIER_;
\r
1708 // static final int IMPLICIT_BASE_3BYTE_;
\r
1709 // static final int IMPLICIT_BASE_4BYTE_;
\r
1710 // static final int BYTES_TO_AVOID_ = 3;
\r
1711 // static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_;
\r
1712 // static final int LAST_COUNT_ = OTHER_COUNT_ / 2;
\r
1714 // * Room for intervening, without expanding to 5 bytes
\r
1716 // static final int LAST_COUNT2_ = OTHER_COUNT_ / 21;
\r
1717 // static final int IMPLICIT_3BYTE_COUNT_ = 1;
\r
1719 static final byte SORT_LEVEL_TERMINATOR_ = 1;
\r
1721 // These are values from UCA required for
\r
1722 // implicit generation and suppressing sort key compression
\r
1723 // they should regularly be in the UCA, but if one
\r
1724 // is running without UCA, it could be a problem
\r
1725 static final int maxRegularPrimary = 0xA0;
\r
1726 static final int minImplicitPrimary = 0xE0;
\r
1727 static final int maxImplicitPrimary = 0xE4;
\r
1730 // block to initialise character property database
\r
1733 // take pains to let static class init succeed, otherwise the class itself won't exist and
\r
1734 // clients will get a NoClassDefFoundException. Instead, make the constructors fail if
\r
1735 // we can't load the UCA data.
\r
1737 RuleBasedCollator iUCA_ = null;
\r
1738 UCAConstants iUCA_CONSTANTS_ = null;
\r
1739 char iUCA_CONTRACTIONS_[] = null;
\r
1740 ImplicitCEGenerator iimpCEGen_ = null;
\r
1743 // !!! note what's going on here...
\r
1744 // even though the static init of the class is not yet complete, we
\r
1745 // instantiate an instance of the class. So we'd better be sure that
\r
1746 // instantiation doesn't rely on the static initialization that's
\r
1747 // not complete yet!
\r
1748 iUCA_ = new RuleBasedCollator();
\r
1749 iUCA_CONSTANTS_ = new UCAConstants();
\r
1750 iUCA_CONTRACTIONS_ = CollatorReader.read(iUCA_, iUCA_CONSTANTS_);
\r
1752 // called before doing canonical closure for the UCA.
\r
1753 iimpCEGen_ = new ImplicitCEGenerator(minImplicitPrimary, maxImplicitPrimary);
\r
1754 //iimpCEGen_ = new ImplicitCEGenerator(iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_, iUCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_);
\r
1756 ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, ULocale.ENGLISH);
\r
1757 iUCA_.m_rules_ = (String)rb.getObject("UCARules");
\r
1759 catch (MissingResourceException ex)
\r
1763 catch (IOException e)
\r
1765 // e.printStackTrace();
\r
1766 // throw new MissingResourceException(e.getMessage(),"","");
\r
1770 UCA_CONSTANTS_ = iUCA_CONSTANTS_;
\r
1771 UCA_CONTRACTIONS_ = iUCA_CONTRACTIONS_;
\r
1772 impCEGen_ = iimpCEGen_;
\r
1774 UCA_INIT_COMPLETE = true;
\r
1778 private static void checkUCA() throws MissingResourceException {
\r
1779 if (UCA_INIT_COMPLETE && UCA_ == null) {
\r
1780 throw new MissingResourceException("Collator UCA data unavailable", "", "");
\r
1784 // package private constructors ------------------------------------------
\r
1787 * <p>Private constructor for use by subclasses.
\r
1788 * Public access to creating Collators is handled by the API
\r
1789 * Collator.getInstance() or RuleBasedCollator(String rules).
\r
1792 * This constructor constructs the UCA collator internally
\r
1795 RuleBasedCollator()
\r
1798 initUtility(false);
\r
1802 * Constructs a RuleBasedCollator from the argument locale.
\r
1803 * If no resource bundle is associated with the locale, UCA is used
\r
1807 RuleBasedCollator(ULocale locale)
\r
1810 ICUResourceBundle rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_COLLATION_BASE_NAME, locale);
\r
1811 initUtility(false);
\r
1814 // Use keywords, if supplied for lookup
\r
1815 String collkey = locale.getKeywordValue("collation");
\r
1816 if(collkey == null) {
\r
1817 collkey = rb.getStringWithFallback("collations/default");
\r
1820 // collations/default will always give a string back
\r
1821 // keyword for the real collation data
\r
1822 // "collations/collkey" will return null if collkey == null
\r
1823 ICUResourceBundle elements = rb.getWithFallback("collations/" + collkey);
\r
1824 if (elements != null) {
\r
1825 // TODO: Determine actual & valid locale correctly
\r
1826 ULocale uloc = rb.getULocale();
\r
1827 setLocale(uloc, uloc);
\r
1829 m_rules_ = elements.getString("Sequence");
\r
1830 ByteBuffer buf = elements.get("%%CollationBin").getBinary();
\r
1833 // m_rules_ = (String)rules[1][1];
\r
1834 CollatorReader.initRBC(this, buf);
\r
1836 BufferedInputStream input =
\r
1837 new BufferedInputStream(
\r
1838 new ByteArrayInputStream(map));
\r
1840 CollatorReader reader = new CollatorReader(input, false);
\r
1841 if (map.length > MIN_BINARY_DATA_SIZE_) {
\r
1842 reader.read(this, null);
\r
1845 reader.readHeader(this);
\r
1846 reader.readOptions(this);
\r
1847 // duplicating UCA_'s data
\r
1848 setWithUCATables();
\r
1851 // at this point, we have read in the collator
\r
1852 // now we need to check whether the binary image has
\r
1853 // the right UCA and other versions
\r
1854 if(!m_UCA_version_.equals(UCA_.m_UCA_version_) ||
\r
1855 !m_UCD_version_.equals(UCA_.m_UCD_version_)) {
\r
1868 catch (Exception e) {
\r
1869 // e.printStackTrace();
\r
1870 // if failed use UCA.
\r
1876 // package private methods -----------------------------------------------
\r
1879 * Sets this collator to use the tables in UCA. Note options not taken
\r
1882 final void setWithUCATables()
\r
1884 m_contractionOffset_ = UCA_.m_contractionOffset_;
\r
1885 m_expansionOffset_ = UCA_.m_expansionOffset_;
\r
1886 m_expansion_ = UCA_.m_expansion_;
\r
1887 m_contractionIndex_ = UCA_.m_contractionIndex_;
\r
1888 m_contractionCE_ = UCA_.m_contractionCE_;
\r
1889 m_trie_ = UCA_.m_trie_;
\r
1890 m_expansionEndCE_ = UCA_.m_expansionEndCE_;
\r
1891 m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_;
\r
1892 m_unsafe_ = UCA_.m_unsafe_;
\r
1893 m_contractionEnd_ = UCA_.m_contractionEnd_;
\r
1894 m_minUnsafe_ = UCA_.m_minUnsafe_;
\r
1895 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
\r
1899 * Sets this collator to use the all options and tables in UCA.
\r
1901 final void setWithUCAData()
\r
1903 latinOneFailed_ = true;
\r
1905 m_addition3_ = UCA_.m_addition3_;
\r
1906 m_bottom3_ = UCA_.m_bottom3_;
\r
1907 m_bottomCount3_ = UCA_.m_bottomCount3_;
\r
1908 m_caseFirst_ = UCA_.m_caseFirst_;
\r
1909 m_caseSwitch_ = UCA_.m_caseSwitch_;
\r
1910 m_common3_ = UCA_.m_common3_;
\r
1911 m_contractionOffset_ = UCA_.m_contractionOffset_;
\r
1912 setDecomposition(UCA_.getDecomposition());
\r
1913 m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_;
\r
1914 m_defaultDecomposition_ = UCA_.m_defaultDecomposition_;
\r
1915 m_defaultIsAlternateHandlingShifted_
\r
1916 = UCA_.m_defaultIsAlternateHandlingShifted_;
\r
1917 m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_;
\r
1918 m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_;
\r
1919 m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
\r
1920 m_defaultStrength_ = UCA_.m_defaultStrength_;
\r
1921 m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
\r
1922 m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
\r
1923 m_expansionOffset_ = UCA_.m_expansionOffset_;
\r
1924 m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
\r
1925 m_isCaseLevel_ = UCA_.m_isCaseLevel_;
\r
1926 m_isFrenchCollation_ = UCA_.m_isFrenchCollation_;
\r
1927 m_isHiragana4_ = UCA_.m_isHiragana4_;
\r
1928 m_isJamoSpecial_ = UCA_.m_isJamoSpecial_;
\r
1929 m_isSimple3_ = UCA_.m_isSimple3_;
\r
1930 m_mask3_ = UCA_.m_mask3_;
\r
1931 m_minContractionEnd_ = UCA_.m_minContractionEnd_;
\r
1932 m_minUnsafe_ = UCA_.m_minUnsafe_;
\r
1933 m_rules_ = UCA_.m_rules_;
\r
1934 setStrength(UCA_.getStrength());
\r
1935 m_top3_ = UCA_.m_top3_;
\r
1936 m_topCount3_ = UCA_.m_topCount3_;
\r
1937 m_variableTopValue_ = UCA_.m_variableTopValue_;
\r
1938 m_isNumericCollation_ = UCA_.m_isNumericCollation_;
\r
1939 setWithUCATables();
\r
1940 latinOneFailed_ = false;
\r
1944 * Test whether a char character is potentially "unsafe" for use as a
\r
1945 * collation starting point. "Unsafe" characters are combining marks or
\r
1946 * those belonging to some contraction sequence from the offset 1 onwards.
\r
1947 * E.g. if "ABC" is the only contraction, then 'B' and
\r
1948 * 'C' are considered unsafe. If we have another contraction "ZA" with
\r
1949 * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not.
\r
1950 * @param ch character to determin
\r
1951 * @return true if ch is unsafe, false otherwise
\r
1953 final boolean isUnsafe(char ch)
\r
1955 if (ch < m_minUnsafe_) {
\r
1959 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
\r
1960 if (UTF16.isLeadSurrogate(ch)
\r
1961 || UTF16.isTrailSurrogate(ch)) {
\r
1962 // Trail surrogate are always considered unsafe.
\r
1965 ch &= HEURISTIC_OVERFLOW_MASK_;
\r
1966 ch += HEURISTIC_OVERFLOW_OFFSET_;
\r
1968 int value = m_unsafe_[ch >> HEURISTIC_SHIFT_];
\r
1969 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
\r
1973 * Approximate determination if a char character is at a contraction end.
\r
1974 * Guaranteed to be true if a character is at the end of a contraction,
\r
1975 * otherwise it is not deterministic.
\r
1976 * @param ch character to be determined
\r
1978 final boolean isContractionEnd(char ch)
\r
1980 if (UTF16.isTrailSurrogate(ch)) {
\r
1984 if (ch < m_minContractionEnd_) {
\r
1988 if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) {
\r
1989 ch &= HEURISTIC_OVERFLOW_MASK_;
\r
1990 ch += HEURISTIC_OVERFLOW_OFFSET_;
\r
1992 int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_];
\r
1993 return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0;
\r
1997 * Retrieve the tag of a special ce
\r
1998 * @param ce ce to test
\r
1999 * @return tag of ce
\r
2001 static int getTag(int ce)
\r
2003 return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_;
\r
2007 * Checking if ce is special
\r
2008 * @param ce to check
\r
2009 * @return true if ce is special
\r
2011 static boolean isSpecial(int ce)
\r
2013 return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_;
\r
2017 * Checks if the argument ce is a continuation
\r
2018 * @param ce collation element to test
\r
2019 * @return true if ce is a continuation
\r
2021 static final boolean isContinuation(int ce)
\r
2023 return ce != CollationElementIterator.NULLORDER
\r
2024 && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_;
\r
2027 // private inner classes ------------------------------------------------
\r
2029 // private variables -----------------------------------------------------
\r
2032 * The smallest natural unsafe or contraction end char character before
\r
2034 * This is a combining mark.
\r
2036 private static final int DEFAULT_MIN_HEURISTIC_ = 0x300;
\r
2038 * Heuristic table size. Size is 32 bytes, 1 bit for each
\r
2039 * latin 1 char, and some power of two for hashing the rest of the chars.
\r
2042 private static final char HEURISTIC_SIZE_ = 1056;
\r
2044 * Mask value down to "some power of two" - 1,
\r
2045 * number of bits, not num of bytes.
\r
2047 private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff;
\r
2049 * Unsafe character shift
\r
2051 private static final int HEURISTIC_SHIFT_ = 3;
\r
2053 * Unsafe character addition for character too large, it has to be folded
\r
2054 * then incremented.
\r
2056 private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256;
\r
2058 * Mask value to get offset in heuristic table.
\r
2060 private static final char HEURISTIC_MASK_ = 7;
\r
2062 private int m_caseSwitch_;
\r
2063 private int m_common3_;
\r
2064 private int m_mask3_;
\r
2066 * When switching case, we need to add or subtract different values.
\r
2068 private int m_addition3_;
\r
2070 * Upper range when compressing
\r
2072 private int m_top3_;
\r
2074 * Lower range when compressing
\r
2076 private int m_bottom3_;
\r
2077 private int m_topCount3_;
\r
2078 private int m_bottomCount3_;
\r
2080 * Case first constants
\r
2082 private static final int CASE_SWITCH_ = 0xC0;
\r
2083 private static final int NO_CASE_SWITCH_ = 0;
\r
2085 * Case level constants
\r
2087 private static final int CE_REMOVE_CASE_ = 0x3F;
\r
2088 private static final int CE_KEEP_CASE_ = 0xFF;
\r
2090 * Case strength mask
\r
2092 private static final int CE_CASE_MASK_3_ = 0xFF;
\r
2094 * Sortkey size factor. Values can be changed.
\r
2096 private static final double PROPORTION_2_ = 0.5;
\r
2097 private static final double PROPORTION_3_ = 0.667;
\r
2099 // These values come from the UCA ----------------------------------------
\r
2102 * This is an enum that lists magic special byte values from the
\r
2105 //private static final byte BYTE_ZERO_ = 0x0;
\r
2106 //private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01;
\r
2107 //private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02;
\r
2108 private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03;
\r
2109 /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_;
\r
2110 //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_;
\r
2111 static final byte CODAN_PLACEHOLDER = 0x27;
\r
2112 //private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C;
\r
2113 private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D;
\r
2114 private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF;
\r
2115 private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1;
\r
2116 private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80;
\r
2117 private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40;
\r
2118 private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85;
\r
2119 private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45;
\r
2120 private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5;
\r
2121 private static final int COMMON_BOTTOM_3_ = 0x05;
\r
2122 private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86;
\r
2123 private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ =
\r
2125 private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_);
\r
2126 private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_;
\r
2127 private static final int COMMON_2_ = COMMON_BOTTOM_2_;
\r
2128 private static final int COMMON_UPPER_FIRST_3_ = 0xC5;
\r
2129 private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_;
\r
2130 //private static final int COMMON_4_ = (byte)0xFF;
\r
2135 * Minimum size required for the binary collation data in bytes.
\r
2136 * Size of UCA header + size of options to 4 bytes
\r
2138 //private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
\r
2141 * If this collator is to generate only simple tertiaries for fast path
\r
2143 private boolean m_isSimple3_;
\r
2146 * French collation sorting flag
\r
2148 private boolean m_isFrenchCollation_;
\r
2150 * Flag indicating if shifted is requested for Quaternary alternate
\r
2151 * handling. If this is not true, the default for alternate handling will
\r
2152 * be non-ignorable.
\r
2154 private boolean m_isAlternateHandlingShifted_;
\r
2156 * Extra case level for sorting
\r
2158 private boolean m_isCaseLevel_;
\r
2160 private static final int SORT_BUFFER_INIT_SIZE_ = 128;
\r
2161 private static final int SORT_BUFFER_INIT_SIZE_1_ =
\r
2162 SORT_BUFFER_INIT_SIZE_ << 3;
\r
2163 private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_;
\r
2164 private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_;
\r
2165 private static final int SORT_BUFFER_INIT_SIZE_CASE_ =
\r
2166 SORT_BUFFER_INIT_SIZE_ >> 2;
\r
2167 private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_;
\r
2169 private static final int CE_CONTINUATION_TAG_ = 0xC0;
\r
2170 private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F;
\r
2172 private static final int LAST_BYTE_MASK_ = 0xFF;
\r
2174 //private static final int CE_RESET_TOP_VALUE_ = 0x9F000303;
\r
2175 //private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303;
\r
2177 private static final byte SORT_CASE_BYTE_START_ = (byte)0x80;
\r
2178 private static final byte SORT_CASE_SHIFT_START_ = (byte)7;
\r
2183 private static final int CE_BUFFER_SIZE_ = 512;
\r
2185 // variables for Latin-1 processing
\r
2186 boolean latinOneUse_ = false;
\r
2187 boolean latinOneRegenTable_ = false;
\r
2188 boolean latinOneFailed_ = false;
\r
2190 int latinOneTableLen_ = 0;
\r
2191 int latinOneCEs_[] = null;
\r
2193 * Bunch of utility iterators
\r
2195 private StringUCharacterIterator m_srcUtilIter_;
\r
2196 private CollationElementIterator m_srcUtilColEIter_;
\r
2197 private StringUCharacterIterator m_tgtUtilIter_;
\r
2198 private CollationElementIterator m_tgtUtilColEIter_;
\r
2200 * Utility comparison flags
\r
2202 private boolean m_utilCompare0_;
\r
2203 //private boolean m_utilCompare1_;
\r
2204 private boolean m_utilCompare2_;
\r
2205 private boolean m_utilCompare3_;
\r
2206 private boolean m_utilCompare4_;
\r
2207 private boolean m_utilCompare5_;
\r
2209 * Utility byte buffer
\r
2211 private byte m_utilBytes0_[];
\r
2212 private byte m_utilBytes1_[];
\r
2213 private byte m_utilBytes2_[];
\r
2214 private byte m_utilBytes3_[];
\r
2215 private byte m_utilBytes4_[];
\r
2216 //private byte m_utilBytes5_[];
\r
2217 private RawCollationKey m_utilRawCollationKey_;
\r
2219 private int m_utilBytesCount0_;
\r
2220 private int m_utilBytesCount1_;
\r
2221 private int m_utilBytesCount2_;
\r
2222 private int m_utilBytesCount3_;
\r
2223 private int m_utilBytesCount4_;
\r
2224 //private int m_utilBytesCount5_;
\r
2225 //private int m_utilCount0_;
\r
2226 //private int m_utilCount1_;
\r
2227 private int m_utilCount2_;
\r
2228 private int m_utilCount3_;
\r
2229 private int m_utilCount4_;
\r
2230 //private int m_utilCount5_;
\r
2232 private int m_utilFrenchStart_;
\r
2233 private int m_utilFrenchEnd_;
\r
2236 * Preparing the CE buffers. They will be filled during the primary phase
\r
2238 private int m_srcUtilCEBuffer_[];
\r
2239 private int m_tgtUtilCEBuffer_[];
\r
2240 private int m_srcUtilCEBufferSize_;
\r
2241 private int m_tgtUtilCEBufferSize_;
\r
2243 private int m_srcUtilContOffset_;
\r
2244 private int m_tgtUtilContOffset_;
\r
2246 private int m_srcUtilOffset_;
\r
2247 private int m_tgtUtilOffset_;
\r
2249 // private methods -------------------------------------------------------
\r
2251 private void init(String rules) throws Exception
\r
2254 CollationParsedRuleBuilder builder
\r
2255 = new CollationParsedRuleBuilder(rules);
\r
2256 builder.setRules(this);
\r
2259 initUtility(false);
\r
2262 private final int compareRegular(String source, String target, int offset) {
\r
2263 if (m_srcUtilIter_ == null) {
\r
2264 initUtility(true);
\r
2266 int strength = getStrength();
\r
2267 // setting up the collator parameters
\r
2268 m_utilCompare0_ = m_isCaseLevel_;
\r
2269 //m_utilCompare1_ = true;
\r
2270 m_utilCompare2_ = strength >= SECONDARY;
\r
2271 m_utilCompare3_ = strength >= TERTIARY;
\r
2272 m_utilCompare4_ = strength >= QUATERNARY;
\r
2273 m_utilCompare5_ = strength == IDENTICAL;
\r
2274 boolean doFrench = m_isFrenchCollation_ && m_utilCompare2_;
\r
2275 boolean doShift4 = m_isAlternateHandlingShifted_ && m_utilCompare4_;
\r
2276 boolean doHiragana4 = m_isHiragana4_ && m_utilCompare4_;
\r
2278 if (doHiragana4 && doShift4) {
\r
2279 String sourcesub = source.substring(offset);
\r
2280 String targetsub = target.substring(offset);
\r
2281 return compareBySortKeys(sourcesub, targetsub);
\r
2284 // This is the lowest primary value that will not be ignored if shifted
\r
2285 int lowestpvalue = m_isAlternateHandlingShifted_
\r
2286 ? m_variableTopValue_ << 16 : 0;
\r
2287 m_srcUtilCEBufferSize_ = 0;
\r
2288 m_tgtUtilCEBufferSize_ = 0;
\r
2289 int result = doPrimaryCompare(doHiragana4, lowestpvalue, source,
\r
2291 if (m_srcUtilCEBufferSize_ == -1
\r
2292 && m_tgtUtilCEBufferSize_ == -1) {
\r
2293 // since the cebuffer is cleared when we have determined that
\r
2294 // either source is greater than target or vice versa, the return
\r
2295 // result is the comparison result and not the hiragana result
\r
2299 int hiraganaresult = result;
\r
2301 if (m_utilCompare2_) {
\r
2302 result = doSecondaryCompare(doFrench);
\r
2303 if (result != 0) {
\r
2307 // doing the case bit
\r
2308 if (m_utilCompare0_) {
\r
2309 result = doCaseCompare();
\r
2310 if (result != 0) {
\r
2315 if (m_utilCompare3_) {
\r
2316 result = doTertiaryCompare();
\r
2317 if (result != 0) {
\r
2322 if (doShift4) { // checkQuad
\r
2323 result = doQuaternaryCompare(lowestpvalue);
\r
2324 if (result != 0) {
\r
2328 else if (doHiragana4 && hiraganaresult != 0) {
\r
2329 // If we're fine on quaternaries, we might be different
\r
2330 // on Hiragana. This, however, might fail us in shifted.
\r
2331 return hiraganaresult;
\r
2334 // For IDENTICAL comparisons, we use a bitwise character comparison
\r
2335 // as a tiebreaker if all else is equal.
\r
2336 // Getting here should be quite rare - strings are not identical -
\r
2337 // that is checked first, but compared == through all other checks.
\r
2338 if (m_utilCompare5_) {
\r
2339 return doIdenticalCompare(source, target, offset, true);
\r
2345 * Gets the 2 bytes of primary order and adds it to the primary byte array
\r
2346 * @param ce current ce
\r
2347 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2348 * a continuation ce
\r
2349 * @param doShift flag indicating if ce is to be shifted
\r
2350 * @param leadPrimary lead primary used for compression
\r
2351 * @param commonBottom4 common byte value for Quaternary
\r
2352 * @param bottomCount4 smallest byte value for Quaternary
\r
2353 * @return the new lead primary for compression
\r
2355 private final int doPrimaryBytes(int ce, boolean notIsContinuation,
\r
2356 boolean doShift, int leadPrimary,
\r
2357 int commonBottom4, int bottomCount4)
\r
2360 int p2 = (ce >>>= 16) & LAST_BYTE_MASK_; // in ints for unsigned
\r
2361 int p1 = ce >>> 8; // comparison
\r
2363 if (m_utilCount4_ > 0) {
\r
2364 while (m_utilCount4_ > bottomCount4) {
\r
2365 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2366 (byte)(commonBottom4 + bottomCount4));
\r
2367 m_utilBytesCount4_ ++;
\r
2368 m_utilCount4_ -= bottomCount4;
\r
2370 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2371 (byte)(commonBottom4
\r
2372 + (m_utilCount4_ - 1)));
\r
2373 m_utilBytesCount4_ ++;
\r
2374 m_utilCount4_ = 0;
\r
2376 // dealing with a variable and we're treating them as shifted
\r
2377 // This is a shifted ignorable
\r
2379 // we need to check this since we could be in continuation
\r
2380 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2382 m_utilBytesCount4_ ++;
\r
2385 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2387 m_utilBytesCount4_ ++;
\r
2391 // Note: This code assumes that the table is well built
\r
2392 // i.e. not having 0 bytes where they are not supposed to be.
\r
2393 // Usually, we'll have non-zero primary1 & primary2, except
\r
2394 // in cases of LatinOne and friends, when primary2 will be
\r
2395 // regular and simple sortkey calc
\r
2396 if (p1 != CollationElementIterator.IGNORABLE) {
\r
2397 if (notIsContinuation) {
\r
2398 if (leadPrimary == p1) {
\r
2399 m_utilBytes1_ = append(m_utilBytes1_,
\r
2400 m_utilBytesCount1_, (byte)p2);
\r
2401 m_utilBytesCount1_ ++;
\r
2404 if (leadPrimary != 0) {
\r
2405 m_utilBytes1_ = append(m_utilBytes1_,
\r
2406 m_utilBytesCount1_,
\r
2407 ((p1 > leadPrimary)
\r
2408 ? BYTE_UNSHIFTED_MAX_
\r
2409 : BYTE_UNSHIFTED_MIN_));
\r
2410 m_utilBytesCount1_ ++;
\r
2412 if (p2 == CollationElementIterator.IGNORABLE) {
\r
2413 // one byter, not compressed
\r
2414 m_utilBytes1_ = append(m_utilBytes1_,
\r
2415 m_utilBytesCount1_,
\r
2417 m_utilBytesCount1_ ++;
\r
2420 else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_
\r
2421 || (p1 > maxRegularPrimary
\r
2422 //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0]
\r
2424 && p1 < minImplicitPrimary
\r
2425 //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0]
\r
2428 // not compressible
\r
2430 m_utilBytes1_ = append(m_utilBytes1_,
\r
2431 m_utilBytesCount1_,
\r
2433 m_utilBytesCount1_ ++;
\r
2434 m_utilBytes1_ = append(m_utilBytes1_,
\r
2435 m_utilBytesCount1_,
\r
2437 m_utilBytesCount1_ ++;
\r
2439 else { // compress
\r
2441 m_utilBytes1_ = append(m_utilBytes1_,
\r
2442 m_utilBytesCount1_,
\r
2444 m_utilBytesCount1_ ++;
\r
2445 m_utilBytes1_ = append(m_utilBytes1_,
\r
2446 m_utilBytesCount1_, (byte)p2);
\r
2447 m_utilBytesCount1_ ++;
\r
2452 // continuation, add primary to the key, no compression
\r
2453 m_utilBytes1_ = append(m_utilBytes1_,
\r
2454 m_utilBytesCount1_, (byte)p1);
\r
2455 m_utilBytesCount1_ ++;
\r
2456 if (p2 != CollationElementIterator.IGNORABLE) {
\r
2457 m_utilBytes1_ = append(m_utilBytes1_,
\r
2458 m_utilBytesCount1_, (byte)p2);
\r
2460 m_utilBytesCount1_ ++;
\r
2465 return leadPrimary;
\r
2469 * Gets the secondary byte and adds it to the secondary byte array
\r
2470 * @param ce current ce
\r
2471 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2472 * a continuation ce
\r
2473 * @param doFrench flag indicator if french sort is to be performed
\r
2475 private final void doSecondaryBytes(int ce, boolean notIsContinuation,
\r
2478 int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison
\r
2481 // This is compression code.
\r
2482 if (s == COMMON_2_ && notIsContinuation) {
\r
2486 if (m_utilCount2_ > 0) {
\r
2487 if (s > COMMON_2_) { // not necessary for 4th level.
\r
2488 while (m_utilCount2_ > TOP_COUNT_2_) {
\r
2489 m_utilBytes2_ = append(m_utilBytes2_,
\r
2490 m_utilBytesCount2_,
\r
2491 (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
\r
2492 m_utilBytesCount2_ ++;
\r
2493 m_utilCount2_ -= TOP_COUNT_2_;
\r
2495 m_utilBytes2_ = append(m_utilBytes2_,
\r
2496 m_utilBytesCount2_,
\r
2497 (byte)(COMMON_TOP_2_
\r
2498 - (m_utilCount2_ - 1)));
\r
2499 m_utilBytesCount2_ ++;
\r
2502 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2503 m_utilBytes2_ = append(m_utilBytes2_,
\r
2504 m_utilBytesCount2_,
\r
2505 (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
\r
2506 m_utilBytesCount2_ ++;
\r
2507 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2509 m_utilBytes2_ = append(m_utilBytes2_,
\r
2510 m_utilBytesCount2_,
\r
2511 (byte)(COMMON_BOTTOM_2_
\r
2512 + (m_utilCount2_ - 1)));
\r
2513 m_utilBytesCount2_ ++;
\r
2515 m_utilCount2_ = 0;
\r
2517 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2519 m_utilBytesCount2_ ++;
\r
2523 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2525 m_utilBytesCount2_ ++;
\r
2526 // Do the special handling for French secondaries
\r
2527 // We need to get continuation elements and do intermediate
\r
2529 // abc1c2c3de with french secondaries need to be edc1c2c3ba
\r
2531 if (notIsContinuation) {
\r
2532 if (m_utilFrenchStart_ != -1) {
\r
2533 // reverse secondaries from frenchStartPtr up to
\r
2535 reverseBuffer(m_utilBytes2_);
\r
2536 m_utilFrenchStart_ = -1;
\r
2540 if (m_utilFrenchStart_ == -1) {
\r
2541 m_utilFrenchStart_ = m_utilBytesCount2_ - 2;
\r
2543 m_utilFrenchEnd_ = m_utilBytesCount2_ - 1;
\r
2550 * Reverse the argument buffer
\r
2551 * @param buffer to reverse
\r
2553 private void reverseBuffer(byte buffer[])
\r
2555 int start = m_utilFrenchStart_;
\r
2556 int end = m_utilFrenchEnd_;
\r
2557 while (start < end) {
\r
2558 byte b = buffer[start];
\r
2559 buffer[start ++] = buffer[end];
\r
2560 buffer[end --] = b;
\r
2565 * Insert the case shifting byte if required
\r
2566 * @param caseshift value
\r
2567 * @return new caseshift value
\r
2569 private final int doCaseShift(int caseshift)
\r
2571 if (caseshift == 0) {
\r
2572 m_utilBytes0_ = append(m_utilBytes0_, m_utilBytesCount0_,
\r
2573 SORT_CASE_BYTE_START_);
\r
2574 m_utilBytesCount0_ ++;
\r
2575 caseshift = SORT_CASE_SHIFT_START_;
\r
2581 * Performs the casing sort
\r
2582 * @param tertiary byte in ints for easy comparison
\r
2583 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2584 * a continuation ce
\r
2585 * @param caseshift
\r
2586 * @return the new value of case shift
\r
2588 private final int doCaseBytes(int tertiary, boolean notIsContinuation,
\r
2591 caseshift = doCaseShift(caseshift);
\r
2593 if (notIsContinuation && tertiary != 0) {
\r
2594 byte casebits = (byte)(tertiary & 0xC0);
\r
2595 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
\r
2596 if (casebits == 0) {
\r
2597 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2598 |= (1 << (-- caseshift));
\r
2602 caseshift = doCaseShift(caseshift - 1);
\r
2603 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2604 |= ((casebits >> 6) & 1) << (-- caseshift);
\r
2608 if (casebits != 0) {
\r
2609 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2610 |= 1 << (-- caseshift);
\r
2612 caseshift = doCaseShift(caseshift);
\r
2613 m_utilBytes0_[m_utilBytesCount0_ - 1]
\r
2614 |= ((casebits >> 7) & 1) << (-- caseshift);
\r
2626 * Gets the tertiary byte and adds it to the tertiary byte array
\r
2627 * @param tertiary byte in int for easy comparison
\r
2628 * @param notIsContinuation flag indicating if the current bytes belong to
\r
2629 * a continuation ce
\r
2631 private final void doTertiaryBytes(int tertiary, boolean notIsContinuation)
\r
2633 if (tertiary != 0) {
\r
2634 // This is compression code.
\r
2635 // sequence size check is included in the if clause
\r
2636 if (tertiary == m_common3_ && notIsContinuation) {
\r
2640 int common3 = m_common3_ & LAST_BYTE_MASK_;
\r
2641 if (tertiary > common3 && m_common3_ == COMMON_NORMAL_3_) {
\r
2642 tertiary += m_addition3_;
\r
2644 else if (tertiary <= common3
\r
2645 && m_common3_ == COMMON_UPPER_FIRST_3_) {
\r
2646 tertiary -= m_addition3_;
\r
2648 if (m_utilCount3_ > 0) {
\r
2649 if (tertiary > common3) {
\r
2650 while (m_utilCount3_ > m_topCount3_) {
\r
2651 m_utilBytes3_ = append(m_utilBytes3_,
\r
2652 m_utilBytesCount3_,
\r
2653 (byte)(m_top3_ - m_topCount3_));
\r
2654 m_utilBytesCount3_ ++;
\r
2655 m_utilCount3_ -= m_topCount3_;
\r
2657 m_utilBytes3_ = append(m_utilBytes3_,
\r
2658 m_utilBytesCount3_,
\r
2660 - (m_utilCount3_ - 1)));
\r
2661 m_utilBytesCount3_ ++;
\r
2664 while (m_utilCount3_ > m_bottomCount3_) {
\r
2665 m_utilBytes3_ = append(m_utilBytes3_,
\r
2666 m_utilBytesCount3_,
\r
2667 (byte)(m_bottom3_ + m_bottomCount3_));
\r
2668 m_utilBytesCount3_ ++;
\r
2669 m_utilCount3_ -= m_bottomCount3_;
\r
2671 m_utilBytes3_ = append(m_utilBytes3_,
\r
2672 m_utilBytesCount3_,
\r
2674 + (m_utilCount3_ - 1)));
\r
2675 m_utilBytesCount3_ ++;
\r
2677 m_utilCount3_ = 0;
\r
2679 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
2681 m_utilBytesCount3_ ++;
\r
2687 * Gets the Quaternary byte and adds it to the Quaternary byte array
\r
2688 * @param isCodePointHiragana flag indicator if the previous codepoint
\r
2689 * we dealt with was Hiragana
\r
2690 * @param commonBottom4 smallest common Quaternary byte
\r
2691 * @param bottomCount4 smallest Quaternary byte
\r
2692 * @param hiragana4 hiragana Quaternary byte
\r
2694 private final void doQuaternaryBytes(boolean isCodePointHiragana,
\r
2695 int commonBottom4, int bottomCount4,
\r
2698 if (isCodePointHiragana) { // This was Hiragana, need to note it
\r
2699 if (m_utilCount4_ > 0) { // Close this part
\r
2700 while (m_utilCount4_ > bottomCount4) {
\r
2701 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2702 (byte)(commonBottom4
\r
2704 m_utilBytesCount4_ ++;
\r
2705 m_utilCount4_ -= bottomCount4;
\r
2707 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2708 (byte)(commonBottom4
\r
2709 + (m_utilCount4_ - 1)));
\r
2710 m_utilBytesCount4_ ++;
\r
2711 m_utilCount4_ = 0;
\r
2713 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
2714 hiragana4); // Add the Hiragana
\r
2715 m_utilBytesCount4_ ++;
\r
2717 else { // This wasn't Hiragana, so we can continue adding stuff
\r
2723 * Iterates through the argument string for all ces.
\r
2724 * Split the ces into their relevant primaries, secondaries etc.
\r
2725 * @param source normalized string
\r
2726 * @param doFrench flag indicator if special handling of French has to be
\r
2728 * @param hiragana4 offset for Hiragana quaternary
\r
2729 * @param commonBottom4 smallest common quaternary byte
\r
2730 * @param bottomCount4 smallest quaternary byte
\r
2732 private final void getSortKeyBytes(String source, boolean doFrench,
\r
2733 byte hiragana4, int commonBottom4,
\r
2737 if (m_srcUtilIter_ == null) {
\r
2738 initUtility(true);
\r
2740 int backupDecomposition = getDecomposition();
\r
2741 setDecomposition(NO_DECOMPOSITION); // have to revert to backup later
\r
2742 m_srcUtilIter_.setText(source);
\r
2743 m_srcUtilColEIter_.setText(m_srcUtilIter_);
\r
2744 m_utilFrenchStart_ = -1;
\r
2745 m_utilFrenchEnd_ = -1;
\r
2747 // scriptorder not implemented yet
\r
2748 // const uint8_t *scriptOrder = coll->scriptOrder;
\r
2750 boolean doShift = false;
\r
2751 boolean notIsContinuation = false;
\r
2753 int leadPrimary = 0; // int for easier comparison
\r
2754 int caseShift = 0;
\r
2757 int ce = m_srcUtilColEIter_.next();
\r
2758 if (ce == CollationElementIterator.NULLORDER) {
\r
2762 if (ce == CollationElementIterator.IGNORABLE) {
\r
2766 notIsContinuation = !isContinuation(ce);
\r
2769 * if (notIsContinuation) {
\r
2770 if (scriptOrder != NULL) {
\r
2771 primary1 = scriptOrder[primary1];
\r
2774 boolean isPrimaryByteIgnorable = (ce & CE_PRIMARY_MASK_) == 0;
\r
2775 // actually we can just check that the first byte is 0
\r
2776 // generation stuffs the order left first
\r
2777 boolean isSmallerThanVariableTop = (ce >>> CE_PRIMARY_SHIFT_)
\r
2778 <= m_variableTopValue_;
\r
2779 doShift = (m_isAlternateHandlingShifted_
\r
2780 && ((notIsContinuation && isSmallerThanVariableTop
\r
2781 && !isPrimaryByteIgnorable) // primary byte not 0
\r
2782 || (!notIsContinuation && doShift))
\r
2783 || (doShift && isPrimaryByteIgnorable));
\r
2784 if (doShift && isPrimaryByteIgnorable) {
\r
2785 // amendment to the UCA says that primary ignorables and other
\r
2786 // ignorables should be removed if following a shifted code
\r
2788 // if we were shifted and we got an ignorable code point
\r
2789 // we should just completely ignore it
\r
2792 leadPrimary = doPrimaryBytes(ce, notIsContinuation, doShift,
\r
2793 leadPrimary, commonBottom4,
\r
2798 if (m_utilCompare2_) {
\r
2799 doSecondaryBytes(ce, notIsContinuation, doFrench);
\r
2802 int t = ce & LAST_BYTE_MASK_;
\r
2803 if (!notIsContinuation) {
\r
2804 t = ce & CE_REMOVE_CONTINUATION_MASK_;
\r
2807 if (m_utilCompare0_ && (!isPrimaryByteIgnorable || m_utilCompare2_)) {
\r
2808 // do the case level if we need to do it. We don't want to calculate
\r
2809 // case level for primary ignorables if we have only primary strength and case level
\r
2810 // otherwise we would break well formedness of CEs
\r
2811 caseShift = doCaseBytes(t, notIsContinuation, caseShift);
\r
2813 else if (notIsContinuation) {
\r
2814 t ^= m_caseSwitch_;
\r
2819 if (m_utilCompare3_) {
\r
2820 doTertiaryBytes(t, notIsContinuation);
\r
2823 if (m_utilCompare4_ && notIsContinuation) { // compare quad
\r
2824 doQuaternaryBytes(m_srcUtilColEIter_.m_isCodePointHiragana_,
\r
2825 commonBottom4, bottomCount4, hiragana4);
\r
2828 setDecomposition(backupDecomposition); // reverts to original
\r
2829 if (m_utilFrenchStart_ != -1) {
\r
2830 // one last round of checks
\r
2831 reverseBuffer(m_utilBytes2_);
\r
2836 * From the individual strength byte results the final compact sortkey
\r
2837 * will be calculated.
\r
2838 * @param source text string
\r
2839 * @param doFrench flag indicating that special handling of French has to
\r
2841 * @param commonBottom4 smallest common quaternary byte
\r
2842 * @param bottomCount4 smallest quaternary byte
\r
2843 * @param key output RawCollationKey to store results, key cannot be null
\r
2845 private final void getSortKey(String source, boolean doFrench,
\r
2846 int commonBottom4,
\r
2848 RawCollationKey key)
\r
2850 // we have done all the CE's, now let's put them together to form
\r
2852 if (m_utilCompare2_) {
\r
2853 doSecondary(doFrench);
\r
2855 // adding case level should be independent of secondary level
\r
2856 if (m_utilCompare0_) {
\r
2859 if (m_utilCompare3_) {
\r
2861 if (m_utilCompare4_) {
\r
2862 doQuaternary(commonBottom4, bottomCount4);
\r
2863 if (m_utilCompare5_) {
\r
2864 doIdentical(source);
\r
2869 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)0);
\r
2870 m_utilBytesCount1_ ++;
\r
2872 key.set(m_utilBytes1_, 0, m_utilBytesCount1_);
\r
2876 * Packs the French bytes
\r
2878 private final void doFrench()
\r
2880 for (int i = 0; i < m_utilBytesCount2_; i ++) {
\r
2881 byte s = m_utilBytes2_[m_utilBytesCount2_ - i - 1];
\r
2882 // This is compression code.
\r
2883 if (s == COMMON_2_) {
\r
2887 if (m_utilCount2_ > 0) {
\r
2888 // getting the unsigned value
\r
2889 if ((s & LAST_BYTE_MASK_) > COMMON_2_) {
\r
2890 // not necessary for 4th level.
\r
2891 while (m_utilCount2_ > TOP_COUNT_2_) {
\r
2892 m_utilBytes1_ = append(m_utilBytes1_,
\r
2893 m_utilBytesCount1_,
\r
2894 (byte)(COMMON_TOP_2_ - TOP_COUNT_2_));
\r
2895 m_utilBytesCount1_ ++;
\r
2896 m_utilCount2_ -= TOP_COUNT_2_;
\r
2898 m_utilBytes1_ = append(m_utilBytes1_,
\r
2899 m_utilBytesCount1_,
\r
2900 (byte)(COMMON_TOP_2_
\r
2901 - (m_utilCount2_ - 1)));
\r
2902 m_utilBytesCount1_ ++;
\r
2905 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2906 m_utilBytes1_ = append(m_utilBytes1_,
\r
2907 m_utilBytesCount1_,
\r
2908 (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_));
\r
2909 m_utilBytesCount1_ ++;
\r
2910 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2912 m_utilBytes1_ = append(m_utilBytes1_,
\r
2913 m_utilBytesCount1_,
\r
2914 (byte)(COMMON_BOTTOM_2_
\r
2915 + (m_utilCount2_ - 1)));
\r
2916 m_utilBytesCount1_ ++;
\r
2918 m_utilCount2_ = 0;
\r
2920 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, s);
\r
2921 m_utilBytesCount1_ ++;
\r
2924 if (m_utilCount2_ > 0) {
\r
2925 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2926 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
2927 (byte)(COMMON_BOTTOM_2_
\r
2928 + BOTTOM_COUNT_2_));
\r
2929 m_utilBytesCount1_ ++;
\r
2930 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2932 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
2933 (byte)(COMMON_BOTTOM_2_
\r
2934 + (m_utilCount2_ - 1)));
\r
2935 m_utilBytesCount1_ ++;
\r
2940 * Compacts the secondary bytes and stores them into the primary array
\r
2941 * @param doFrench flag indicator that French has to be handled specially
\r
2943 private final void doSecondary(boolean doFrench)
\r
2945 if (m_utilCount2_ > 0) {
\r
2946 while (m_utilCount2_ > BOTTOM_COUNT_2_) {
\r
2947 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2948 (byte)(COMMON_BOTTOM_2_
\r
2949 + BOTTOM_COUNT_2_));
\r
2950 m_utilBytesCount2_ ++;
\r
2951 m_utilCount2_ -= BOTTOM_COUNT_2_;
\r
2953 m_utilBytes2_ = append(m_utilBytes2_, m_utilBytesCount2_,
\r
2954 (byte)(COMMON_BOTTOM_2_ +
\r
2955 (m_utilCount2_ - 1)));
\r
2956 m_utilBytesCount2_ ++;
\r
2959 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
2960 SORT_LEVEL_TERMINATOR_);
\r
2961 m_utilBytesCount1_ ++;
\r
2963 if (doFrench) { // do the reverse copy
\r
2967 if (m_utilBytes1_.length <= m_utilBytesCount1_
\r
2968 + m_utilBytesCount2_) {
\r
2969 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
2970 m_utilBytesCount2_);
\r
2972 System.arraycopy(m_utilBytes2_, 0, m_utilBytes1_,
\r
2973 m_utilBytesCount1_, m_utilBytesCount2_);
\r
2974 m_utilBytesCount1_ += m_utilBytesCount2_;
\r
2979 * Increase buffer size
\r
2980 * @param buffer array of bytes
\r
2981 * @param size of the byte array
\r
2982 * @param incrementsize size to increase
\r
2983 * @return the new buffer
\r
2985 private static final byte[] increase(byte buffer[], int size,
\r
2986 int incrementsize)
\r
2988 byte result[] = new byte[buffer.length + incrementsize];
\r
2989 System.arraycopy(buffer, 0, result, 0, size);
\r
2994 * Increase buffer size
\r
2995 * @param buffer array of ints
\r
2996 * @param size of the byte array
\r
2997 * @param incrementsize size to increase
\r
2998 * @return the new buffer
\r
3000 private static final int[] increase(int buffer[], int size,
\r
3001 int incrementsize)
\r
3003 int result[] = new int[buffer.length + incrementsize];
\r
3004 System.arraycopy(buffer, 0, result, 0, size);
\r
3009 * Compacts the case bytes and stores them into the primary array
\r
3011 private final void doCase()
\r
3013 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3014 SORT_LEVEL_TERMINATOR_);
\r
3015 m_utilBytesCount1_ ++;
\r
3016 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount0_) {
\r
3017 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3018 m_utilBytesCount0_);
\r
3020 System.arraycopy(m_utilBytes0_, 0, m_utilBytes1_, m_utilBytesCount1_,
\r
3021 m_utilBytesCount0_);
\r
3022 m_utilBytesCount1_ += m_utilBytesCount0_;
\r
3026 * Compacts the tertiary bytes and stores them into the primary array
\r
// Flushes the pending run counted in m_utilCount3_ into m_utilBytes3_, then
// appends SORT_LEVEL_TERMINATOR_ followed by all of m_utilBytes3_ to
// m_utilBytes1_.
3028 private final void doTertiary()
\r
3030 if (m_utilCount3_ > 0) {
\r
3031 if (m_common3_ != COMMON_BOTTOM_3_) {
\r
// one byte (m_top3_ - m_topCount3_) per full chunk of m_topCount3_
3032 while (m_utilCount3_ >= m_topCount3_) {
\r
3033 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3034 (byte)(m_top3_ - m_topCount3_));
\r
3035 m_utilBytesCount3_ ++;
\r
3036 m_utilCount3_ -= m_topCount3_;
\r
// remainder of the run, encoded downwards from m_top3_
3038 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3039 (byte)(m_top3_ - m_utilCount3_));
\r
3040 m_utilBytesCount3_ ++;
\r
// m_common3_ == COMMON_BOTTOM_3_: one byte per full chunk of m_bottomCount3_
3043 while (m_utilCount3_ > m_bottomCount3_) {
\r
3044 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3046 + m_bottomCount3_));
\r
3047 m_utilBytesCount3_ ++;
\r
3048 m_utilCount3_ -= m_bottomCount3_;
\r
// remainder of the run, encoded as an offset of (m_utilCount3_ - 1)
3050 m_utilBytes3_ = append(m_utilBytes3_, m_utilBytesCount3_,
\r
3052 + (m_utilCount3_ - 1)));
\r
3053 m_utilBytesCount3_ ++;
\r
// close the preceding level in the primary buffer
3056 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3057 SORT_LEVEL_TERMINATOR_);
\r
3058 m_utilBytesCount1_ ++;
\r
// make room for the tertiary bytes before the bulk copy
3059 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount3_) {
\r
3060 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3061 m_utilBytesCount3_);
\r
3063 System.arraycopy(m_utilBytes3_, 0, m_utilBytes1_, m_utilBytesCount1_,
\r
3064 m_utilBytesCount3_);
\r
3065 m_utilBytesCount1_ += m_utilBytesCount3_;
\r
3069 * Compacts the quaternary bytes and stores them into the primary array
\r
3071 private final void doQuaternary(int commonbottom4, int bottomcount4)
\r
3073 if (m_utilCount4_ > 0) {
\r
3074 while (m_utilCount4_ > bottomcount4) {
\r
3075 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
3076 (byte)(commonbottom4 + bottomcount4));
\r
3077 m_utilBytesCount4_ ++;
\r
3078 m_utilCount4_ -= bottomcount4;
\r
3080 m_utilBytes4_ = append(m_utilBytes4_, m_utilBytesCount4_,
\r
3081 (byte)(commonbottom4
\r
3082 + (m_utilCount4_ - 1)));
\r
3083 m_utilBytesCount4_ ++;
\r
3085 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3086 SORT_LEVEL_TERMINATOR_);
\r
3087 m_utilBytesCount1_ ++;
\r
3088 if (m_utilBytes1_.length <= m_utilBytesCount1_ + m_utilBytesCount4_) {
\r
3089 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
3090 m_utilBytesCount4_);
\r
3092 System.arraycopy(m_utilBytes4_, 0, m_utilBytes1_, m_utilBytesCount1_,
\r
3093 m_utilBytesCount4_);
\r
3094 m_utilBytesCount1_ += m_utilBytesCount4_;
\r
3098 * Deals with the identical sort.
\r
3099 * Appends the BOCSU version of the source string to the ends of the
\r
3101 * @param source text string
\r
// Appends SORT_LEVEL_TERMINATOR_ to m_utilBytes1_ and then the BOCU-compressed
// form of source, growing m_utilBytes1_ first when needed.
3103 private final void doIdentical(String source)
\r
// number of bytes BOCU reports for the compressed form of source
3105 int isize = BOCU.getCompressionLength(source);
\r
3106 m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_,
\r
3107 SORT_LEVEL_TERMINATOR_);
\r
3108 m_utilBytesCount1_ ++;
\r
// grow the buffer when the compressed bytes would not fit behind the terminator
3109 if (m_utilBytes1_.length <= m_utilBytesCount1_ + isize) {
\r
3110 m_utilBytes1_ = increase(m_utilBytes1_, m_utilBytesCount1_,
\r
// compress() writes at offset m_utilBytesCount1_; its result becomes the new count
3113 m_utilBytesCount1_ = BOCU.compress(source, m_utilBytes1_,
\r
3114 m_utilBytesCount1_);
\r
3118 * Gets the offset of the first unmatched characters in source and target.
\r
3119 * This method returns the offset of the start of a contraction or a
\r
3120 * combining sequence, if the first difference is in the middle of such a
\r
3122 * @param source string
\r
3123 * @param target string
\r
3124 * @return offset of the first unmatched characters in source and target.
\r
// Scans the common prefix of source and target and, when the stopping point
// involves a char for which isUnsafe() is true, backs up over unsafe chars.
3126 private final int getFirstUnmatchedOffset(String source, String target)
\r
3129 int slength = source.length();
\r
3130 int tlength = target.length();
\r
// minlength = min(slength, tlength)
3131 int minlength = slength;
\r
3132 if (minlength > tlength) {
\r
3133 minlength = tlength;
\r
// walk over the identical prefix
3135 while (result < minlength
\r
3136 && source.charAt(result) == target.charAt(result)) {
\r
3140 // There is an identical portion at the beginning of the two
\r
3141 // strings. If the identical portion ends within a contraction or a
\r
3142 // combining character sequence, back up to the start of that
\r
// the scan stopped on a real difference inside both strings
3146 if (result < minlength) {
\r
3147 schar = source.charAt(result); // first differing chars
\r
3148 tchar = target.charAt(result);
\r
// otherwise one string is a prefix of the other: look at the last common char
3151 schar = source.charAt(minlength - 1);
\r
3152 if (isUnsafe(schar)) {
\r
3155 else if (slength == tlength) {
\r
// the longer string supplies the char that follows the common prefix
3158 else if (slength < tlength) {
\r
3159 tchar = target.charAt(result);
\r
3162 schar = source.charAt(result);
\r
3165 if (isUnsafe(schar) || isUnsafe(tchar))
\r
3167 // We are stopped in the middle of a contraction or combining
\r
3169 // Look backwards for the part of the string for the start of
\r
3171 // It doesn't matter which string we scan, since they are the
\r
3172 // same in this region.
\r
// loop condition: keep backing up while the char at result is unsafe
3176 while (result > 0 && isUnsafe(source.charAt(result)));
\r
3183 * Appends a byte to an array of bytes, growing the array if we run out of
\r
3185 * @param array of byte arrays
\r
3186 * @param appendindex index in the byte array to append
\r
3187 * @param value to append
\r
3188 * @return array if array size can accommodate the new value, otherwise
\r
3189 * a bigger array will be created and returned
\r
3191 private static final byte[] append(byte array[], int appendindex,
\r
3195 array[appendindex] = value;
\r
3197 catch (ArrayIndexOutOfBoundsException e) {
\r
3198 array = increase(array, appendindex, SORT_BUFFER_INIT_SIZE_);
\r
3199 array[appendindex] = value;
\r
3205 * This is a trick string compare function that goes in and uses sortkeys
\r
3206 * to compare. It is used when compare gets in trouble and needs to bail
\r
3208 * @param source text string
\r
3209 * @param target text string
\r
3211 private final int compareBySortKeys(String source, String target)
\r
3214 m_utilRawCollationKey_ = getRawCollationKey(source,
\r
3215 m_utilRawCollationKey_);
\r
3216 // this method is very seldom called
\r
3217 RawCollationKey targetkey = getRawCollationKey(target, null);
\r
3218 return m_utilRawCollationKey_.compareTo(targetkey);
\r
3222 * Performs the primary comparisons, and fills up the CE buffer at the
\r
3224 * The return value toggles between the comparison result and the hiragana
\r
3225 * result. If either the source is greater than target or vice versa, the
\r
3226 * return result is the comparison result, ie 1 or -1, furthermore the
\r
3227 * cebuffers will be cleared when that happens. If the primary comparisons
\r
3228 * are equal, we'll have to continue with secondary comparison. In this case
\r
3229 * the cebuffer will not be cleared and the return result will be the
\r
3230 * hiragana result.
\r
3231 * @param doHiragana4 flag indicator that Hiragana Quaternary has to be
\r
3233 * @param lowestpvalue the lowest primary value that will not be ignored if
\r
3234 * alternate handling is shifted
\r
3235 * @param source text string
\r
3236 * @param target text string
\r
3237 * @param textoffset offset in text to start the comparison
\r
3238 * @return comparison result if a primary difference is found, otherwise
\r
// Primary-level comparison. Every CE fetched from either iterator is appended
// to m_srcUtilCEBuffer_ / m_tgtUtilCEBuffer_ so that later levels can re-read
// them; a primary difference is resolved through endPrimaryCompare().
3241 private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue,
\r
3242 String source, String target,
\r
3246 // Preparing the context objects for iterating over strings
\r
3247 m_srcUtilIter_.setText(source);
\r
3248 m_srcUtilColEIter_.setText(m_srcUtilIter_, textoffset);
\r
3249 m_tgtUtilIter_.setText(target);
\r
3250 m_tgtUtilColEIter_.setText(m_tgtUtilIter_, textoffset);
\r
3252 // Non shifted primary processing is quite simple
\r
3253 if (!m_isAlternateHandlingShifted_) {
\r
// 0 until the first position where exactly one side is flagged as Hiragana
3254 int hiraganaresult = 0;
\r
3257 // We fetch CEs until we hit a non ignorable primary or end.
\r
3259 sorder = m_srcUtilColEIter_.next();
\r
// the full CE is buffered before it is masked down to its primary bits
3260 m_srcUtilCEBuffer_ = append(m_srcUtilCEBuffer_,
\r
3261 m_srcUtilCEBufferSize_, sorder);
\r
3262 m_srcUtilCEBufferSize_ ++;
\r
3263 sorder &= CE_PRIMARY_MASK_;
\r
3264 } while (sorder == CollationElementIterator.IGNORABLE);
\r
// same fetch-and-buffer loop for the target
3268 torder = m_tgtUtilColEIter_.next();
\r
3269 m_tgtUtilCEBuffer_ = append(m_tgtUtilCEBuffer_,
\r
3270 m_tgtUtilCEBufferSize_, torder);
\r
3271 m_tgtUtilCEBufferSize_ ++;
\r
3272 torder &= CE_PRIMARY_MASK_;
\r
3273 } while (torder == CollationElementIterator.IGNORABLE);
\r
3275 // if both primaries are the same
\r
3276 if (sorder == torder) {
\r
3277 // and there are no more CEs, we advance to the next level
\r
3278 // see if we are at the end of either string
\r
// end of input is detected on the last buffered (unmasked) CE
3279 if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
\r
3280 == CollationElementIterator.NULLORDER) {
\r
3281 if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
\r
3282 != CollationElementIterator.NULLORDER) {
\r
3287 else if (m_tgtUtilCEBuffer_[m_tgtUtilCEBufferSize_ - 1]
\r
3288 == CollationElementIterator.NULLORDER) {
\r
// only the first Hiragana mismatch is recorded (hiraganaresult == 0 guard)
3291 if (doHiragana4 && hiraganaresult == 0
\r
3292 && m_srcUtilColEIter_.m_isCodePointHiragana_ !=
\r
3293 m_tgtUtilColEIter_.m_isCodePointHiragana_) {
\r
3294 if (m_srcUtilColEIter_.m_isCodePointHiragana_) {
\r
3295 hiraganaresult = -1;
\r
3298 hiraganaresult = 1;
\r
3303 // if two primaries are different, we are done
\r
3304 return endPrimaryCompare(sorder, torder);
\r
3307 // no primary difference... do the rest from the buffers
\r
3308 return hiraganaresult;
\r
3310 else { // shifted - do a slightly more complicated processing :)
\r
// the helper buffers the CEs itself; the boolean selects source (true) or target (false)
3312 int sorder = getPrimaryShiftedCompareCE(m_srcUtilColEIter_,
\r
3313 lowestpvalue, true);
\r
3314 int torder = getPrimaryShiftedCompareCE(m_tgtUtilColEIter_,
\r
3315 lowestpvalue, false);
\r
3316 if (sorder == torder) {
\r
3317 if (m_srcUtilCEBuffer_[m_srcUtilCEBufferSize_ - 1]
\r
3318 == CollationElementIterator.NULLORDER) {
\r
3326 return endPrimaryCompare(sorder, torder);
\r
3328 } // no primary difference... do the rest from the buffers
\r
3334 * This is used only for primary strength when we know that sorder is
\r
3335 * already different from torder.
\r
3336 * Compares sorder and torder, returns -1 if sorder is less than torder.
\r
3337 * Clears the cebuffer at the same time.
\r
3338 * @param sorder source strength order
\r
3339 * @param torder target strength order
\r
3340 * @return the comparison result of sorder and torder
\r
// Resolves a primary difference: notes whether the last buffered CE on each
// side is NULLORDER, resets both CE buffer sizes to -1, and otherwise orders
// the two primaries as unsigned values.
3342 private final int endPrimaryCompare(int sorder, int torder)
\r
3344 // if we reach here, the ce offset accessed is the last ce
\r
3345 // appended to the buffer
\r
3346 boolean isSourceNullOrder = (m_srcUtilCEBuffer_[
\r
3347 m_srcUtilCEBufferSize_ - 1]
\r
3348 == CollationElementIterator.NULLORDER);
\r
3349 boolean isTargetNullOrder = (m_tgtUtilCEBuffer_[
\r
3350 m_tgtUtilCEBufferSize_ - 1]
\r
3351 == CollationElementIterator.NULLORDER);
\r
// the end-of-input flags above must be captured before the sizes are reset
3352 m_srcUtilCEBufferSize_ = -1;
\r
3353 m_tgtUtilCEBufferSize_ = -1;
\r
// the source check comes first, so it takes precedence over the target check
3354 if (isSourceNullOrder) {
\r
3357 if (isTargetNullOrder) {
\r
3360 // getting rid of the sign
\r
// unsigned shift, so a primary with the top bit set does not compare as negative
3361 sorder >>>= CE_PRIMARY_SHIFT_;
\r
3362 torder >>>= CE_PRIMARY_SHIFT_;
\r
3363 if (sorder < torder) {
\r
3370 * Calculates the next primary shifted value and fills up cebuffer with the
\r
3371 * next non-ignorable ce.
\r
3372 * @param coleiter collation element iterator
\r
3373 * @param isSrc true to fill the source CE buffer, false to fill the target CE buffer
\r
3375 * @param lowestpvalue lowest primary shifted value that will not be
\r
3377 * @return result next modified ce
\r
// Pulls CEs from coleiter for the shifted (alternate handling) primary pass,
// appending them to a local view of the source or target CE buffer that is
// written back to the fields at the end, and returns the primary bits of the
// last CE read.
3379 private final int getPrimaryShiftedCompareCE(
\r
3380 CollationElementIterator coleiter,
\r
3381 int lowestpvalue, boolean isSrc)
\r
// presumably tracks whether the previous CE was shifted - TODO confirm where it is set
3384 boolean shifted = false;
\r
3385 int result = CollationElementIterator.IGNORABLE;
\r
// work on local copies; they start as the source buffer and are swapped for the target one
3386 int cebuffer[] = m_srcUtilCEBuffer_;
\r
3387 int cebuffersize = m_srcUtilCEBufferSize_;
\r
3389 cebuffer = m_tgtUtilCEBuffer_;
\r
3390 cebuffersize = m_tgtUtilCEBufferSize_;
\r
3393 result = coleiter.next();
\r
// end of input: NULLORDER is buffered as well so later levels can detect it
3394 if (result == CollationElementIterator.NULLORDER) {
\r
3395 cebuffer = append(cebuffer, cebuffersize, result);
\r
3399 else if (result == CollationElementIterator.IGNORABLE
\r
3401 && (result & CE_PRIMARY_MASK_)
\r
3402 == CollationElementIterator.IGNORABLE)) {
\r
3403 // UCA amendment - ignore ignorables that follow shifted code
\r
3407 else if (isContinuation(result)) {
\r
3408 if ((result & CE_PRIMARY_MASK_)
\r
3409 != CollationElementIterator.IGNORABLE) {
\r
3410 // There is primary value
\r
// keep only the primary bits plus the continuation marker
3412 result = (result & CE_PRIMARY_MASK_)
\r
3413 | CE_CONTINUATION_MARKER_;
\r
3414 // preserve interesting continuation
\r
3415 cebuffer = append(cebuffer, cebuffersize, result);
\r
3420 cebuffer = append(cebuffer, cebuffersize, result);
\r
3425 else { // Just lower level values
\r
3427 cebuffer = append(cebuffer, cebuffersize, result);
\r
// unsigned comparison of the primary against lowestpvalue
3433 if (Utility.compareUnsigned(result & CE_PRIMARY_MASK_,
\r
3434 lowestpvalue) > 0) {
\r
3435 cebuffer = append(cebuffer, cebuffersize, result);
\r
3440 if ((result & CE_PRIMARY_MASK_) != 0) {
\r
// only the primary bits of this CE are buffered
3442 result &= CE_PRIMARY_MASK_;
\r
3443 cebuffer = append(cebuffer, cebuffersize, result);
\r
3448 cebuffer = append(cebuffer, cebuffersize, result);
\r
// publish the local buffer and size back to the matching pair of fields
3457 m_srcUtilCEBuffer_ = cebuffer;
\r
3458 m_srcUtilCEBufferSize_ = cebuffersize;
\r
3461 m_tgtUtilCEBuffer_ = cebuffer;
\r
3462 m_tgtUtilCEBufferSize_ = cebuffersize;
\r
// callers compare primaries only
3464 result &= CE_PRIMARY_MASK_;
\r
3469 * Appends an int to an array of ints, growing the array if we run out of
\r
3471 * @param array of int arrays
\r
3472 * @param appendindex index at which value will be appended
\r
3473 * @param value to append
\r
3474 * @return array if size is not increased, otherwise a new array will be
\r
3477 private static final int[] append(int array[], int appendindex, int value)
\r
3479 if (appendindex + 1 >= array.length) {
\r
3480 array = increase(array, appendindex, CE_BUFFER_SIZE_);
\r
3482 array[appendindex] = value;
\r
3487 * Does secondary strength comparison based on the collected ces.
\r
3488 * @param doFrench flag indicates if French ordering is to be done
\r
3489 * @return the secondary strength comparison result
\r
// Secondary-level comparison over the CEs already collected in
// m_srcUtilCEBuffer_ / m_tgtUtilCEBuffer_; with doFrench the buffers are
// walked backwards through getSecondaryFrenchCE().
3491 private final int doSecondaryCompare(boolean doFrench)
\r
3493 // now, we're gonna reexamine collected CEs
\r
3494 if (!doFrench) { // normal
\r
// skip CEs whose secondary bits are ignorable
3498 int sorder = CollationElementIterator.IGNORABLE;
\r
3499 while (sorder == CollationElementIterator.IGNORABLE) {
\r
3500 sorder = m_srcUtilCEBuffer_[soffset ++]
\r
3501 & CE_SECONDARY_MASK_;
\r
3503 int torder = CollationElementIterator.IGNORABLE;
\r
3504 while (torder == CollationElementIterator.IGNORABLE) {
\r
3505 torder = m_tgtUtilCEBuffer_[toffset ++]
\r
3506 & CE_SECONDARY_MASK_;
\r
// offsets were post-incremented, so [offset - 1] is the CE just read
3509 if (sorder == torder) {
\r
3510 if (m_srcUtilCEBuffer_[soffset - 1]
\r
3511 == CollationElementIterator.NULLORDER) {
\r
3512 if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3513 != CollationElementIterator.NULLORDER) {
\r
3518 else if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3519 == CollationElementIterator.NULLORDER) {
\r
// secondaries differ: end-of-input on either side is checked before the values
3524 if (m_srcUtilCEBuffer_[soffset - 1] ==
\r
3525 CollationElementIterator.NULLORDER) {
\r
3528 if (m_tgtUtilCEBuffer_[toffset - 1] ==
\r
3529 CollationElementIterator.NULLORDER) {
\r
3532 return (sorder < torder) ? -1 : 1;
\r
3536 else { // do the French
\r
3537 m_srcUtilContOffset_ = 0;
\r
3538 m_tgtUtilContOffset_ = 0;
\r
// start two slots before the buffer size - presumably to skip the trailing NULLORDER; TODO confirm
3539 m_srcUtilOffset_ = m_srcUtilCEBufferSize_ - 2;
\r
3540 m_tgtUtilOffset_ = m_tgtUtilCEBufferSize_ - 2;
\r
3542 int sorder = getSecondaryFrenchCE(true);
\r
3543 int torder = getSecondaryFrenchCE(false);
\r
3544 if (sorder == torder) {
\r
// both offsets ran off the front, or the source offset sits on NULLORDER
3545 if ((m_srcUtilOffset_ < 0 && m_tgtUtilOffset_ < 0)
\r
3546 || (m_srcUtilOffset_ >= 0
\r
3547 && m_srcUtilCEBuffer_[m_srcUtilOffset_]
\r
3548 == CollationElementIterator.NULLORDER)) {
\r
3553 return (sorder < torder) ? -1 : 1;
\r
3561 * Calculates the next secondary french CE.
\r
3562 * @param isSrc flag indicator if we are calculating the src ces
\r
3563 * @return result next modified ce
\r
// Returns the next secondary weight for French (backwards) secondary
// comparison, reading from the source or target CE buffer. Works on local
// copies of the offset, continuation offset and buffer, which are written
// back to the matching fields at the end.
3565 private final int getSecondaryFrenchCE(boolean isSrc)
\r
3567 int result = CollationElementIterator.IGNORABLE;
\r
// locals start as the source state and are swapped for the target state
3568 int offset = m_srcUtilOffset_;
\r
3569 int continuationoffset = m_srcUtilContOffset_;
\r
3570 int cebuffer[] = m_srcUtilCEBuffer_;
\r
3572 offset = m_tgtUtilOffset_;
\r
3573 continuationoffset = m_tgtUtilContOffset_;
\r
3574 cebuffer = m_tgtUtilCEBuffer_;
\r
3577 while (result == CollationElementIterator.IGNORABLE
\r
// continuationoffset == 0 means we are not currently inside a continuation run
3579 if (continuationoffset == 0) {
\r
3580 result = cebuffer[offset];
\r
// step backwards over continuation CEs; offset is decremented on every test
3581 while (isContinuation(cebuffer[offset --])){
\r
3583 // after this, sorder is at the start of continuation,
\r
3584 // and offset points before that
\r
3585 if (isContinuation(cebuffer[offset + 1])) {
\r
3586 // save offset for later
\r
3587 continuationoffset = offset;
\r
// inside a continuation run: read forwards
3592 result = cebuffer[offset ++];
\r
3593 if (!isContinuation(result)) {
\r
3594 // we have finished with this continuation
\r
3595 offset = continuationoffset;
\r
3596 // reset the pointer to before continuation
\r
3597 continuationoffset = 0;
\r
3601 result &= CE_SECONDARY_MASK_; // remove continuation bit
\r
// publish the updated positions to the matching pair of fields
3604 m_srcUtilOffset_ = offset;
\r
3605 m_srcUtilContOffset_ = continuationoffset;
\r
3608 m_tgtUtilOffset_ = offset;
\r
3609 m_tgtUtilContOffset_ = continuationoffset;
\r
3615 * Does case strength comparison based on the collected ces.
\r
3616 * @return the case strength comparison result
\r
3618 private final int doCaseCompare()
\r
3623 int sorder = CollationElementIterator.IGNORABLE;
\r
3624 int torder = CollationElementIterator.IGNORABLE;
\r
3625 while ((sorder & CE_REMOVE_CASE_)
\r
3626 == CollationElementIterator.IGNORABLE) {
\r
3627 sorder = m_srcUtilCEBuffer_[soffset ++];
\r
3628 if (!isContinuation(sorder) && ((sorder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
\r
3629 // primary ignorables should not be considered on the case level when the strength is primary
\r
3630 // otherwise, the CEs stop being well-formed
\r
3631 sorder &= CE_CASE_MASK_3_;
\r
3632 sorder ^= m_caseSwitch_;
\r
3635 sorder = CollationElementIterator.IGNORABLE;
\r
3639 while ((torder & CE_REMOVE_CASE_)
\r
3640 == CollationElementIterator.IGNORABLE) {
\r
3641 torder = m_tgtUtilCEBuffer_[toffset ++];
\r
3642 if (!isContinuation(torder) && ((torder & CE_PRIMARY_MASK_) != 0 || m_utilCompare2_ == true)) {
\r
3643 // primary ignorables should not be considered on the case level when the strength is primary
\r
3644 // otherwise, the CEs stop being well-formed
\r
3645 torder &= CE_CASE_MASK_3_;
\r
3646 torder ^= m_caseSwitch_;
\r
3649 torder = CollationElementIterator.IGNORABLE;
\r
3653 sorder &= CE_CASE_BIT_MASK_;
\r
3654 torder &= CE_CASE_BIT_MASK_;
\r
3655 if (sorder == torder) {
\r
3656 // checking end of strings
\r
3657 if (m_srcUtilCEBuffer_[soffset - 1]
\r
3658 == CollationElementIterator.NULLORDER) {
\r
3659 if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3660 != CollationElementIterator.NULLORDER) {
\r
3665 else if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3666 == CollationElementIterator.NULLORDER) {
\r
3671 if (m_srcUtilCEBuffer_[soffset - 1]
\r
3672 == CollationElementIterator.NULLORDER) {
\r
3675 if (m_tgtUtilCEBuffer_[soffset - 1]
\r
3676 == CollationElementIterator.NULLORDER) {
\r
3679 return (sorder < torder) ? -1 : 1;
\r
3686 * Does tertiary strength comparison based on the collected ces.
\r
3687 * @return the tertiary strength comparison result
\r
// Tertiary-level comparison over the CEs already collected in
// m_srcUtilCEBuffer_ / m_tgtUtilCEBuffer_.
3689 private final int doTertiaryCompare()
\r
3694 int sorder = CollationElementIterator.IGNORABLE;
\r
3695 int torder = CollationElementIterator.IGNORABLE;
\r
// skip source CEs whose tertiary bits (case bits removed) are ignorable
3696 while ((sorder & CE_REMOVE_CASE_)
\r
3697 == CollationElementIterator.IGNORABLE) {
\r
3698 sorder = m_srcUtilCEBuffer_[soffset ++] & m_mask3_;
\r
// the case switch is applied to non-continuation CEs
3699 if (!isContinuation(sorder)) {
\r
3700 sorder ^= m_caseSwitch_;
\r
3703 sorder &= CE_REMOVE_CASE_;
\r
// same scan for the target
3707 while ((torder & CE_REMOVE_CASE_)
\r
3708 == CollationElementIterator.IGNORABLE) {
\r
3709 torder = m_tgtUtilCEBuffer_[toffset ++] & m_mask3_;
\r
3710 if (!isContinuation(torder)) {
\r
3711 torder ^= m_caseSwitch_;
\r
3714 torder &= CE_REMOVE_CASE_;
\r
// offsets were post-incremented, so [offset - 1] is the CE just read
3718 if (sorder == torder) {
\r
3719 if (m_srcUtilCEBuffer_[soffset - 1]
\r
3720 == CollationElementIterator.NULLORDER) {
\r
3721 if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3722 != CollationElementIterator.NULLORDER) {
\r
3727 else if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3728 == CollationElementIterator.NULLORDER) {
\r
// tertiaries differ: end-of-input on either side is checked before the values
3733 if (m_srcUtilCEBuffer_[soffset - 1] ==
\r
3734 CollationElementIterator.NULLORDER) {
\r
3737 if (m_tgtUtilCEBuffer_[toffset - 1] ==
\r
3738 CollationElementIterator.NULLORDER) {
\r
3741 return (sorder < torder) ? -1 : 1;
\r
3748 * Does quaternary strength comparison based on the collected ces.
\r
3749 * @param lowestpvalue the lowest primary value that will not be ignored if
\r
3750 * alternate handling is shifted
\r
3751 * @return the quaternary strength comparison result
\r
// Quaternary-level comparison over the CEs already collected in
// m_srcUtilCEBuffer_ / m_tgtUtilCEBuffer_; lowestpvalue is the threshold
// used to decide how a CE is treated at this level.
3753 private final int doQuaternaryCompare(int lowestpvalue)
\r
3755 boolean sShifted = true;
\r
3756 boolean tShifted = true;
\r
3760 int sorder = CollationElementIterator.IGNORABLE;
\r
3761 int torder = CollationElementIterator.IGNORABLE;
\r
// keep reading while the CE is ignorable, or is a continuation of a non-shifted CE
3762 while (sorder == CollationElementIterator.IGNORABLE
\r
3763 || (isContinuation(sorder) && !sShifted)) {
\r
3764 sorder = m_srcUtilCEBuffer_[soffset ++];
\r
3765 if (isContinuation(sorder)) {
\r
// CE above lowestpvalue (unsigned), or with no primary bits
3770 else if (Utility.compareUnsigned(sorder, lowestpvalue) > 0
\r
3771 || (sorder & CE_PRIMARY_MASK_)
\r
3772 == CollationElementIterator.IGNORABLE) {
\r
3773 // non continuation
\r
// such CEs are all replaced by the same value, CE_PRIMARY_MASK_
3774 sorder = CE_PRIMARY_MASK_;
\r
// unsigned shift: compare the primary bits as a non-negative number
3781 sorder >>>= CE_PRIMARY_SHIFT_;
\r
// same scan for the target
3782 while (torder == CollationElementIterator.IGNORABLE
\r
3783 || (isContinuation(torder) && !tShifted)) {
\r
3784 torder = m_tgtUtilCEBuffer_[toffset ++];
\r
3785 if (isContinuation(torder)) {
\r
3790 else if (Utility.compareUnsigned(torder, lowestpvalue) > 0
\r
3791 || (torder & CE_PRIMARY_MASK_)
\r
3792 == CollationElementIterator.IGNORABLE) {
\r
3793 // non continuation
\r
3794 torder = CE_PRIMARY_MASK_;
\r
3801 torder >>>= CE_PRIMARY_SHIFT_;
\r
// offsets were post-incremented, so [offset - 1] is the CE just read
3803 if (sorder == torder) {
\r
3804 if (m_srcUtilCEBuffer_[soffset - 1]
\r
3805 == CollationElementIterator.NULLORDER) {
\r
3806 if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3807 != CollationElementIterator.NULLORDER) {
\r
3812 else if (m_tgtUtilCEBuffer_[toffset - 1]
\r
3813 == CollationElementIterator.NULLORDER) {
\r
// values differ: end-of-input on either side is checked before the values
3818 if (m_srcUtilCEBuffer_[soffset - 1] ==
\r
3819 CollationElementIterator.NULLORDER) {
\r
3822 if (m_tgtUtilCEBuffer_[toffset - 1] ==
\r
3823 CollationElementIterator.NULLORDER) {
\r
3826 return (sorder < torder) ? -1 : 1;
\r
3833 * Internal function. Does byte level string compare. Used by strcoll if
\r
3834 * strength == identical and strings are otherwise equal. This is a rare
\r
3835 * case. Comparison must be done on NFD normalized strings. FCD is not good
\r
3837 * @param source text
\r
3838 * @param target text
\r
3839 * @param offset of the first difference in the text strings
\r
3840 * @param normalize flag indicating if we are to normalize the text before
\r
3842 * @return 1 if source is greater than target, -1 less than and 0 if equals
\r
// Identical-level comparison: brings source and target to NFD where the quick
// check does not already confirm it, then delegates to doStringCompare().
3844 private static final int doIdenticalCompare(String source, String target,
\r
3845 int offset, boolean normalize)
\r
// decompose only when quickCheck does not answer YES for NFD
3849 if (Normalizer.quickCheck(source, Normalizer.NFD,0)
\r
3850 != Normalizer.YES) {
\r
3851 source = Normalizer.decompose(source, false);
\r
3854 if (Normalizer.quickCheck(target, Normalizer.NFD,0)
\r
3855 != Normalizer.YES) {
\r
3856 target = Normalizer.decompose(target, false);
\r
// NOTE(review): offset was computed on the caller's strings; confirm how it is
// adjusted when either string has been decomposed above
3861 return doStringCompare(source, target, offset);
\r
3865 * Compares strings by their code point order.
\r
3866 * This comparison handles surrogate characters and places them after
\r
3867 * all non-surrogate characters.
\r
3868 * @param source text
\r
3869 * @param target text
\r
3870 * @param offset start offset for comparison
\r
3871 * @return 1 if source is greater than target, -1 less than and 0 if equals
\r
// Compares source and target char by char starting at offset; when both
// differing chars are at or above UTF16.LEAD_SURROGATE_MIN_VALUE they are
// passed through fixupUTF16() before being ordered.
3873 private static final int doStringCompare(String source,
\r
3877 // compare identical prefixes - they do not need to be fixed up
\r
3880 int slength = source.length();
\r
3881 int tlength = target.length();
\r
3882 int minlength = Math.min(slength, tlength);
\r
// scan up to the end of the shorter string
3883 while (offset < minlength) {
\r
3884 schar = source.charAt(offset);
\r
// offset is advanced here, after both chars have been read
3885 tchar = target.charAt(offset ++);
\r
3886 if (schar != tchar) {
\r
// no difference found within the common length: the lengths decide
3891 if (schar == tchar && offset == minlength) {
\r
3892 if (slength > minlength) {
\r
3895 if (tlength > minlength) {
\r
3901 // if both values are in or above the surrogate range, Fix them up.
\r
3902 if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE
\r
3903 && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
\r
3904 schar = fixupUTF16(schar);
\r
3905 tchar = fixupUTF16(tchar);
\r
3908 // now c1 and c2 are in UTF-32-compatible order
\r
3909 return (schar < tchar) ? -1 : 1; // schar and tchar has to be different
\r
3913 * Rotate surrogates to the top to get code point order
\r
// Remaps a UTF-16 code unit for code point ordering; doStringCompare() only
// calls this for units at or above UTF16.LEAD_SURROGATE_MIN_VALUE.
3915 private static final char fixupUTF16(char ch)
\r
// units from 0xe000 upwards are handled differently from the units below that
3917 if (ch >= 0xe000) {
\r
3927 * Resets the internal case data members and compression values.
\r
// Recomputes the fields derived from the current attributes: the case switch,
// the tertiary mask/common/top/bottom values, the tertiary compression counts,
// m_isSimple3_, and whether the Latin-1 fast-path table may be used.
3929 private void updateInternalState()
\r
3931 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
\r
3932 m_caseSwitch_ = CASE_SWITCH_;
\r
3935 m_caseSwitch_ = NO_CASE_SWITCH_;
\r
// case level on, or case-first off: case bits are removed from the tertiary weight
3938 if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) {
\r
3939 m_mask3_ = CE_REMOVE_CASE_;
\r
3940 m_common3_ = COMMON_NORMAL_3_;
\r
3941 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_;
\r
3942 m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_;
\r
3943 m_bottom3_ = COMMON_BOTTOM_3_;
\r
// otherwise case bits are kept; the common/top/bottom values depend on upper-first
3946 m_mask3_ = CE_KEEP_CASE_;
\r
3947 m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_;
\r
3948 if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) {
\r
3949 m_common3_ = COMMON_UPPER_FIRST_3_;
\r
3950 m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_;
\r
3951 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_;
\r
3953 m_common3_ = COMMON_NORMAL_3_;
\r
3954 m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_;
\r
3955 m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_;
\r
3959 // Set the compression values
\r
3960 int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1;
\r
3961 // we multiply a double by an int, but need only the int part
\r
// total3 is split into a top count and a bottom count that sum to total3
3962 m_topCount3_ = (int)(PROPORTION_3_ * total3);
\r
3963 m_bottomCount3_ = total3 - m_topCount3_;
\r
// "simple" tertiary: exactly tertiary strength with no case level, French or shifted handling
3965 if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_
\r
3966 && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) {
\r
3967 m_isSimple3_ = true;
\r
3970 m_isSimple3_ = false;
\r
// the Latin-1 table is only considered when none of these attributes rule it out
// and an earlier build attempt has not failed
3972 if(!m_isCaseLevel_ && getStrength() <= AttributeValue.TERTIARY_ && !m_isNumericCollation_
\r
3973 && !m_isAlternateHandlingShifted_ && !latinOneFailed_) {
\r
3974 if(latinOneCEs_ == null || latinOneRegenTable_) {
\r
3975 if(setUpLatinOne()) { // if we succeed in building latin1 table, we'll use it
\r
3976 latinOneUse_ = true;
\r
// build failed: remember it so the table is not rebuilt on every call
3978 latinOneUse_ = false;
\r
3979 latinOneFailed_ = true;
\r
3981 latinOneRegenTable_ = false;
\r
3982 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
\r
3983 latinOneUse_ = true;
\r
3986 latinOneUse_ = false;
\r
3992 * Initializes the RuleBasedCollator
\r
// Computes m_minUnsafe_ and m_minContractionEnd_ by scanning chars below
// DEFAULT_MIN_HEURISTIC_, then resets every attribute to its default value
// and refreshes the derived state.
3994 private final void init()
\r
3996 for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_;
\r
3997 m_minUnsafe_ ++) {
\r
3998 // Find the smallest unsafe char.
\r
3999 if (isUnsafe(m_minUnsafe_)) {
\r
4004 for (m_minContractionEnd_ = 0;
\r
4005 m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_;
\r
4006 m_minContractionEnd_ ++) {
\r
4007 // Find the smallest contraction-ending char.
\r
4008 if (isContractionEnd(m_minContractionEnd_)) {
\r
// NOTE(review): latinOneFailed_ is forced to true while the defaults are
// applied and cleared again just before updateInternalState() - presumably so
// the setters below do not build the Latin-1 table early; confirm
4012 latinOneFailed_ = true;
\r
4013 setStrength(m_defaultStrength_);
\r
4014 setDecomposition(m_defaultDecomposition_);
\r
4015 m_variableTopValue_ = m_defaultVariableTopValue_;
\r
4016 m_isFrenchCollation_ = m_defaultIsFrenchCollation_;
\r
4017 m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_;
\r
4018 m_isCaseLevel_ = m_defaultIsCaseLevel_;
\r
4019 m_caseFirst_ = m_defaultCaseFirst_;
\r
4020 m_isHiragana4_ = m_defaultIsHiragana4_;
\r
4021 m_isNumericCollation_ = m_defaultIsNumericCollation_;
\r
4022 latinOneFailed_ = false;
\r
4023 updateInternalState();
\r
4027 * Initializes utility iterators and byte buffer used by compare
\r
// Creates the utility iterators and scratch buffers used by compare and
// sort-key generation if they do not exist yet, or sets all of them to null.
// NOTE(review): the branch that reads `allocate` is not among the lines shown;
// presumably allocate == true selects the creation path - confirm.
4029 private final void initUtility(boolean allocate) {
\r
// m_srcUtilIter_ doubles as the "already allocated" marker for the whole group
4031 if (m_srcUtilIter_ == null) {
\r
4032 m_srcUtilIter_ = new StringUCharacterIterator();
\r
4033 m_srcUtilColEIter_ = new CollationElementIterator(m_srcUtilIter_, this);
\r
4034 m_tgtUtilIter_ = new StringUCharacterIterator();
\r
4035 m_tgtUtilColEIter_ = new CollationElementIterator(m_tgtUtilIter_, this);
\r
4036 m_utilBytes0_ = new byte[SORT_BUFFER_INIT_SIZE_CASE_]; // case
\r
4037 m_utilBytes1_ = new byte[SORT_BUFFER_INIT_SIZE_1_]; // primary
\r
4038 m_utilBytes2_ = new byte[SORT_BUFFER_INIT_SIZE_2_]; // secondary
\r
4039 m_utilBytes3_ = new byte[SORT_BUFFER_INIT_SIZE_3_]; // tertiary
\r
4040 m_utilBytes4_ = new byte[SORT_BUFFER_INIT_SIZE_4_]; // Quaternary
\r
4041 m_srcUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
\r
4042 m_tgtUtilCEBuffer_ = new int[CE_BUFFER_SIZE_];
\r
// release path: drop every iterator and buffer
4045 m_srcUtilIter_ = null;
\r
4046 m_srcUtilColEIter_ = null;
\r
4047 m_tgtUtilIter_ = null;
\r
4048 m_tgtUtilColEIter_ = null;
\r
4049 m_utilBytes0_ = null;
\r
4050 m_utilBytes1_ = null;
\r
4051 m_utilBytes2_ = null;
\r
4052 m_utilBytes3_ = null;
\r
4053 m_utilBytes4_ = null;
\r
4054 m_srcUtilCEBuffer_ = null;
\r
4055 m_tgtUtilCEBuffer_ = null;
\r
4059 // Consts for Latin-1 special processing
\r
// last char covered by the Latin-1 table; setUpLatinOne() loops ch from 0 to this value
4060 private static final int ENDOFLATINONERANGE_ = 0xFF;
\r
// initial length of one section of latinOneCEs_ (allocated as 3 * this value);
// the 50 slots past 0xFF are presumably for contraction entries, since
// contractionOffset starts at ENDOFLATINONERANGE_+1 - TODO confirm
4061 private static final int LATINONETABLELEN_ = (ENDOFLATINONERANGE_+50);
\r
// marker stored in all three sections of a char whose CEs do not fit the table
4062 private static final int BAIL_OUT_CE_ = 0xFF000000;
\r
4065 * Generate latin-1 tables
\r
4068 private class shiftValues {
\r
4069 int primShift = 24;
\r
4070 int secShift = 24;
\r
4071 int terShift = 24;
\r
4074 private final void
\r
// Splits CE into primary, secondary and tertiary bytes and ORs them into the
// three sections of latinOneCEs_ for ch (primary at [ch], secondary at
// [latinOneTableLen_+ch], tertiary at [2*latinOneTableLen_+ch]), using the
// shift positions held in sh.
4075 addLatinOneEntry(char ch, int CE, shiftValues sh) {
\r
4076 int primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
\r
4077 boolean reverseSecondary = false;
\r
// non-continuation CE: tertiary is masked with m_mask3_ and case-switched
4078 if(!isContinuation(CE)) {
\r
4079 tertiary = ((CE & m_mask3_));
\r
4080 tertiary ^= m_caseSwitch_;
\r
4081 reverseSecondary = true;
\r
// continuation CE: continuation marker and case bits are stripped; never reversed
4083 tertiary = (byte)((CE & CE_REMOVE_CONTINUATION_MASK_));
\r
4084 tertiary &= CE_REMOVE_CASE_;
\r
4085 reverseSecondary = false;
\r
// CE is shifted right in place, 8 bits at a time: secondary byte, second
// primary byte, then what remains is the first primary byte
4088 secondary = ((CE >>>= 8) & LAST_BYTE_MASK_);
\r
4089 primary2 = ((CE >>>= 8) & LAST_BYTE_MASK_);
\r
4090 primary1 = (CE >>> 8);
\r
4092 if(primary1 != 0) {
\r
4093 latinOneCEs_[ch] |= (primary1 << sh.primShift);
\r
4094 sh.primShift -= 8;
\r
4096 if(primary2 != 0) {
\r
// no room left for another primary byte: mark all three sections as bail-out
4097 if(sh.primShift < 0) {
\r
4098 latinOneCEs_[ch] = BAIL_OUT_CE_;
\r
4099 latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
\r
4100 latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
\r
4103 latinOneCEs_[ch] |= (primary2 << sh.primShift);
\r
4104 sh.primShift -= 8;
\r
4106 if(secondary != 0) {
\r
// French: existing secondaries move down one byte and the new one goes on top
4107 if(reverseSecondary && m_isFrenchCollation_) { // reverse secondary
\r
4108 latinOneCEs_[latinOneTableLen_+ch] >>>= 8; // make space for secondary
\r
4109 latinOneCEs_[latinOneTableLen_+ch] |= (secondary << 24);
\r
4110 } else { // normal case
\r
4111 latinOneCEs_[latinOneTableLen_+ch] |= (secondary << sh.secShift);
\r
4115 if(tertiary != 0) {
\r
4116 latinOneCEs_[2*latinOneTableLen_+ch] |= (tertiary << sh.terShift);
\r
4121 private final void
\r
4122 resizeLatinOneTable(int newSize) {
\r
4123 int newTable[] = new int[3*newSize];
\r
4124 int sizeToCopy = ((newSize<latinOneTableLen_)?newSize:latinOneTableLen_);
\r
4125 //uprv_memset(newTable, 0, newSize*sizeof(uint32_t)*3); // automatically cleared.
\r
4126 System.arraycopy(latinOneCEs_, 0, newTable, 0, sizeToCopy);
\r
4127 System.arraycopy(latinOneCEs_, latinOneTableLen_, newTable, newSize, sizeToCopy);
\r
4128 System.arraycopy(latinOneCEs_, 2*latinOneTableLen_, newTable, 2*newSize, sizeToCopy);
\r
4129 latinOneTableLen_ = newSize;
\r
4130 latinOneCEs_ = newTable;
\r
// Fills latinOneCEs_, the three-section lookup table read by compareUseLatin1 and
// getLatinOneContraction, for every char from 0 through ENDOFLATINONERANGE_.
// Per char, addLatinOneEntry packs the CE bytes into the entry at [ch], [latinOneTableLen_+ch]
// and [2*latinOneTableLen_+ch]. Entries that cannot be represented are set to BAIL_OUT_CE_ in
// all three sections, and some unsupported cases set latinOneFailed_ instead.
// Results for contractions are stored past the Latin-1 range, starting at ENDOFLATINONERANGE_+1.
// NOTE(review): this listing omits several lines (declarations of ch and CE, else branches,
// break/return statements, closing braces), so the boolean return value and the exact branch
// structure cannot be confirmed from here.
4133 private final boolean setUpLatinOne() {
\r
// (Re)allocate at the default size when there is no table yet or a reallocation was requested.
4134 if(latinOneCEs_ == null || m_reallocLatinOneCEs_) {
\r
4135 latinOneCEs_ = new int[3*LATINONETABLELEN_];
\r
4136 latinOneTableLen_ = LATINONETABLELEN_;
\r
4137 m_reallocLatinOneCEs_ = false;
\r
// Zero the table. NOTE(review): line 4138 is missing from the listing; presumably this is the
// else branch that clears a reused table - confirm.
4139 Arrays.fill(latinOneCEs_, 0);
\r
// The contraction cursor holder is created on first use.
4141 if(m_ContInfo_ == null) {
\r
4142 m_ContInfo_ = new ContractionInfo();
\r
4145 //StringBuffer sCh = new StringBuffer();
\r
4146 //CollationElementIterator it = getCollationElementIterator(sCh.toString());
\r
// One iterator is reused for all expansion/digit characters via setText below.
4147 CollationElementIterator it = getCollationElementIterator("");
\r
4149 shiftValues s = new shiftValues();
\r
// Next free slot (within each section) for contraction results.
4151 char contractionOffset = ENDOFLATINONERANGE_+1;
\r
4153 for(ch = 0; ch <= ENDOFLATINONERANGE_; ch++) {
\r
// Each level starts filling from the top byte (shift 24) of its table entry.
4154 s.primShift = 24; s.secShift = 24; s.terShift = 24;
\r
// NOTE(review): the conditions selecting between the two lookups below are missing from the listing.
4156 CE = m_trie_.getLatin1LinearValue(ch);
\r
4158 CE = m_trie_.getLeadValue(ch);
\r
// Fall back to the UCA trie when this collator's trie has no mapping.
4159 if(CE == CollationElementIterator.CE_NOT_FOUND_) {
\r
4160 CE = UCA_.m_trie_.getLeadValue(ch);
\r
// Ordinary CE: store it directly.
4163 if(!isSpecial(CE)) {
\r
4164 addLatinOneEntry(ch, CE, s);
\r
4166 switch (RuleBasedCollator.getTag(CE)) {
\r
4167 case CollationElementIterator.CE_EXPANSION_TAG_:
\r
4168 case CollationElementIterator.CE_DIGIT_TAG_:
\r
4169 //sCh.delete(0, sCh.length());
\r
4171 //it.setText(sCh.toString());
\r
// Let the iterator produce every CE for this single char and pack them one after another.
4172 it.setText(UCharacter.toString(ch));
\r
4173 while((CE = it.next()) != CollationElementIterator.NULLORDER) {
\r
// A negative shift means a level has run out of room in its 32-bit entry.
4174 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
\r
4175 latinOneCEs_[ch] = BAIL_OUT_CE_;
\r
4176 latinOneCEs_[latinOneTableLen_+ch] = BAIL_OUT_CE_;
\r
4177 latinOneCEs_[2*latinOneTableLen_+ch] = BAIL_OUT_CE_;
\r
4180 addLatinOneEntry(ch, CE, s);
\r
4183 case CollationElementIterator.CE_CONTRACTION_TAG_:
\r
4184 // here is the trick
\r
4185 // F2 is contraction. We do something very similar to contractions
\r
4186 // but have two indices, one in the real contraction table and the
\r
4187 // other to where we stuffed things. This hopes that we don't have
\r
4188 // many contractions (this should work for latin-1 tables).
\r
// Bits 12..23 are reused below for the Latin-1 table offset, so they must be empty here.
4190 if((CE & 0x00FFF000) != 0) {
\r
4191 latinOneFailed_ = true;
\r
4195 int UCharOffset = (CE & 0xFFFFFF) - m_contractionOffset_; //getContractionOffset(CE)]
\r
4197 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
\r
// The same tagged CE goes into all three sections so every level can find the contraction.
4199 latinOneCEs_[ch] = CE;
\r
4200 latinOneCEs_[latinOneTableLen_+ch] = CE;
\r
4201 latinOneCEs_[2*latinOneTableLen_+ch] = CE;
\r
4203 // We're going to jump into contraction table, pick the elements
\r
4206 //CE = *(contractionCEs + (UCharOffset - contractionIndex));
\r
4207 CE = m_contractionCE_[UCharOffset];
\r
4210 == CollationElementIterator.CE_EXPANSION_TAG_) {
\r
4211 int i; /* general counter */
\r
4212 //uint32_t *CEOffset = (uint32_t *)image+getExpansionOffset(CE); /* find the offset to expansion table */
\r
// Expansion CE layout as decoded here: bits 4..23 hold the table offset, bits 0..3 the count.
4213 int offset = ((CE & 0xFFFFF0) >> 4) - m_expansionOffset_; //it.getExpansionOffset(this, CE);
\r
4214 int size = CE & 0xF; // getExpansionCount(CE);
\r
4215 //CE = *CEOffset++;
\r
4216 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
\r
4217 for(i = 0; i<size; i++) {
\r
4218 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
\r
4219 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
\r
4220 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
\r
4221 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
\r
4224 addLatinOneEntry(contractionOffset, m_expansion_[offset+i], s);
\r
4226 } else { /* else, we do */
\r
// Count of 0: the expansion is read until a 0 entry in m_expansion_.
4227 while(m_expansion_[offset] != 0) {
\r
4228 if(s.primShift < 0 || s.secShift < 0 || s.terShift < 0) {
\r
4229 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
\r
4230 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
\r
4231 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
\r
4234 addLatinOneEntry(contractionOffset, m_expansion_[offset++], s);
\r
4237 contractionOffset++;
\r
// Plain CE as contraction result: store it in the next contraction slot.
4238 } else if(!isSpecial(CE)) {
\r
4239 addLatinOneEntry(contractionOffset++, CE, s);
\r
// Any other special result is not supported by the fast path for this slot.
4241 latinOneCEs_[contractionOffset] = BAIL_OUT_CE_;
\r
4242 latinOneCEs_[latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
\r
4243 latinOneCEs_[2*latinOneTableLen_+contractionOffset] = BAIL_OUT_CE_;
\r
4244 contractionOffset++;
\r
// Reset the shifts for the next contraction slot.
4247 s.primShift = 24; s.secShift = 24; s.terShift = 24;
\r
// The contraction area has reached the end of a section: double the table.
4248 if(contractionOffset == latinOneTableLen_) { // we need to reallocate
\r
4249 resizeLatinOneTable(2*latinOneTableLen_);
\r
// Repeats until the m_contractionIndex_ entry at UCharOffset is 0xFFFF.
// NOTE(review): the loop header and the UCharOffset increment are missing from the listing.
4251 } while(m_contractionIndex_[UCharOffset] != 0xFFFF);
\r
4254 case CollationElementIterator.CE_SPEC_PROC_TAG_:
\r
4256 // 0xB7 is a precontext character defined in UCA5.1, a special
\r
4257 // handle is implemented in order to save LatinOne table for
\r
4260 addLatinOneEntry(ch, CE, s);
\r
4263 latinOneFailed_ = true;
\r
4269 latinOneFailed_ = true;
\r
// Trim the table down to the slots actually used.
4275 if(contractionOffset < latinOneTableLen_) {
\r
4276 resizeLatinOneTable(contractionOffset);
\r
// Small mutable holder used by the Latin-1 fast path. Its "index" member is written by
// compareUseLatin1 before calling getLatinOneContraction and read back afterwards, so it serves
// as an in/out string position. NOTE(review): the member declaration itself is missing from
// this listing.
4281 private class ContractionInfo {
\r
// Shared holder instance; setUpLatinOne creates it when it is still null.
4285 ContractionInfo m_ContInfo_;
\r
// Resolves a contraction-tagged Latin-1 table entry to the CE to use for the requested level.
// strength: section selector (compareUseLatin1 passes 0, 1 or 2); used as a multiplier of latinOneTableLen_.
// CE: the contraction entry read from latinOneCEs_; its low 12 bits locate the contraction in
//     m_contractionIndex_ and bits 12..23 locate the results in latinOneCEs_.
// s: the string being compared; the char at m_ContInfo_.index is the candidate second char.
// m_ContInfo_.index is advanced when a char is consumed (a match or a skipped ignorable).
// Returns the matching table entry, the entry at latinOneOffset when nothing matches or the
// string has ended, or BAIL_OUT_CE_ when the next char is above ENDOFLATINONERANGE_.
// NOTE(review): the return-type line, the declaration of "offset", the loop header and the
// loop-body statements are missing from this listing.
4288 getLatinOneContraction(int strength, int CE, String s) {
\r
4289 //int strength, int CE, String s, Integer ind) {
\r
4290 int len = s.length();
\r
4291 //const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
\r
4292 int UCharOffset = (CE & 0xFFF) - m_contractionOffset_;
\r
// Offset that setUpLatinOne stored in bits 12..23 of the entry.
4294 int latinOneOffset = (CE & 0x00FFF000) >>> 12;
\r
4295 char schar = 0, tchar = 0;
\r
// NOTE(review): lines 4300-4303 are C syntax; presumably they sit inside a block comment whose
// delimiters are missing from this listing - confirm against the real file.
4300 if(s[*index] == 0) { // end of string
\r
4301 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
\r
4303 schar = s[*index];
\r
// End of string: no second char, so use the entry at latinOneOffset.
4307 if(m_ContInfo_.index == len) {
\r
4308 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
\r
4310 schar = s.charAt(m_ContInfo_.index);
\r
4314 while(schar > (tchar = m_contractionIndex_[UCharOffset+offset]/**(UCharOffset+offset)*/)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
\r
// Exact match: consume the char and return the result stored at the same relative offset.
4318 if (schar == tchar) {
\r
4319 m_ContInfo_.index++;
\r
4320 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset+offset]);
\r
// The next char is outside the table's range, so the fast path cannot decide.
4324 if(schar > ENDOFLATINONERANGE_ /*& 0xFF00*/) {
\r
4325 return BAIL_OUT_CE_;
\r
4327 // skip completely ignorables
\r
4328 int isZeroCE = m_trie_.getLeadValue(schar); //UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
\r
4329 if(isZeroCE == 0) { // we have to ignore completely ignorables
\r
4330 m_ContInfo_.index++;
\r
// No contraction matched: use the entry at latinOneOffset.
4334 return(latinOneCEs_[strength*latinOneTableLen_+latinOneOffset]);
\r
4341 * This is a fast strcoll, geared towards text in Latin-1.
\r
4342 * It supports contractions of size two, French secondaries
\r
4343 * and case switching. You can use it with strengths primary
\r
4344 * to tertiary. It does not support shifted and case level.
\r
4345 * It relies on the table built by setUpLatinOne. If it
\r
4346 * doesn't understand something, it will go to the regular
\r
4350 compareUseLatin1(String source, String target, int startOffset)
\r
// Compares source and target, both read from startOffset, using the latinOneCEs_ table.
// Each level is compared in its own pass: primary uses section 0 of the table, secondary
// (only when strength >= SECONDARY) uses the section at latinOneTableLen_, and tertiary
// (only when strength >= TERTIARY) uses the section at 2*latinOneTableLen_.
// Falls back to compareRegular(source, target, startOffset) when a char above
// ENDOFLATINONERANGE_ is met, when an entry is still special after contraction lookup, or when
// French secondaries are active and a contraction was seen during the primary pass.
// NOTE(review): loop headers, break statements and most return statements are missing from
// this listing, so the concrete less/greater return values are not visible here.
4352 int sLen = source.length();
\r
4353 int tLen = target.length();
\r
4355 int strength = getStrength();
\r
4357 int sIndex = startOffset, tIndex = startOffset;
\r
4358 char sChar = 0, tChar = 0;
\r
4359 int sOrder=0, tOrder=0;
\r
4361 boolean endOfSource = false;
\r
4363 //uint32_t *elements = coll->latinOneCEs;
\r
4365 boolean haveContractions = false; // if we have contractions in our string
\r
4366 // we cannot do French secondary
\r
// Start of the secondary section; bumped by another latinOneTableLen_ for the tertiary pass.
4368 int offset = latinOneTableLen_;
\r
4370 // Do the primary level
\r
4373 while(sOrder==0) { // this loop skips primary ignorables
\r
4374 // sOrder=getNextlatinOneCE(source);
\r
4375 if(sIndex==sLen) {
\r
4376 endOfSource = true;
\r
4379 sChar=source.charAt(sIndex++); //[sIndex++];
\r
4381 if(sChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
\r
4382 //fprintf(stderr, "R");
\r
4383 return compareRegular(source, target, startOffset);
\r
4385 sOrder = latinOneCEs_[sChar];
\r
4386 if(isSpecial(sOrder)) { // if we got a special
\r
4387 // specials can basically be either contractions or bail-out signs. If we get anything
\r
4388 // else, we'll bail out anyway
\r
4389 if(getTag(sOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
\r
// m_ContInfo_.index carries the read position into and back out of getLatinOneContraction.
4390 m_ContInfo_.index = sIndex;
\r
4391 sOrder = getLatinOneContraction(0, sOrder, source);
\r
4392 sIndex = m_ContInfo_.index;
\r
4393 haveContractions = true; // if there are contractions, we cannot do French secondary
\r
4394 // However, if there are contractions in the table, but we always use just one char,
\r
4395 // we might be able to do French. This should be checked out.
\r
4397 if(isSpecial(sOrder) /*== UCOL_BAIL_OUT_CE*/) {
\r
4398 //fprintf(stderr, "S");
\r
4399 return compareRegular(source, target, startOffset);
\r
4404 while(tOrder==0) { // this loop skips primary ignorables
\r
4405 // tOrder=getNextlatinOneCE(target);
\r
4406 if(tIndex==tLen) {
\r
4413 tChar=target.charAt(tIndex++); //[tIndex++];
\r
4414 if(tChar > ENDOFLATINONERANGE_) { // if we encounter non-latin-1, we bail out
\r
4415 //fprintf(stderr, "R");
\r
4416 return compareRegular(source, target, startOffset);
\r
4418 tOrder = latinOneCEs_[tChar];
\r
4419 if(isSpecial(tOrder)) {
\r
4420 // Handling specials, see the comments for source
\r
4421 if(getTag(tOrder) == CollationElementIterator.CE_CONTRACTION_TAG_) {
\r
4422 m_ContInfo_.index = tIndex;
\r
4423 tOrder = getLatinOneContraction(0, tOrder, target);
\r
4424 tIndex = m_ContInfo_.index;
\r
4425 haveContractions = true;
\r
4427 if(isSpecial(tOrder)/*== UCOL_BAIL_OUT_CE*/) {
\r
4428 //fprintf(stderr, "S");
\r
4429 return compareRegular(source, target, startOffset);
\r
4433 if(endOfSource) { // source is finished, but target is not, say the result.
\r
4437 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
\r
4438 sOrder = 0; tOrder = 0;
\r
4441 // compare current top bytes
\r
4442 if(((sOrder^tOrder)&0xFF000000)!=0) {
\r
4443 // top bytes differ, return difference
\r
// The unsigned shift clears the sign bit, so the int comparison orders the values as unsigned.
4444 if(sOrder >>> 8 < tOrder >>> 8) {
\r
4449 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
\r
4450 // since we must return enum value
\r
4453 // top bytes match, continue with following bytes
\r
4459 // after primary loop, we definitely know the sizes of strings,
\r
4460 // so we set it and use simpler loop for secondaries and tertiaries
\r
4461 //sLen = sIndex; tLen = tIndex;
\r
4462 if(strength >= SECONDARY) {
\r
4463 // adjust the table beginning
\r
4464 //latinOneCEs_ += coll->latinOneTableLen;
\r
4465 endOfSource = false;
\r
4467 if(!m_isFrenchCollation_) { // non French
\r
4468 // This loop is a simplified copy of primary loop
\r
4469 // at this point we know that whole strings are latin-1, so we don't
\r
4470 // check for that. We also know that we only have contractions as
\r
4472 //sIndex = 0; tIndex = 0;
\r
4473 sIndex = startOffset; tIndex = startOffset;
\r
4476 while(sOrder==0) {
\r
4477 if(sIndex==sLen) {
\r
4478 endOfSource = true;
\r
4481 sChar=source.charAt(sIndex++); //[sIndex++];
\r
4482 sOrder = latinOneCEs_[offset+sChar];
\r
4483 if(isSpecial(sOrder)) {
\r
4484 m_ContInfo_.index = sIndex;
\r
4485 sOrder = getLatinOneContraction(1, sOrder, source);
\r
4486 sIndex = m_ContInfo_.index;
\r
4490 while(tOrder==0) {
\r
4491 if(tIndex==tLen) {
\r
4498 tChar=target.charAt(tIndex++); //[tIndex++];
\r
4499 tOrder = latinOneCEs_[offset+tChar];
\r
4500 if(isSpecial(tOrder)) {
\r
4501 m_ContInfo_.index = tIndex;
\r
4502 tOrder = getLatinOneContraction(1, tOrder, target);
\r
4503 tIndex = m_ContInfo_.index;
\r
4510 if(sOrder == tOrder) {
\r
4511 sOrder = 0; tOrder = 0;
\r
4514 // see primary loop for comments on this
\r
4515 if(((sOrder^tOrder)&0xFF000000)!=0) {
\r
4516 if(sOrder >>> 8 < tOrder >>> 8) {
\r
4526 } else { // French
\r
4527 if(haveContractions) { // if we have contractions, we have to bail out
\r
4528 // since we don't really know how to handle them here
\r
4529 return compareRegular(source, target, startOffset);
\r
4531 // For French, we go backwards
\r
4532 sIndex = sLen; tIndex = tLen;
\r
4535 while(sOrder==0) {
\r
// Walking backwards, startOffset is the end of the scanned range.
4536 if(sIndex==startOffset) {
\r
4537 endOfSource = true;
\r
4540 sChar=source.charAt(--sIndex); //[--sIndex];
\r
4541 sOrder = latinOneCEs_[offset+sChar];
\r
4542 // don't even look for contractions
\r
4545 while(tOrder==0) {
\r
4546 if(tIndex==startOffset) {
\r
4553 tChar=target.charAt(--tIndex); //[--tIndex];
\r
4554 tOrder = latinOneCEs_[offset+tChar];
\r
4555 // don't even look for contractions
\r
4561 if(sOrder == tOrder) {
\r
4562 sOrder = 0; tOrder = 0;
\r
4565 // see the primary loop for comments
\r
4566 if(((sOrder^tOrder)&0xFF000000)!=0) {
\r
4567 if(sOrder >>> 8 < tOrder >>> 8) {
\r
4580 if(strength >= TERTIARY) {
\r
4581 // tertiary loop is the same as secondary (except no French)
\r
4582 offset += latinOneTableLen_;
\r
4583 //sIndex = 0; tIndex = 0;
\r
4584 sIndex = startOffset; tIndex = startOffset;
\r
4585 endOfSource = false;
\r
4587 while(sOrder==0) {
\r
4588 if(sIndex==sLen) {
\r
4589 endOfSource = true;
\r
4592 sChar=source.charAt(sIndex++); //[sIndex++];
\r
4593 sOrder = latinOneCEs_[offset+sChar];
\r
4594 if(isSpecial(sOrder)) {
\r
4595 m_ContInfo_.index = sIndex;
\r
4596 sOrder = getLatinOneContraction(2, sOrder, source);
\r
4597 sIndex = m_ContInfo_.index;
\r
4600 while(tOrder==0) {
\r
4601 if(tIndex==tLen) {
\r
4603 return 0; // if both strings are at the end, they are equal
\r
4608 tChar=target.charAt(tIndex++); //[tIndex++];
\r
4609 tOrder = latinOneCEs_[offset+tChar];
\r
4610 if(isSpecial(tOrder)) {
\r
4611 m_ContInfo_.index = tIndex;
\r
4612 tOrder = getLatinOneContraction(2, tOrder, target);
\r
4613 tIndex = m_ContInfo_.index;
\r
4619 if(sOrder == tOrder) {
\r
4620 sOrder = 0; tOrder = 0;
\r
4623 if(((sOrder^tOrder)&0xff000000)!=0) {
\r
4624 if(sOrder >>> 8 < tOrder >>> 8) {
\r
4638 * Get the version of this collator object.
\r
4639 * @return the version object associated with this collator
\r
4642 public VersionInfo getVersion() {
\r
// Builds a four-field VersionInfo:
//   fields 1-2: high and low byte of a 16-bit value combining the runtime major version
//               (shifted left 11), the builder major version (shifted left 6) and the
//               charset version (always 0 here);
//   field 3:    m_version_.getMinor();
//   field 4:    UCA_.m_UCA_version_.getMajor().
4643 /* RunTime version */
\r
4644 int rtVersion = VersionInfo.UCOL_RUNTIME_VERSION.getMajor();
\r
4645 /* Builder version*/
\r
4646 int bdVersion = m_version_.getMajor();
\r
4648 /* Charset Version. Need to get the version from cnv files
\r
4649 * makeconv should populate cnv files with version and
\r
4650 * an api has to be provided in ucnv.h to obtain this version
\r
4652 int csVersion = 0;
\r
4654 /* combine the version info */
\r
// The mask keeps only the low 16 bits, so bits shifted beyond bit 15 are dropped.
4655 int cmbVersion = ((rtVersion<<11) | (bdVersion<<6) | (csVersion)) & 0xFFFF;
\r
4657 /* Tailoring rules */
\r
4658 return VersionInfo.getInstance(cmbVersion>>8,
\r
4659 cmbVersion & 0xFF,
\r
4660 m_version_.getMinor(),
\r
4661 UCA_.m_UCA_version_.getMajor());
\r
// Original C assignments kept for reference; they mirror the four arguments above.
4663 // versionInfo[0] = (uint8_t)(cmbVersion>>8);
\r
4664 // versionInfo[1] = (uint8_t)cmbVersion;
\r
4665 // versionInfo[2] = coll->image->version[1];
\r
4666 // versionInfo[3] = coll->UCA->image->UCAVersion[0];
\r
4670 * Get the UCA version of this collator object.
\r
4671 * @return the UCA version object associated with this collator
\r
4674 public VersionInfo getUCAVersion() {
\r
// Returns the m_UCA_version_ object held by UCA_ directly; no copy is made here.
4675 return UCA_.m_UCA_version_;
\r
// When true, the next setUpLatinOne call replaces latinOneCEs_ with a fresh array of
// 3*LATINONETABLELEN_ ints and then resets this flag to false.
4678 private transient boolean m_reallocLatinOneCEs_;
\r