2 *******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.io.IOException;
10 import java.text.ParseException;
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.Collections;
14 import java.util.Enumeration;
15 import java.util.HashMap;
16 import java.util.List;
19 import com.ibm.icu.impl.IntTrieBuilder;
20 import com.ibm.icu.impl.Norm2AllModes;
21 import com.ibm.icu.impl.Normalizer2Impl;
22 import com.ibm.icu.impl.TrieBuilder;
23 import com.ibm.icu.impl.TrieIterator;
24 import com.ibm.icu.impl.UCharacterProperty;
25 import com.ibm.icu.impl.Utility;
26 import com.ibm.icu.lang.UCharacter;
27 import com.ibm.icu.lang.UCharacterCategory;
28 import com.ibm.icu.util.RangeValueIterator;
29 import com.ibm.icu.util.VersionInfo;
32 * Class for building a collator from a list of collation rules. This class is
33 * uses CollationRuleParser
35 * @author Syn Wee Quek
36 * @since release 2.2, June 11 2002
38 final class CollationParsedRuleBuilder {
39 // package private constructors ------------------------------------------
46 * @exception ParseException
47 * thrown when argument rules have an invalid syntax
// Constructor: parses the caller's rule string into a token list and caches a
// collation-element iterator over the root UCA collator for later CE
// generation. Throws ParseException when the rules have invalid syntax.
// NOTE(review): this listing is an elided excerpt — interior lines and the
// closing brace are missing; code tokens below are preserved verbatim.
49 CollationParsedRuleBuilder(String rules) throws ParseException {
50 m_parser_ = new CollationRuleParser(rules);
51 m_parser_.assembleTokenList();
52 m_utilColEIter_ = RuleBasedCollator.UCA_
53 .getCollationElementIterator("");
56 // package private inner classes -----------------------------------------
// Nested helper holding the inverse UCA table: maps collation elements back to
// positions so neighbouring CEs (gaps for tailoring) can be located.
// NOTE(review): excerpt is elided — many statements/braces are missing below;
// comments are hedged accordingly and code tokens are untouched.
61 static class InverseUCA {
62 // package private constructor ---------------------------------------
67 // package private data member ---------------------------------------
70 * Array list of characters
74 * Array list of continuation characters
76 char m_continuations_[];
79 * UCA version of inverse UCA table
81 VersionInfo m_UCA_version_;
83 // package private method --------------------------------------------
86 * Returns the previous inverse ces of the argument ces
91 * continuation ce to test
95 * an array to store the return results previous inverse ce
96 * and previous inverse continuation ce
97 * @return result of the inverse ce
// Locates (ce, contce) in the inverse table, then walks backwards while the
// strength-masked CE pair is unchanged, writing the previous pair into
// prevresult[0]/prevresult[1]. NULLORDER is used as a sentinel.
99 final int getInversePrevCE(int ce, int contce, int strength,
101 int result = findInverseCE(ce, contce);
104 prevresult[0] = CollationElementIterator.NULLORDER;
// Mask both CEs down to the significant bits for the requested strength.
108 ce &= STRENGTH_MASK_[strength];
109 contce &= STRENGTH_MASK_[strength];
112 prevresult[1] = contce;
114 while ((prevresult[0] & STRENGTH_MASK_[strength]) == ce
115 && (prevresult[1] & STRENGTH_MASK_[strength]) == contce
117 // this condition should prevent falling off the edge of the
119 // here, we end up in a singularity - zero
// Each inverse-table entry occupies 3 ints; index 0 = CE, 1 = continuation.
120 prevresult[0] = m_table_[3 * (--result)];
121 prevresult[1] = m_table_[3 * result + 1];
// Returns the strongest strength level at which the two CE pairs differ,
// starting from TERTIARY and strengthening (decrementing) while they match.
126 final int getCEStrengthDifference(int CE, int contCE, int prevCE,
128 int strength = Collator.TERTIARY;
129 while (((prevCE & STRENGTH_MASK_[strength]) != (CE & STRENGTH_MASK_[strength]) || (prevContCE & STRENGTH_MASK_[strength]) != (contCE & STRENGTH_MASK_[strength]))
130 && (strength != 0)) {
// Unsigned three-level comparison of two CE pairs: primaries first, then
// secondaries, then tertiaries, each merged from the base and continuation CE.
136 private int compareCEs(int source0, int source1, int target0,
138 int s1 = source0, s2, t1 = target0, t2;
139 if (RuleBasedCollator.isContinuation(source1)) {
144 if (RuleBasedCollator.isContinuation(target1)) {
151 if (s1 == t1 && s2 == t2) {
// Primary weights: high 16 bits of base CE | high 16 bits of continuation.
154 s = (s1 & 0xFFFF0000) | ((s2 & 0xFFFF0000) >>> 16);
155 t = (t1 & 0xFFFF0000) | ((t2 & 0xFFFF0000) >>> 16);
// Secondary weights from byte 1 of each CE.
157 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00) >> 8;
158 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00) >> 8;
// Tertiary weights from byte 0 of each CE.
160 s = (s1 & 0x000000FF) << 8 | (s2 & 0x000000FF);
161 t = (t1 & 0x000000FF) << 8 | (t2 & 0x000000FF);
162 return Utility.compareUnsigned(s, t);
164 return Utility.compareUnsigned(s, t);
167 return Utility.compareUnsigned(s, t);
172 * Finding the inverse CE of the argument CEs
// Binary search over the 3-int-per-entry inverse table for the CE pair.
180 int findInverseCE(int ce, int contce) {
182 int top = m_table_.length / 3;
185 while (bottom < top - 1) {
186 result = (top + bottom) >> 1;
187 int first = m_table_[3 * result];
188 int second = m_table_[3 * result + 1];
189 int comparison = compareCEs(first, second, ce, contce);
190 if (comparison > 0) {
192 } else if (comparison < 0) {
203 * Getting gap offsets in the inverse UCA
207 * @exception Exception
208 * thrown when error occurs while finding the collation
// Computes the low/high gap boundaries (m_gapsLo_/m_gapsHi_) for each of the
// three basic strengths of a token list, handling implicit-primary and
// indirect base CEs specially.
211 void getInverseGapPositions(
212 CollationRuleParser.TokenListHeader listheader)
214 // reset all the gaps
215 CollationRuleParser.Token token = listheader.m_first_;
216 int tokenstrength = token.m_strength_;
// Clear per-strength gap state before recomputing it.
218 for (int i = 0; i < 3; i++) {
219 listheader.m_gapsHi_[3 * i] = 0;
220 listheader.m_gapsHi_[3 * i + 1] = 0;
221 listheader.m_gapsHi_[3 * i + 2] = 0;
222 listheader.m_gapsLo_[3 * i] = 0;
223 listheader.m_gapsLo_[3 * i + 1] = 0;
224 listheader.m_gapsLo_[3 * i + 2] = 0;
225 listheader.m_numStr_[i] = 0;
226 listheader.m_fStrToken_[i] = null;
227 listheader.m_lStrToken_[i] = null;
228 listheader.m_pos_[i] = -1;
// Case 1: base CE has an implicit primary (lead byte inside the implicit
// range) — compute the gap from the implicit-CE generator.
231 if ((listheader.m_baseCE_ >>> 24) >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_
232 && (listheader.m_baseCE_ >>> 24) <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_) {
234 listheader.m_pos_[0] = 0;
235 int t1 = listheader.m_baseCE_;
236 int t2 = listheader.m_baseContCE_;
237 listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
238 listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
239 listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
240 int primaryCE = t1 & RuleBasedCollator.CE_PRIMARY_MASK_
241 | (t2 & RuleBasedCollator.CE_PRIMARY_MASK_) >>> 16;
// Next implicit primary = raw(primary) + 1 converted back to implicit form.
242 primaryCE = RuleBasedCollator.impCEGen_
243 .getImplicitFromRaw(RuleBasedCollator.impCEGen_
244 .getRawFromImplicit(primaryCE) + 1);
246 t1 = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
247 t2 = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_
248 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
250 // if (listheader.m_baseCE_ < 0xEF000000) {
251 // // first implicits have three byte primaries, with a gap of
252 // // one so we esentially need to add 2 to the top byte in
253 // // listheader.m_baseContCE_
257 // // second implicits have four byte primaries, with a gap of
258 // // IMPLICIT_LAST2_MULTIPLIER_
259 // // Now, this guy is not really accessible here, so until we
260 // // find a better way to pass it around, assume that the gap
264 listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
265 listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
266 listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
// Case 2: indirect list with a known next CE — gap runs from base to next.
267 } else if (listheader.m_indirect_ == true
268 && listheader.m_nextCE_ != 0) {
269 listheader.m_pos_[0] = 0;
270 int t1 = listheader.m_baseCE_;
271 int t2 = listheader.m_baseContCE_;
272 listheader.m_gapsLo_[0] = mergeCE(t1, t2, Collator.PRIMARY);
273 listheader.m_gapsLo_[1] = mergeCE(t1, t2, Collator.SECONDARY);
274 listheader.m_gapsLo_[2] = mergeCE(t1, t2, Collator.TERTIARY);
275 t1 = listheader.m_nextCE_;
276 t2 = listheader.m_nextContCE_;
277 listheader.m_gapsHi_[0] = mergeCE(t1, t2, Collator.PRIMARY);
278 listheader.m_gapsHi_[1] = mergeCE(t1, t2, Collator.SECONDARY);
279 listheader.m_gapsHi_[2] = mergeCE(t1, t2, Collator.TERTIARY);
// Case 3 (general): walk the token list, recording first/last token per
// strength and the inverse-table position of the next CE at that strength.
282 if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
283 listheader.m_pos_[tokenstrength] = getInverseNext(
284 listheader, tokenstrength);
285 if (listheader.m_pos_[tokenstrength] >= 0) {
286 listheader.m_fStrToken_[tokenstrength] = token;
288 // The CE must be implicit, since it's not in the
291 throw new Exception("Internal program error");
295 while (token != null && token.m_strength_ >= tokenstrength) {
296 if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_) {
297 listheader.m_lStrToken_[tokenstrength] = token;
299 token = token.m_next_;
301 if (tokenstrength < CE_BASIC_STRENGTH_LIMIT_ - 1) {
302 // check if previous interval is the same and merge the
303 // intervals if it is so
304 if (listheader.m_pos_[tokenstrength] == listheader.m_pos_[tokenstrength + 1]) {
305 listheader.m_fStrToken_[tokenstrength] = listheader.m_fStrToken_[tokenstrength + 1];
306 listheader.m_fStrToken_[tokenstrength + 1] = null;
307 listheader.m_lStrToken_[tokenstrength + 1] = null;
308 listheader.m_pos_[tokenstrength + 1] = -1;
312 tokenstrength = token.m_strength_;
// Finally translate each recorded inverse-table position into the merged
// gap-boundary CE values for the three strengths.
317 for (int st = 0; st < 3; st++) {
318 int pos = listheader.m_pos_[st];
320 int t1 = m_table_[3 * pos];
321 int t2 = m_table_[3 * pos + 1];
322 listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2,
324 listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2,
326 listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24
329 // t1 = m_table_[3 * pos];
330 // t2 = m_table_[3 * pos + 1];
331 t1 = listheader.m_baseCE_;
332 t2 = listheader.m_baseContCE_;
334 listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2,
336 listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2,
338 listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24
346 * Gets the next CE in the inverse table
// Scans forward from the base CE's position until a CE pair differing at the
// requested strength is found; records it in m_nextCE_/m_nextContCE_.
354 private final int getInverseNext(
355 CollationRuleParser.TokenListHeader listheader, int strength) {
356 int ce = listheader.m_baseCE_;
357 int secondce = listheader.m_baseContCE_;
358 int result = findInverseCE(ce, secondce);
364 ce &= STRENGTH_MASK_[strength];
365 secondce &= STRENGTH_MASK_[strength];
368 int nextcontce = secondce;
370 while ((nextce & STRENGTH_MASK_[strength]) == ce
371 && (nextcontce & STRENGTH_MASK_[strength]) == secondce) {
372 nextce = m_table_[3 * (++result)];
373 nextcontce = m_table_[3 * result + 1];
376 listheader.m_nextCE_ = nextce;
377 listheader.m_nextContCE_ = nextcontce;
// Static data + (elided) static initializer: loads the inverse UCA table via
// CollatorReader and validates that its UCA version matches the runtime UCA.
// NOTE(review): excerpt is elided; code tokens are preserved verbatim.
383 // package private data members ------------------------------------------
386 * Inverse UCA, instantiate only when required
388 static final InverseUCA INVERSE_UCA_;
391 * UCA and Inverse UCA version do not match
393 private static final String INV_UCA_VERSION_MISMATCH_ = "UCA versions of UCA and inverse UCA should match";
396 * UCA and Inverse UCA version do not match
398 private static final String UCA_NOT_INSTANTIATED_ = "UCA is not instantiated!";
401 * Initializing the inverse UCA
404 InverseUCA temp = null;
406 temp = CollatorReader.getInverseUCA();
407 } catch (IOException e) {
410 * try { String invdat = "/com/ibm/icu/impl/data/invuca.icu";
412 * CollationParsedRuleBuilder.class.getResourceAsStream(invdat);
413 * BufferedInputStream b = new BufferedInputStream(i, 110000);
414 * INVERSE_UCA_ = CollatorReader.readInverseUCA(b); b.close();
415 * i.close(); } catch (Exception e) { e.printStackTrace(); throw new
416 * RuntimeException(e.getMessage()); }
// Version mismatch between UCA and inverse UCA is a fatal build error.
419 if (temp != null && RuleBasedCollator.UCA_ != null) {
420 if (!temp.m_UCA_version_
421 .equals(RuleBasedCollator.UCA_.m_UCA_version_)) {
422 throw new RuntimeException(INV_UCA_VERSION_MISMATCH_);
425 throw new RuntimeException(UCA_NOT_INSTANTIATED_);
// Entry point: applies the parsed rules to the given collator. With actual
// rules (or a remove-set) it builds a tailoring table; otherwise the collator
// is initialised straight from the UCA tables, and options are applied.
// NOTE(review): elided excerpt; closing braces are missing below.
431 // package private methods -----------------------------------------------
434 * Parse and sets the collation rules in the argument collator
438 * @exception Exception
439 * thrown when internal program error occurs
441 void setRules(RuleBasedCollator collator) throws Exception {
442 if (m_parser_.m_resultLength_ > 0 || m_parser_.m_removeSet_ != null) {
443 // we have a set of rules, let's make something of it
444 assembleTailoringTable(collator);
445 } else { // no rules, but no error either must be only options
446 // We will init the collator from UCA
447 collator.setWithUCATables();
449 // And set only the options
450 m_parser_.setDefaultOptionsInCollator(collator);
// Copies the UCA collation elements for every code point in [start, end] that
// is not already mapped in the build table (or whose contraction entry is
// missing its starting element), by iterating CEs and adding an element.
// NOTE(review): elided excerpt; code tokens preserved verbatim.
453 private void copyRangeFromUCA(BuildTable t, int start, int end) {
455 for (u = start; u <= end; u++) {
456 // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND
457 int CE = t.m_mapping_.getValue(u);
458 if (CE == CE_NOT_FOUND_
459 // this test is for contractions that are missing the starting
460 // element. Looks like latin-1 should be done before
461 // assembling the table, even if it results in more false
463 || (isContractionTableElement(CE) && getCE(
464 t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) {
465 // m_utilElement_.m_uchars_ = str.toString();
// Reset the shared scratch element for this single code point.
466 m_utilElement_.m_uchars_ = UCharacter.toString(u);
467 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
468 m_utilElement_.m_prefix_ = 0;
469 m_utilElement_.m_CELength_ = 0;
470 m_utilElement_.m_prefixChars_ = null;
471 m_utilColEIter_.setText(m_utilElement_.m_uchars_);
// Collect all CEs the UCA iterator yields for this code point.
472 while (CE != CollationElementIterator.NULLORDER) {
473 CE = m_utilColEIter_.next();
474 if (CE != CollationElementIterator.NULLORDER) {
475 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
478 addAnElement(t, m_utilElement_);
// Core build routine (javadoc fragments retained above the method): assigns
// CE values for all rule tokens, resolves the variable-top token, creates
// elements, copies Latin-1 and requested ranges from the UCA, merges UCA
// contractions/pre-context entries, and finally assembles the table.
// NOTE(review): heavily elided excerpt; code tokens preserved verbatim.
484 * 2. Eliminate the negative lists by doing the following for each non-null
485 * negative list: o if previousCE(baseCE, strongestN) != some ListHeader X's
486 * baseCE, create new ListHeader X o reverse the list, add to the end of X's
487 * positive list. Reset the strength of the first item you add, based on the
488 * stronger strength levels of the two lists.
490 * 3. For each ListHeader with a non-null positive list: o Find all
491 * character strings with CEs between the baseCE and the next/previous CE,
492 * at the strength of the first token. Add these to the tailoring. ? That
493 * is, if UCA has ... x <<< X << x' <<< X' < y ..., and the tailoring has &
494 * x < z... ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
496 * It is possible that this part should be done even while constructing list
497 * The problem is that it is unknown what is going to be the strongest
498 * weight. So we might as well do it here o Allocate CEs for each token in
499 * the list, based on the total number N of the largest level difference,
500 * and the gap G between baseCE and nextCE at that level. The relation *
501 * between the last item and nextCE is the same as the strongest strength. o
502 * Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) ? There are 3
503 * primary items: a, d, e. Fit them into the primary gap. Then fit b and c
504 * into the secondary gap between a and d, then fit q into the tertiary gap
505 * between b and c. o Example: baseCE << b <<< q << c * nextCE(X,2) ? There
506 * are 2 secondary items: b, c. Fit them into the secondary gap. Then fit q
507 * into the tertiary gap between b and c. o When incrementing primary
508 * values, we will not cross high byte boundaries except where there is only
509 * a single-byte primary. That is to ensure that the script reordering will
513 * the rule based collator to update
514 * @exception Exception
515 * thrown when internal program error occurs
517 void assembleTailoringTable(RuleBasedCollator collator) throws Exception {
// Phase 1: initialise CE buffers for every non-empty token list.
519 for (int i = 0; i < m_parser_.m_resultLength_; i++) {
520 // now we need to generate the CEs
521 // We stuff the initial value in the buffers, and increase the
522 // appropriate buffer according to strength
523 if (m_parser_.m_listHeader_[i].m_first_ != null) {
524 // if there are any elements
525 // due to the way parser works, subsequent tailorings
526 // may remove all the elements from a sequence, therefore
527 // leaving an empty tailoring sequence.
528 initBuffers(m_parser_.m_listHeader_[i]);
// Phase 2: record the variable-top primary and unlink its token.
532 if (m_parser_.m_variableTop_ != null) {
533 // stuff the variable top value
534 m_parser_.m_options_.m_variableTopValue_ = m_parser_.m_variableTop_.m_CE_[0] >>> 16;
535 // remove it from the list
536 if (m_parser_.m_variableTop_.m_listHeader_.m_first_ == m_parser_.m_variableTop_) { // first
539 m_parser_.m_variableTop_.m_listHeader_.m_first_ = m_parser_.m_variableTop_.m_next_;
541 if (m_parser_.m_variableTop_.m_listHeader_.m_last_ == m_parser_.m_variableTop_) {
543 m_parser_.m_variableTop_.m_listHeader_.m_last_ = m_parser_.m_variableTop_.m_previous_;
545 if (m_parser_.m_variableTop_.m_next_ != null) {
546 m_parser_.m_variableTop_.m_next_.m_previous_ = m_parser_.m_variableTop_.m_previous_;
548 if (m_parser_.m_variableTop_.m_previous_ != null) {
549 m_parser_.m_variableTop_.m_previous_.m_next_ = m_parser_.m_variableTop_.m_next_;
553 BuildTable t = new BuildTable(m_parser_);
555 // After this, we have assigned CE values to all regular CEs now we
556 // will go through list once more and resolve expansions, make
557 // UCAElements structs and add them to table
558 for (int i = 0; i < m_parser_.m_resultLength_; i++) {
559 // now we need to generate the CEs
560 // We stuff the initial value in the buffers, and increase the
561 // appropriate buffer according to strength */
562 createElements(t, m_parser_.m_listHeader_[i]);
565 m_utilElement_.clear();
// Latin-1 range is always copied from the UCA first.
568 copyRangeFromUCA(t, 0, 0xFF);
570 // add stuff for copying
571 if (m_parser_.m_copySet_ != null) {
573 for (i = 0; i < m_parser_.m_copySet_.getRangeCount(); i++) {
574 copyRangeFromUCA(t, m_parser_.m_copySet_.getRangeStart(i),
575 m_parser_.m_copySet_.getRangeEnd(i));
579 // copy contractions from the UCA - this is felt mostly for cyrillic
580 char conts[] = RuleBasedCollator.UCA_CONTRACTIONS_;
581 int maxUCAContractionLength = RuleBasedCollator.MAX_UCA_CONTRACTION_LENGTH;
583 while (conts[offset] != 0) {
584 // A continuation is NUL-terminated and NUL-padded
585 // except if it has the maximum length.
586 int contractionLength = maxUCAContractionLength;
587 while (contractionLength > 0 && conts[offset + contractionLength - 1] == 0) {
590 int first = Character.codePointAt(conts, offset);
591 int firstLength = Character.charCount(first);
592 int tailoredCE = t.m_mapping_.getValue(first);
593 Elements prefixElm = null;
594 if (tailoredCE != CE_NOT_FOUND_) {
595 boolean needToAdd = true;
596 if (isContractionTableElement(tailoredCE)) {
597 if (isTailored(t.m_contractions_, tailoredCE, conts,
598 offset + firstLength) == true) {
602 if (!needToAdd && isPrefix(tailoredCE)
603 && conts[offset + 1] == 0) {
604 // pre-context character in UCA
605 // The format for pre-context character is
606 // conts[0]: baseCP conts[1]:0 conts[2]:pre-context CP
607 Elements elm = new Elements();
609 elm.m_uchars_ = Character.toString(conts[offset]);
610 elm.m_cPoints_ = m_utilElement_.m_uchars_;
611 elm.m_prefixChars_ = Character.toString(conts[offset + 2]);
612 elm.m_prefix_ = 0; // TODO(claireho) : confirm!
613 prefixElm = t.m_prefixLookup_.get(elm);
614 if ((prefixElm == null)
615 || (prefixElm.m_prefixChars_.charAt(0) != conts[offset + 2])) {
// Code points in the remove-set are excluded from the tailoring.
619 if (m_parser_.m_removeSet_ != null
620 && m_parser_.m_removeSet_.contains(first)) {
624 if (needToAdd == true) {
625 // we need to add if this contraction is not tailored.
626 if (conts[offset + 1] != 0) { // not precontext
627 m_utilElement_.m_prefix_ = 0;
628 m_utilElement_.m_prefixChars_ = null;
629 m_utilElement_.m_uchars_ = new String(conts, offset, contractionLength);
630 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
631 m_utilElement_.m_CELength_ = 0;
632 m_utilColEIter_.setText(m_utilElement_.m_uchars_);
633 } else { // add a pre-context element
635 m_utilElement_.m_uchars_ = Character.toString(conts[offset]);
636 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
637 m_utilElement_.m_CELength_ = 0;
638 m_utilElement_.m_prefixChars_ = Character.toString(conts[offset + 2]);
639 if (prefixElm == null) {
640 m_utilElement_.m_prefix_ = 0;
// NOTE(review): line 642 assigns m_prefix_ to itself; the intended
// assignment (from prefixElm.m_prefix_) is visible commented out below.
// Left untouched here — flagged for confirmation, matching the TODO.
641 } else { // TODO (claireho): confirm!
642 m_utilElement_.m_prefix_ = m_utilElement_.m_prefix_;
643 // m_utilElement_.m_prefix_= prefixElm.m_prefix_;
645 m_utilColEIter_.setText(m_utilElement_.m_prefixChars_);
646 while (m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
647 // count number of keys for pre-context char.
650 m_utilColEIter_.setText(m_utilElement_.m_prefixChars_ + m_utilElement_.m_uchars_);
651 // Skip the keys for prefix character, then copy the
653 while ((preKeyLen-- > 0)
654 && m_utilColEIter_.next() != CollationElementIterator.NULLORDER) {
660 int CE = m_utilColEIter_.next();
661 if (CE != CollationElementIterator.NULLORDER) {
662 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = CE;
667 addAnElement(t, m_utilElement_);
669 } else if (m_parser_.m_removeSet_ != null
670 && m_parser_.m_removeSet_.contains(first)) {
671 copyRangeFromUCA(t, first, first);
// UCA contraction records are fixed-width; advance to the next record.
674 offset += maxUCAContractionLength;
677 // Add completely ignorable elements
678 processUCACompleteIgnorables(t);
683 // still need to produce compatibility closure
684 assembleTable(t, collator);
// Small holder used during CE allocation: seven weight ranges plus forbidden
// low/high bounds. NOTE(review): elided excerpt; tokens preserved verbatim.
687 // private inner classes -------------------------------------------------
689 @SuppressWarnings("unused")
690 private static class CEGenerator {
691 // package private data members --------------------------------------
693 WeightRange m_ranges_[];
701 int m_fLow_; // forbidden Low
702 int m_fHigh_; // forbidden High
704 // package private constructor ---------------------------------------
// Pre-allocates all seven WeightRange slots.
707 m_ranges_ = new WeightRange[7];
708 for (int i = 6; i >= 0; i--) {
709 m_ranges_[i] = new WeightRange();
// Value type describing a weight range; ordered by unsigned start weight.
// NOTE(review): elided excerpt; field declarations are missing from view.
714 private static class WeightRange implements Comparable<WeightRange> {
715 // public methods ----------------------------------------------------
718 * Compares this object with target
720 * @param target object to compare with
721 * @return 0 if equals, 1 if this is > target, -1 otherwise
// Unsigned comparison is required because weights use the full 32-bit range.
723 public int compareTo(WeightRange target) {
724 return Utility.compareUnsigned(m_start_, target.m_start_);
730 public void clear() {
739 // package private data members --------------------------------------
748 // package private constructor ---------------------------------------
755 * Copy constructor. Cloneable is troublesome, needs to check for
// Field-by-field copy; all members appear to be primitives here.
761 WeightRange(WeightRange source) {
762 m_start_ = source.m_start_;
763 m_end_ = source.m_end_;
764 m_length_ = source.m_length_;
765 m_count_ = source.m_count_;
766 m_length2_ = source.m_length2_;
767 m_count2_ = source.m_count2_;
// Tracks maximum Jamo (Hangul conjoining) expansion sizes: end-expansion CEs,
// a parallel is-vowel flag list, and per-type max sizes (L/V/T).
// NOTE(review): elided excerpt; tokens preserved verbatim.
771 private static class MaxJamoExpansionTable {
772 // package private data members --------------------------------------
774 List<Integer> m_endExpansionCE_;
775 // vector of booleans
776 List<Boolean> m_isV_;
781 // package private constructor ---------------------------------------
// Seeds both lists with a dummy element for easier index manipulation.
783 MaxJamoExpansionTable() {
784 m_endExpansionCE_ = new ArrayList<Integer>();
785 m_isV_ = new ArrayList<Boolean>();
786 m_endExpansionCE_.add(Integer.valueOf(0));
787 m_isV_.add(Boolean.FALSE);
// Copy constructor: shallow per-element copies of the lists and size fields.
793 MaxJamoExpansionTable(MaxJamoExpansionTable table) {
794 m_endExpansionCE_ = new ArrayList<Integer>(table.m_endExpansionCE_);
795 m_isV_ = new ArrayList<Boolean>(table.m_isV_);
796 m_maxLSize_ = table.m_maxLSize_;
797 m_maxVSize_ = table.m_maxVSize_;
798 m_maxTSize_ = table.m_maxTSize_;
// Parallel lists of end-expansion CEs and their expansion sizes; seeded with
// a dummy zero entry. NOTE(review): elided excerpt; tokens preserved verbatim.
802 private static class MaxExpansionTable {
803 // package private constructor --------------------------------------
805 MaxExpansionTable() {
806 m_endExpansionCE_ = new ArrayList<Integer>();
807 m_expansionCESize_ = new ArrayList<Byte>();
808 m_endExpansionCE_.add(Integer.valueOf(0));
809 m_expansionCESize_.add(Byte.valueOf((byte) 0));
// Copy constructor.
812 MaxExpansionTable(MaxExpansionTable table) {
813 m_endExpansionCE_ = new ArrayList<Integer>(table.m_endExpansionCE_);
814 m_expansionCESize_ = new ArrayList<Byte>(table.m_expansionCESize_);
817 // package private data member --------------------------------------
819 List<Integer> m_endExpansionCE_;
820 List<Byte> m_expansionCESize_;
// One contraction entry set: parallel CE list and code-point buffer.
823 private static class BasicContractionTable {
824 // package private constructors -------------------------------------
826 BasicContractionTable() {
827 m_CEs_ = new ArrayList<Integer>();
828 m_codePoints_ = new StringBuilder();
831 // package private data members -------------------------------------
833 StringBuilder m_codePoints_;
834 List<Integer> m_CEs_;
// Aggregate contraction table: per-entry BasicContractionTables plus flat
// CE/code-point/offset lists, sharing the trie builder used for mappings.
// NOTE(review): elided excerpt; tokens preserved verbatim.
837 private static class ContractionTable {
838 // package private constructor --------------------------------------
841 * Builds a contraction table
845 ContractionTable(IntTrieBuilder mapping) {
846 m_mapping_ = mapping;
847 m_elements_ = new ArrayList<BasicContractionTable>();
848 m_CEs_ = new ArrayList<Integer>();
849 m_codePoints_ = new StringBuilder();
850 m_offsets_ = new ArrayList<Integer>();
851 m_currentTag_ = CE_NOT_FOUND_TAG_;
855 * Copies a contraction table. Not all data will be copied into their
// Copy constructor — note m_mapping_ is shared (not deep-copied), per the
// javadoc fragment above; callers may rebind it (see BuildTable copy ctor).
860 ContractionTable(ContractionTable table) {
861 m_mapping_ = table.m_mapping_;
862 m_elements_ = new ArrayList<BasicContractionTable>(table.m_elements_);
863 m_codePoints_ = new StringBuilder(table.m_codePoints_);
864 m_CEs_ = new ArrayList<Integer>(table.m_CEs_);
865 m_offsets_ = new ArrayList<Integer>(table.m_offsets_);
866 m_currentTag_ = table.m_currentTag_;
869 // package private data members ------------------------------------
872 * Vector of BasicContractionTable
874 List<BasicContractionTable> m_elements_;
875 IntTrieBuilder m_mapping_;
876 StringBuilder m_codePoints_;
877 List<Integer> m_CEs_;
878 List<Integer> m_offsets_;
// Compact combining-mark lookup: `index` holds accumulated counts per
// combining class (0-255), `cPoints` the marks grouped by class.
// NOTE(review): elided excerpt; tokens preserved verbatim.
883 * Private class for combining mark table. The table is indexed by the class
886 @SuppressWarnings("unused")
887 private static class CombinClassTable {
889 * accumulated numbers of combining marks.
891 int[] index = new int[256];
894 * code point array for combining marks.
912 * Copy the combining mark table from ccc and index in compact way.
919 * : index of combining classes(0-255)
// Packs cps (indexed as (class << 8) + j) into the dense cPoints array.
921 void generate(char[] cps, int numOfCM, int[] ccIndex) {
924 cPoints = new char[numOfCM];
925 for (int i = 0; i < 256; i++) {
926 for (int j = 0; j < ccIndex[i]; j++) {
927 cPoints[count++] = cps[(i << 8) + j];
935 * Get first CM(combining mark) with the combining class value cClass.
938 * : combining class value.
939 * @return combining mark codepoint or 0 if no combining make with class
// Returns the first mark of the class, or the elided zero result when the
// table is empty, the class is 0, or the class's slice is empty.
942 char GetFirstCM(int cClass) {
944 if (cPoints == null || cClass == 0
945 || index[cClass] == index[cClass - 1]) {
949 return cPoints[index[cClass - 1]];
953 * Get next CM(combining mark) with the combining class value cClass.
954 * Return combining mark codepoint or 0 if no next CM.
// Iterates within the class slice using the (elided) cursor `pos`.
958 || index[curClass] == (index[curClass - 1] + pos)) {
961 return cPoints[index[curClass - 1] + (pos++)];
964 // private data members
// Working state for table assembly: trie mapping, expansions, contractions,
// max-expansion tables, unsafe-codepoint bitsets, and a prefix lookup map.
// Implements TrieBuilder.DataManipulate so the trie can fold lead surrogates.
// NOTE(review): elided excerpt; tokens preserved verbatim.
969 private static final class BuildTable implements TrieBuilder.DataManipulate {
970 // package private methods ------------------------------------------
973 * For construction of the Trie tables. Has to be labeled public
975 * @param cp The value of the code point.
976 * @param offset The value of the offset.
977 * @return data offset or 0
// Scans the 0x400 code points under a lead surrogate; returns a SURROGATE_TAG
// special CE when any real data is present, skipping IMPLICIT/NOT_FOUND tags.
979 public int getFoldedValue(int cp, int offset) {
980 int limit = cp + 0x400;
982 int value = m_mapping_.getValue(cp);
983 boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
984 int tag = getCETag(value);
985 if (inBlockZero == true) {
986 cp += TrieBuilder.DATA_BLOCK_LENGTH;
987 } else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_ || tag == CE_NOT_FOUND_TAG_))) {
988 // These are values that are starting in either UCA
989 // (IMPLICIT_TAG) or in the tailorings (NOT_FOUND_TAG).
990 // Presence of these tags means that there is nothing in
991 // this position and that it should be skipped.
992 return RuleBasedCollator.CE_SPECIAL_FLAG_
993 | (CE_SURROGATE_TAG_ << 24) | offset;
1001 // package private constructor --------------------------------------
// Builds fresh state seeded from the UCA: trie initialised to the NOT_FOUND
// special CE, and the UCA's max-expansion data copied in.
1006 BuildTable(CollationRuleParser parser) {
1007 m_collator_ = new RuleBasedCollator();
1008 m_collator_.setWithUCAData();
1009 MaxExpansionTable maxet = new MaxExpansionTable();
1010 MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable();
1011 m_options_ = parser.m_options_;
1012 m_expansions_ = new ArrayList<Integer>();
1013 // Do your own mallocs for the structure, array and have linear
1015 int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
1016 | (CE_NOT_FOUND_TAG_ << 24);
1017 // temporary fix for jb3822, 0x100000 -> 30000
1018 m_mapping_ = new IntTrieBuilder(null, 0x30000, trieinitialvalue,
1019 trieinitialvalue, true);
1020 m_prefixLookup_ = new HashMap<Elements, Elements>();
1021 // uhash_open(prefixLookupHash, prefixLookupComp);
1022 m_contractions_ = new ContractionTable(m_mapping_);
1023 // copy UCA's maxexpansion and merge as we go along
1024 m_maxExpansions_ = maxet;
1025 // adding an extra initial value for easier manipulation
1026 for (int i = 0; i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i++) {
1027 maxet.m_endExpansionCE_.add(Integer.valueOf(
1028 RuleBasedCollator.UCA_.m_expansionEndCE_[i]));
1029 maxet.m_expansionCESize_.add(Byte.valueOf(
1030 RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i]));
1032 m_maxJamoExpansions_ = maxjet;
1034 m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_];
1035 m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_];
1036 Arrays.fill(m_unsafeCP_, (byte) 0);
1037 Arrays.fill(m_contrEndCP_, (byte) 0);
1041 * Duplicating a BuildTable. Not all data will be duplicated into their
// Copy constructor: deep-copies the trie, contractions (re-pointed at the new
// trie), expansion tables and bitsets; shares the collator and options.
1047 BuildTable(BuildTable table) {
1048 m_collator_ = table.m_collator_;
1049 m_mapping_ = new IntTrieBuilder(table.m_mapping_);
1050 m_expansions_ = new ArrayList<Integer>(table.m_expansions_);
1051 m_contractions_ = new ContractionTable(table.m_contractions_);
1052 m_contractions_.m_mapping_ = m_mapping_;
1053 m_options_ = table.m_options_;
1054 m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_);
1055 m_maxJamoExpansions_ = new MaxJamoExpansionTable(
1056 table.m_maxJamoExpansions_);
1057 m_unsafeCP_ = new byte[table.m_unsafeCP_.length];
1058 System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0,
1059 m_unsafeCP_.length);
1060 m_contrEndCP_ = new byte[table.m_contrEndCP_.length];
1061 System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0,
1062 m_contrEndCP_.length);
1065 // package private data members -------------------------------------
1067 RuleBasedCollator m_collator_;
1068 IntTrieBuilder m_mapping_;
1069 List<Integer> m_expansions_;
1070 ContractionTable m_contractions_;
1071 // UCATableHeader image;
1072 CollationRuleParser.OptionSet m_options_;
1073 MaxExpansionTable m_maxExpansions_;
1074 MaxJamoExpansionTable m_maxJamoExpansions_;
1076 byte m_contrEndCP_[];
1077 Map<Elements, Elements> m_prefixLookup_;
1078 CombinClassTable cmLookup = null;
// One tailoring element: the string (with optional prefix), its CE list, and
// size bookkeeping. hashCode/equals consider only the code points from
// m_cPointsOffset_ onward, so the class can be used as a HashMap key while
// sharing buffers. NOTE(review): elided excerpt; tokens preserved verbatim.
1081 private static class Elements {
1082 // package private data members -------------------------------------
1084 String m_prefixChars_;
1092 * Offset to the working string
1094 int m_cPointsOffset_;
1096 * These are collation elements - there could be more than one - in case
1102 * This is the value element maps in original table
1108 boolean m_variableTop_;
1111 // package private constructors -------------------------------------
1114 * Package private constructor
1117 m_sizePrim_ = new int[128];
1118 m_sizeSec_ = new int[128];
1119 m_sizeTer_ = new int[128];
1120 m_CEs_ = new int[256];
1125 * Package private constructor
// Copy constructor — copies references (arrays are shared, not cloned).
1127 Elements(Elements element) {
1128 m_prefixChars_ = element.m_prefixChars_;
1129 m_prefix_ = element.m_prefix_;
1130 m_uchars_ = element.m_uchars_;
1131 m_cPoints_ = element.m_cPoints_;
1132 m_cPointsOffset_ = element.m_cPointsOffset_;
1133 m_CEs_ = element.m_CEs_;
1134 m_CELength_ = element.m_CELength_;
1135 m_mapCE_ = element.m_mapCE_;
1136 m_sizePrim_ = element.m_sizePrim_;
1137 m_sizeSec_ = element.m_sizeSec_;
1138 m_sizeTer_ = element.m_sizeTer_;
1139 m_variableTop_ = element.m_variableTop_;
1140 m_caseBit_ = element.m_caseBit_;
1143 // package private methods -------------------------------------------
1146 * Initializing the elements
// Resets the element for reuse as a scratch buffer (see m_utilElement_).
1148 public void clear() {
1149 m_prefixChars_ = null;
1153 m_cPointsOffset_ = 0;
1156 Arrays.fill(m_sizePrim_, 0);
1157 Arrays.fill(m_sizeSec_, 0);
1158 Arrays.fill(m_sizeTer_, 0);
1159 m_variableTop_ = false;
1164 * Hashcode calculation for token
1166 * @return the hashcode
// Hash of the working substring only, consistent with equals below.
1168 public int hashCode() {
1169 String str = m_cPoints_.substring(m_cPointsOffset_);
1170 return str.hashCode();
1174 * Equals calculation
1176 * @param target Object to compare
1177 * @return true if target is the same as this object
// Two elements are equal when their offset substrings match region-for-region.
1179 public boolean equals(Object target) {
1180 if (target == this) {
1183 if (target instanceof Elements) {
1184 Elements t = (Elements) target;
1185 int size = m_cPoints_.length() - m_cPointsOffset_;
1186 if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) {
1187 return t.m_cPoints_.regionMatches(t.m_cPointsOffset_,
1188 m_cPoints_, m_cPointsOffset_, size);
// Private constants and reusable scratch objects of the builder.
// NOTE(review): listing is decimated -- javadoc delimiters and some
// continuation lines (e.g. the tail of STRENGTH_MASK_) are missing;
// all surviving lines are kept verbatim.
1195 // private data member ---------------------------------------------------
1198 * Maximum strength used in CE building
1200 private static final int CE_BASIC_STRENGTH_LIMIT_ = 3;
1202 * Maximum collation strength
1204 private static final int CE_STRENGTH_LIMIT_ = 16;
1206 * Strength mask array, used in inverse UCA
1208 private static final int STRENGTH_MASK_[] = { 0xFFFF0000, 0xFFFFFF00,
1211 * CE tag for not found
1213 private static final int CE_NOT_FOUND_ = 0xF0000000;
1215 * CE tag for not found
1217 private static final int CE_NOT_FOUND_TAG_ = 0;
1219 * This code point results in an expansion
1221 private static final int CE_EXPANSION_TAG_ = 1;
1223 * Start of a contraction
1225 private static final int CE_CONTRACTION_TAG_ = 2;
1227 * Thai character - do the reordering
1229 // private static final int CE_THAI_TAG_ = 3;
1231 * Charset processing, not yet implemented
1233 // private static final int CE_CHARSET_TAG_ = 4;
1235 * Lead surrogate that is tailored and doesn't start a contraction
1237 private static final int CE_SURROGATE_TAG_ = 5;
1241 // private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
1245 // private static final int CE_LEAD_SURROGATE_TAG_ = 7;
1249 // private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
1251 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
1253 // private static final int CE_CJK_IMPLICIT_TAG_ = 9;
1254 private static final int CE_IMPLICIT_TAG_ = 10;
1255 private static final int CE_SPEC_PROC_TAG_ = 11;
1257 * This is a three byte primary with starting secondaries and tertiaries. It
1258 * fits in a single 32 bit CE and is used instead of expansion to save space
1259 * without affecting the performance (hopefully)
1261 private static final int CE_LONG_PRIMARY_TAG_ = 12;
1263 * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each
1264 * latin 1 char + some power of two for hashing the rest of the chars. Size
1267 private static final int UNSAFECP_TABLE_SIZE_ = 1056;
1269 * Mask value down to "some power of two" -1. Number of bits, not num of
1272 private static final int UNSAFECP_TABLE_MASK_ = 0x1fff;
// Case-bit values stored in bits 6-7 of the tertiary weight.
1276 private static final int UPPER_CASE_ = 0x80;
1277 private static final int MIXED_CASE_ = 0x40;
1278 private static final int LOWER_CASE_ = 0x00;
1280 * Initial table size
1282 // private static final int INIT_TABLE_SIZE_ = 1028;
1284 * Header size, copied from ICU4C, to be changed when that value changes
1286 // private static final int HEADER_SIZE_ = 0xC4;
1288 * Contraction table new element indicator
1290 private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF;
1292 * Parser for the rules
1294 private CollationRuleParser m_parser_;
1296 * Utility UCA collation element iterator
1298 private CollationElementIterator m_utilColEIter_;
1300 * Utility data members
// Pre-allocated scratch objects reused across build passes to avoid
// per-token allocation; one CEGenerator/WeightRange slot per strength.
1302 private CEGenerator m_utilGens_[] = { new CEGenerator(), new CEGenerator(),
1303 new CEGenerator() };
1304 private int m_utilCEBuffer_[] = new int[CE_BASIC_STRENGTH_LIMIT_];
1305 private int m_utilIntBuffer_[] = new int[CE_STRENGTH_LIMIT_];
1306 private Elements m_utilElement_ = new Elements();
1307 private Elements m_utilElement2_ = new Elements();
1308 private CollationRuleParser.Token m_utilToken_ = new CollationRuleParser.Token();
1309 private int m_utilCountBuffer_[] = new int[6];
1310 private long m_utilLongBuffer_[] = new long[5];
1311 private WeightRange m_utilLowerWeightRange_[] = { new WeightRange(),
1312 new WeightRange(), new WeightRange(), new WeightRange(),
1313 new WeightRange() };
1314 private WeightRange m_utilUpperWeightRange_[] = { new WeightRange(),
1315 new WeightRange(), new WeightRange(), new WeightRange(),
1316 new WeightRange() };
1317 private WeightRange m_utilWeightRange_ = new WeightRange();
1318 private final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
1319 private CanonicalIterator m_utilCanIter_ = new CanonicalIterator("");
1320 private StringBuilder m_utilStringBuffer_ = new StringBuilder("");
1321 // Flag indicating a combining marks table is required or not.
1322 private static boolean buildCMTabFlag = false;
// Computes, for each token in the list, how many CEs must be inserted at
// its strength (m_toInsert_), then primes m_utilCEBuffer_ with CEs/weight
// generators for the first token, based on the reset point's gaps in the
// inverse UCA. NOTE(review): this listing is decimated -- closing braces
// and several statements are missing; surviving lines are kept verbatim.
1324 // private methods -------------------------------------------------------
1328 * parsed rule tokens
1329 * @exception Exception
1330 *                thrown when internal error occurs
1332 private void initBuffers(CollationRuleParser.TokenListHeader listheader)
// Walk the token list backwards, counting consecutive tokens of equal
// strength so each knows how many weights to reserve.
1334 CollationRuleParser.Token token = listheader.m_last_;
1335 Arrays.fill(m_utilIntBuffer_, 0, CE_STRENGTH_LIMIT_, 0);
1337 token.m_toInsert_ = 1;
1338 m_utilIntBuffer_[token.m_strength_] = 1;
1339 while (token.m_previous_ != null) {
1340 if (token.m_previous_.m_strength_ < token.m_strength_) {
1342 m_utilIntBuffer_[token.m_strength_] = 0;
1343 m_utilIntBuffer_[token.m_previous_.m_strength_]++;
1344 } else if (token.m_previous_.m_strength_ > token.m_strength_) {
1346 m_utilIntBuffer_[token.m_previous_.m_strength_] = 1;
1348 m_utilIntBuffer_[token.m_strength_]++;
1350 token = token.m_previous_;
1351 token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
1354 token.m_toInsert_ = m_utilIntBuffer_[token.m_strength_];
// Ask the inverse UCA for the weight gaps around the reset CE.
1355 INVERSE_UCA_.getInverseGapPositions(listheader);
1357 token = listheader.m_first_;
1358 int fstrength = Collator.IDENTICAL;
1359 int initstrength = Collator.IDENTICAL;
1361 m_utilCEBuffer_[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_,
1362 listheader.m_baseContCE_, Collator.PRIMARY);
1363 m_utilCEBuffer_[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_,
1364 listheader.m_baseContCE_, Collator.SECONDARY);
1365 m_utilCEBuffer_[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_,
1366 listheader.m_baseContCE_, Collator.TERTIARY);
1367 while (token != null) {
1368 fstrength = token.m_strength_;
1369 if (fstrength < initstrength) {
1370 initstrength = fstrength;
1371 if (listheader.m_pos_[fstrength] == -1) {
1372 while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) {
1375 if (listheader.m_pos_[fstrength] == -1) {
1376 throw new Exception("Internal program error");
// Seed a CE generator for the strength we are starting from; weaker
// strengths keep the gap's low boundary, stronger ones get simple
// common-weight generators.
1379 if (initstrength == Collator.TERTIARY) {
1380 // starting with tertiary
1381 m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1382 m_utilCEBuffer_[Collator.SECONDARY] = listheader.m_gapsLo_[fstrength * 3 + 1];
1383 m_utilCEBuffer_[Collator.TERTIARY] = getCEGenerator(
1384 m_utilGens_[Collator.TERTIARY],
1385 listheader.m_gapsLo_, listheader.m_gapsHi_, token,
1387 } else if (initstrength == Collator.SECONDARY) {
1389 m_utilCEBuffer_[Collator.PRIMARY] = listheader.m_gapsLo_[fstrength * 3];
1390 m_utilCEBuffer_[Collator.SECONDARY] = getCEGenerator(
1391 m_utilGens_[Collator.SECONDARY],
1392 listheader.m_gapsLo_, listheader.m_gapsHi_, token,
1394 m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1395 m_utilGens_[Collator.TERTIARY], token,
1399 m_utilCEBuffer_[Collator.PRIMARY] = getCEGenerator(
1400 m_utilGens_[Collator.PRIMARY],
1401 listheader.m_gapsLo_, listheader.m_gapsHi_, token,
1403 m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1404 m_utilGens_[Collator.SECONDARY], token,
1405 Collator.SECONDARY);
1406 m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1407 m_utilGens_[Collator.TERTIARY], token,
// Subsequent tokens: advance the generator at the token's strength and
// reset the weaker-strength generators to fresh simple generators.
1411 if (token.m_strength_ == Collator.TERTIARY) {
1412 m_utilCEBuffer_[Collator.TERTIARY] = getNextGenerated(m_utilGens_[Collator.TERTIARY]);
1413 } else if (token.m_strength_ == Collator.SECONDARY) {
1414 m_utilCEBuffer_[Collator.SECONDARY] = getNextGenerated(m_utilGens_[Collator.SECONDARY]);
1415 m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1416 m_utilGens_[Collator.TERTIARY], token,
1418 } else if (token.m_strength_ == Collator.PRIMARY) {
1419 m_utilCEBuffer_[Collator.PRIMARY] = getNextGenerated(m_utilGens_[Collator.PRIMARY]);
1420 m_utilCEBuffer_[Collator.SECONDARY] = getSimpleCEGenerator(
1421 m_utilGens_[Collator.SECONDARY], token,
1422 Collator.SECONDARY);
1423 m_utilCEBuffer_[Collator.TERTIARY] = getSimpleCEGenerator(
1424 m_utilGens_[Collator.TERTIARY], token,
// Pack the three strength parts into the token's CE array.
1428 doCE(m_utilCEBuffer_, token);
1429 token = token.m_next_;
1434 * Get the next generated ce
1438 * @return next generated ce
1440 private int getNextGenerated(CEGenerator g) {
1441 g.m_current_ = nextWeight(g);
1442 return g.m_current_;
// Sets up a "simple" CE generator for secondary/tertiary weights starting
// at the common weight. NOTE(review): this listing is decimated -- the
// assignments to `high`, some braces and the `g.m_ranges_` argument line
// are missing; surviving lines are kept verbatim.
1451 * @return ce generator
1452 * @exception Exception
1453 *                thrown when internal error occurs
1455 private int getSimpleCEGenerator(CEGenerator g,
1456 CollationRuleParser.Token token, int strength) throws Exception {
1457 int high, low, count = 1;
// Tertiary weights only use 6 bits (0x3F); others a full byte.
1458 int maxbyte = (strength == Collator.TERTIARY) ? 0x3F : 0xFF;
1460 if (strength == Collator.SECONDARY) {
1461 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1463 count = 0xFF - RuleBasedCollator.COMMON_TOP_2_;
1465 low = RuleBasedCollator.BYTE_COMMON_ << 24; // 0x05000000;
1467 count = 0x40 - RuleBasedCollator.BYTE_COMMON_;
// If the next token has the same strength, reserve room for its inserts.
1470 if (token.m_next_ != null && token.m_next_.m_strength_ == strength) {
1471 count = token.m_next_.m_toInsert_;
1474 g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1476 g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24;
1478 if (g.m_rangesLength_ == 0) {
1479 throw new Exception("Internal program error");
1481 return g.m_current_;
1485 * Combines 2 ce into one with respect to the argument strength
1493 * @return combined ce
1495 private static int mergeCE(int ce1, int ce2, int strength) {
1496 int mask = RuleBasedCollator.CE_TERTIARY_MASK_;
1497 if (strength == Collator.SECONDARY) {
1498 mask = RuleBasedCollator.CE_SECONDARY_MASK_;
1499 } else if (strength == Collator.PRIMARY) {
1500 mask = RuleBasedCollator.CE_PRIMARY_MASK_;
1505 case Collator.PRIMARY:
1506 return ce1 | ce2 >>> 16;
1507 case Collator.SECONDARY:
1508 return ce1 << 16 | ce2 << 8;
1510 return ce1 << 24 | ce2 << 16;
// Sets up a CE generator that allocates weights inside the inverse-UCA
// gap [low, high) at the token's strength; clamps the range away from
// reserved common weights and the forbidden 0x02 lead byte.
// NOTE(review): this listing is decimated -- several statements, braces
// and `maxbyte` setup are missing; surviving lines are kept verbatim.
1524 * @exception Exception
1525 *                thrown when internal error occurs
1527 private int getCEGenerator(CEGenerator g, int lows[], int highs[],
1528 CollationRuleParser.Token token, int fstrength) throws Exception {
1529 int strength = token.m_strength_;
1530 int low = lows[fstrength * 3 + strength];
1531 int high = highs[fstrength * 3 + strength];
1533 if (strength == Collator.TERTIARY) {
1535 } else if (strength == Collator.PRIMARY) {
1541 int count = token.m_toInsert_;
// Degenerate gap (low >= high, unsigned) at a non-primary strength:
// fall back to a boundary derived from the stronger level's gap.
1543 if (Utility.compareUnsigned(low, high) >= 0
1544 && strength > Collator.PRIMARY) {
1548 if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) {
1549 if (strength == Collator.SECONDARY) {
1550 if (low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) {
1551 // Override if low range is less than
1552 // UCOL_COMMON_TOP2.
1553 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1557 if (low < RuleBasedCollator.COMMON_BOTTOM_3 << 24) {
1558 // Override if low range is less than
1559 // UCOL_COMMON_BOT3.
1560 low = RuleBasedCollator.COMMON_BOTTOM_3 << 24;
1567 throw new Exception("Internal program error");
1571 if(0 <= low && low < 0x02000000) { // unsigned comparison < 0x02000000
1572 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
1573 // See http://site.icu-project.org/design/collation/bytes
// Secondary strength: push the range out of the reserved common-weight
// band [COMMON_BOTTOM_2, COMMON_TOP_2).
1577 if (strength == Collator.SECONDARY) { // similar as simple
1578 if (Utility.compareUnsigned(low,
1579 RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0
1580 && Utility.compareUnsigned(low,
1581 RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1582 low = RuleBasedCollator.COMMON_TOP_2_ << 24;
1584 if (Utility.compareUnsigned(high,
1585 RuleBasedCollator.COMMON_BOTTOM_2_ << 24) > 0
1586 && Utility.compareUnsigned(high,
1587 RuleBasedCollator.COMMON_TOP_2_ << 24) < 0) {
1588 high = RuleBasedCollator.COMMON_TOP_2_ << 24;
1590 if (Utility.compareUnsigned(low,
1591 RuleBasedCollator.COMMON_BOTTOM_2_ << 24) < 0) {
1592 g.m_rangesLength_ = allocateWeights(
1593 RuleBasedCollator.BYTE_UNSHIFTED_MIN_ << 24, high,
1594 count, maxbyte, g.m_ranges_);
1595 g.m_current_ = nextWeight(g);
1596 // g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_ << 24;
1597 return g.m_current_;
1601 g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte,
1603 if (g.m_rangesLength_ == 0) {
1604 throw new Exception("Internal program error");
1606 g.m_current_ = nextWeight(g);
1607 return g.m_current_;
// Packs the three per-strength weight parts (ceparts) into the token's
// CE array: primary contributes 16 bits per CE, secondary 8 bits,
// tertiary 6 bits; overflow CEs are continuations. Then sets the case
// bits of the first CE. NOTE(review): listing is decimated -- `cei`
// declaration, braces and several statements are missing; surviving
// lines are kept verbatim.
1612 * list of collation elements parts
1615 * @exception Exception
1616 *                thrown when forming case bits for expansions fails
1618 private void doCE(int ceparts[], CollationRuleParser.Token token)
1620 // this one makes the table and stuff
1621 // int noofbytes[] = new int[3];
1622 for (int i = 0; i < 3; i++) {
1623 // noofbytes[i] = countBytes(ceparts[i]);
1624 m_utilIntBuffer_[i] = countBytes(ceparts[i]);
1627 // Here we have to pack CEs from parts
// Loop while any strength still has unconsumed bytes; primary uses two
// bytes per CE (hence cei << 1), secondary/tertiary one byte each.
1631 while ((cei << 1) < m_utilIntBuffer_[0] || cei < m_utilIntBuffer_[1]
1632 || cei < m_utilIntBuffer_[2]) {
1634 value = RuleBasedCollator.CE_CONTINUATION_MARKER_;
1639 if ((cei << 1) < m_utilIntBuffer_[0]) {
1640 value |= ((ceparts[0] >> (32 - ((cei + 1) << 4))) & 0xFFFF) << 16;
1642 if (cei < m_utilIntBuffer_[1]) {
1643 value |= ((ceparts[1] >> (32 - ((cei + 1) << 3))) & 0xFF) << 8;
1646 if (cei < m_utilIntBuffer_[2]) {
1647 value |= ((ceparts[2] >> (32 - ((cei + 1) << 3))) & 0x3F);
1649 token.m_CE_[cei] = value;
1652 if (cei == 0) { // totally ignorable
1653 token.m_CELength_ = 1;
1655 } else { // there is at least something
1656 token.m_CELength_ = cei;
1659 // Case bits handling for expansion
1660 if (token.m_CE_[0] != 0) { // case bits should be set only for
1662 token.m_CE_[0] &= 0xFFFFFF3F; // Clean the case bits field
1663 int cSize = (token.m_source_ & 0xFF000000) >>> 24;
1664 int startoftokenrule = token.m_source_ & 0x00FFFFFF;
1668 String tokenstr = token.m_rules_.substring(startoftokenrule,
1669 startoftokenrule + cSize);
1670 token.m_CE_[0] |= getCaseBits(tokenstr);
1672 // Copy it from the UCA
1673 int caseCE = getFirstCE(token.m_rules_.charAt(startoftokenrule));
1674 token.m_CE_[0] |= (caseCE & 0xC0);
1680 * Count the number of non-zero bytes used in the ce
1683 * @return number of non-zero bytes used in ce
1685 private static final int countBytes(int ce) {
1686 int mask = 0xFFFFFFFF;
1689 if ((ce & mask) != 0) {
// Walks the parsed token list and builds one Elements record per token
// (resolving tailored/UCA expansions, slicing off prefixes, detecting
// Jamo and combining marks), then hands each to addAnElement().
// NOTE(review): this listing is decimated -- braces, else-branches and
// some statements are missing; surviving lines are kept verbatim.
1698 * We are ready to create collation elements
1701 *            build table to insert
1703 *            rule token list header
1705 private void createElements(BuildTable t,
1706 CollationRuleParser.TokenListHeader lh) {
1707 CollationRuleParser.Token tok = lh.m_first_;
1708 m_utilElement_.clear();
1709 while (tok != null) {
1710 // first, check if there are any expansions
1711 // if there are expansions, we need to do a little bit more
1712 // processing since parts of expansion can be tailored, while
1714 if (tok.m_expansion_ != 0) {
// Expansion field packs length in the top byte, offset in the low 24.
1715 int len = tok.m_expansion_ >>> 24;
1716 int currentSequenceLen = len;
1717 int expOffset = tok.m_expansion_ & 0x00FFFFFF;
1718 m_utilToken_.m_source_ = currentSequenceLen | expOffset;
1719 m_utilToken_.m_rules_ = m_parser_.m_source_;
// Greedily match the longest tailored subsequence of the expansion.
1722 currentSequenceLen = len;
1723 while (currentSequenceLen > 0) {
1724 m_utilToken_.m_source_ = (currentSequenceLen << 24)
1726 CollationRuleParser.Token expt = m_parser_.m_hashTable_.get(m_utilToken_);
1728 && expt.m_strength_ != CollationRuleParser.TOKEN_RESET_) {
1729 // expansion is tailored
1730 int noOfCEsToCopy = expt.m_CELength_;
1731 for (int j = 0; j < noOfCEsToCopy; j++) {
1732 tok.m_expCE_[tok.m_expCELength_ + j] = expt.m_CE_[j];
1734 tok.m_expCELength_ += noOfCEsToCopy;
1735 // never try to add codepoints and CEs.
1736 // For some odd reason, it won't work.
1737 expOffset += currentSequenceLen; // noOfCEsToCopy;
1738 len -= currentSequenceLen; // noOfCEsToCopy;
1741 currentSequenceLen--;
1744 if (currentSequenceLen == 0) {
1745 // couldn't find any tailored subsequence, will have to
1746 // get one from UCA. first, get the UChars from the
1747 // rules then pick CEs out until there is no more and
1748 // stuff them into expansion
1749 m_utilColEIter_.setText(m_parser_.m_source_.substring(
1750 expOffset, expOffset + 1));
1752 int order = m_utilColEIter_.next();
1753 if (order == CollationElementIterator.NULLORDER) {
1756 tok.m_expCE_[tok.m_expCELength_++] = order;
1763 tok.m_expCELength_ = 0;
1766 // set the ucaelement with obtained values
1767 m_utilElement_.m_CELength_ = tok.m_CELength_ + tok.m_expCELength_;
// Concatenate the token's own CEs and its expansion CEs.
1770 System.arraycopy(tok.m_CE_, 0, m_utilElement_.m_CEs_, 0,
1772 System.arraycopy(tok.m_expCE_, 0, m_utilElement_.m_CEs_,
1773 tok.m_CELength_, tok.m_expCELength_);
1776 // We kept prefix and source kind of together, as it is a kind of a
1778 // However, now we have to slice the prefix off the main thing -
1779 m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
1780 m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
1781 if (tok.m_prefix_ != 0) {
1782 // we will just copy the prefix here, and adjust accordingly in
1783 // the addPrefix function in ucol_elm. The reason is that we
1784 // need to add both composed AND decomposed elements to the
1786 int size = tok.m_prefix_ >> 24;
1787 int offset = tok.m_prefix_ & 0x00FFFFFF;
1788 m_utilElement_.m_prefixChars_ = m_parser_.m_source_.substring(
1789 offset, offset + size);
1790 size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
1791 offset = (tok.m_source_ & 0x00FFFFFF) + (tok.m_prefix_ >> 24);
1792 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
1793 offset, offset + size);
1795 m_utilElement_.m_prefixChars_ = null;
1796 int offset = tok.m_source_ & 0x00FFFFFF;
1797 int size = tok.m_source_ >>> 24;
1798 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
1799 offset, offset + size);
1801 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
1803 boolean containCombinMarks = false;
1804 for (int i = 0; i < m_utilElement_.m_cPoints_.length()
1805 - m_utilElement_.m_cPointsOffset_; i++) {
1806 if (isJamo(m_utilElement_.m_cPoints_.charAt(i))) {
1807 t.m_collator_.m_isJamoSpecial_ = true;
1810 if (!buildCMTabFlag) {
1811 // check combining class
1812 int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(i)); // TODO: review for handling supplementary characters
1813 if ((fcd & 0xff) == 0) {
1814 // reset flag when current char is not combining mark.
1815 containCombinMarks = false;
1817 containCombinMarks = true;
// First combining mark seen anywhere: the collator will need a
// combining-marks table.
1822 if (!buildCMTabFlag && containCombinMarks) {
1823 buildCMTabFlag = true;
1827 * // Case bits handling m_utilElement_.m_CEs_[0] &= 0xFFFFFF3F; //
1828 * Clean the case bits field if (m_utilElement_.m_cPoints_.length()
1829 * - m_utilElement_.m_cPointsOffset_ > 1) { // Do it manually
1830 * m_utilElement_.m_CEs_[0] |=
1831 * getCaseBits(m_utilElement_.m_cPoints_); } else { // Copy it from
1832 * the UCA int caseCE =
1833 * getFirstCE(m_utilElement_.m_cPoints_.charAt(0));
1834 * m_utilElement_.m_CEs_[0] |= (caseCE & 0xC0); }
1837 addAnElement(t, m_utilElement_);
// Derives the case-bit field (UPPER_CASE_/MIXED_CASE_/LOWER_CASE_) for a
// string by decomposing it and inspecting each character's UCA CE case
// bits, lowercase property and small/large Kana form. NOTE(review):
// listing is decimated -- the uCount/lCount declarations, increments and
// final returns are missing; surviving lines are kept verbatim.
1843 * Testing if the string argument has case
1847 * @return the case for this char array
1848 * @exception Exception
1849 *                thrown when internal program error occurs
1851 private final int getCaseBits(String src) throws Exception {
// Work on the canonical decomposition of the input.
1854 src = Normalizer.decompose(src, true);
1855 m_utilColEIter_.setText(src);
1856 for (int i = 0; i < src.length(); i++) {
1857 m_utilColEIter_.setText(src.substring(i, i + 1));
1858 int order = m_utilColEIter_.next();
1859 if (RuleBasedCollator.isContinuation(order)) {
1860 throw new Exception("Internal program error");
1862 if ((order & RuleBasedCollator.CE_CASE_BIT_MASK_) == UPPER_CASE_) {
1865 char ch = src.charAt(i);
1866 if (UCharacter.isLowerCase(ch)) {
// A small Kana that has a distinct large form counts as "lowercase".
1869 if (toSmallKana(ch) == ch && toLargeKana(ch) != ch) {
1876 if (uCount != 0 && lCount != 0) {
1878 } else if (uCount != 0) {
// Small <-> large Kana conversion helpers used by getCaseBits().
// NOTE(review): the switch bodies (the per-codepoint mappings) are
// missing from this decimated listing; surviving lines are kept verbatim.
1886 * Converts a char to the uppercase Kana
1889 *            character to convert
1890 * @return the converted Kana character
1892 private static final char toLargeKana(char ch) {
1893 if (0x3042 < ch && ch < 0x30ef) { // Kana range
1894 switch (ch - 0x3000) {
1927 * Converts a char to the lowercase Kana
1930 *            character to convert
1931 * @return the converted Kana character
1933 private static final char toSmallKana(char ch) {
1934 if (0x3042 < ch && ch < 0x30ef) { // Kana range
1935 switch (ch - 0x3000) {
1968 * This should be connected to special Jamo handling.
1970 private int getFirstCE(char ch) {
1971 m_utilColEIter_.setText(UCharacter.toString(ch));
1972 return m_utilColEIter_.next();
// Central per-element insertion: computes element.m_mapCE_ (single CE,
// long-primary special, or expansion special), records max-expansion and
// Jamo data, handles digits for numeric collation, prefixes (reverse
// contractions), and canonical closure of contractions before calling
// finalizeAddition(). NOTE(review): this listing is decimated -- braces,
// else-branches and some statements are missing; surviving lines are
// kept verbatim.
1976 * This adds a read element, while testing for existence
1983 private int addAnElement(BuildTable t, Elements element) {
1984 List<Integer> expansions = t.m_expansions_;
1985 element.m_mapCE_ = 0;
// Single CE: map it directly, no expansion needed.
1987 if (element.m_CELength_ == 1) {
1988 element.m_mapCE_ = element.m_CEs_[0];
1991 // unfortunately, it looks like we have to look for a long primary
1992 // here since in canonical closure we are going to hit some long
1993 // primaries from the first phase, and they will come back as
1994 // continuations/expansions destroying the effect of the previous
1995 // optimization. A long primary is a three byte primary with
1996 // starting secondaries and tertiaries. It can appear in long runs
1997 // of only primary differences (like east Asian tailorings) also,
1998 // it should not be an expansion, as expansions would break with
2000 if (element.m_CELength_ == 2 // a two CE expansion
2001 && RuleBasedCollator.isContinuation(element.m_CEs_[1])
2002 && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that
2008 && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
2009 // a common secondary
2010 && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_) // and
// Pack the three primary bytes into a single long-primary special CE.
2015 element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
2016 // a long primary special
2017 | (CE_LONG_PRIMARY_TAG_ << 24)
2018 // first and second byte of primary
2019 | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
2020 // third byte of primary
2021 | ((element.m_CEs_[1] >> 24) & 0xFF);
2023 // omitting expansion offset in builder
2024 // (HEADER_SIZE_ >> 2)
2025 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
2026 | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
2027 | (addExpansion(expansions, element.m_CEs_[0]) << 4)
2030 for (int i = 1; i < element.m_CELength_; i++) {
2031 addExpansion(expansions, element.m_CEs_[i]);
// Lengths up to 0xF are stored inline; longer ones are 0-terminated.
2033 if (element.m_CELength_ <= 0xF) {
2034 expansion |= element.m_CELength_;
2036 addExpansion(expansions, 0);
2038 element.m_mapCE_ = expansion;
2039 setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
2040 (byte) element.m_CELength_, t.m_maxExpansions_);
2041 if (isJamo(element.m_cPoints_.charAt(0))) {
2042 t.m_collator_.m_isJamoSpecial_ = true;
2043 setMaxJamoExpansion(element.m_cPoints_.charAt(0),
2044 element.m_CEs_[element.m_CELength_ - 1],
2045 (byte) element.m_CELength_, t.m_maxJamoExpansions_);
2050 // We treat digits differently - they are "uber special" and should be
2051 // processed differently if numeric collation is on.
2053 if ((element.m_uchars_.length() == 2)
2054 && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
2055 uniChar = UCharacterProperty.getRawSupplementary(element.m_uchars_
2056 .charAt(0), element.m_uchars_.charAt(1));
2057 } else if (element.m_uchars_.length() == 1) {
2058 uniChar = element.m_uchars_.charAt(0);
2061 // Here, we either have one normal CE OR mapCE is set. Therefore, we
2062 // stuff only one element to the expansion buffer. When we encounter a
2063 // digit and we don't do numeric collation, we will just pick the CE
2064 // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
2065 // && ucol_prv_getSpecialPrevCE). If we picked a special, further
2066 // processing will occur. If it's a simple CE, we'll return due
2067 // to how the loop is constructed.
2068 if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
2069 // prepare the element
2070 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
2071 | (CollationElementIterator.CE_DIGIT_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
2073 if (element.m_mapCE_ != 0) {
2074 // if there is an expansion, we'll pick it here
2075 expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
2077 expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
2079 element.m_mapCE_ = expansion;
2082 // here we want to add the prefix structure.
2083 // I will try to process it as a reverse contraction, if possible.
2084 // prefix buffer is already reversed.
2086 if (element.m_prefixChars_ != null
2087 && element.m_prefixChars_.length() - element.m_prefix_ > 0) {
2088 // We keep the seen prefix starter elements in a hashtable we need
2089 // it to be able to distinguish between the simple codepoints and
2090 // prefix starters. Also, we need to use it for canonical closure.
// Snapshot the element into m_utilElement2_ with a composed prefix so
// both composed and decomposed prefix forms get registered.
2091 m_utilElement2_.m_caseBit_ = element.m_caseBit_;
2092 m_utilElement2_.m_CELength_ = element.m_CELength_;
2093 m_utilElement2_.m_CEs_ = element.m_CEs_;
2094 m_utilElement2_.m_mapCE_ = element.m_mapCE_;
2095 // m_utilElement2_.m_prefixChars_ = element.m_prefixChars_;
2096 m_utilElement2_.m_sizePrim_ = element.m_sizePrim_;
2097 m_utilElement2_.m_sizeSec_ = element.m_sizeSec_;
2098 m_utilElement2_.m_sizeTer_ = element.m_sizeTer_;
2099 m_utilElement2_.m_variableTop_ = element.m_variableTop_;
2100 m_utilElement2_.m_prefix_ = element.m_prefix_;
2101 m_utilElement2_.m_prefixChars_ = Normalizer.compose(
2102 element.m_prefixChars_, false);
2103 m_utilElement2_.m_uchars_ = element.m_uchars_;
2104 m_utilElement2_.m_cPoints_ = element.m_cPoints_;
2105 m_utilElement2_.m_cPointsOffset_ = 0;
2107 if (t.m_prefixLookup_ != null) {
2108 Elements uCE = t.m_prefixLookup_.get(element);
2110 // there is already a set of code points here
2111 element.m_mapCE_ = addPrefix(t, uCE.m_mapCE_, element);
2112 } else { // no code points, so this spot is clean
2113 element.m_mapCE_ = addPrefix(t, CE_NOT_FOUND_, element);
2114 uCE = new Elements(element);
2115 uCE.m_cPoints_ = uCE.m_uchars_;
2116 t.m_prefixLookup_.put(uCE, uCE);
// If composing changed the prefix, add the composed variant too.
2118 if (m_utilElement2_.m_prefixChars_.length() != element.m_prefixChars_
2121 || !m_utilElement2_.m_prefixChars_.regionMatches(0,
2122 element.m_prefixChars_, element.m_prefix_,
2123 m_utilElement2_.m_prefixChars_.length())) {
2125 m_utilElement2_.m_mapCE_ = addPrefix(t, element.m_mapCE_,
2131 // We need to use the canonical iterator here
2132 // the way we do it is to generate the canonically equivalent strings
2133 // for the contraction and then add the sequences that pass FCD check
2134 if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1
2135 && !(element.m_cPoints_.length() - element.m_cPointsOffset_ == 2
2136 && UTF16.isLeadSurrogate(element.m_cPoints_.charAt(0)) && UTF16
2137 .isTrailSurrogate(element.m_cPoints_.charAt(1)))) {
2138 // this is a contraction, we should check whether a composed form
2139 // should also be included
2140 m_utilCanIter_.setSource(element.m_cPoints_);
2141 String source = m_utilCanIter_.next();
2142 while (source != null && source.length() > 0) {
2143 if (Normalizer.quickCheck(source, Normalizer.FCD, 0) != Normalizer.NO) {
2144 element.m_uchars_ = source;
2145 element.m_cPoints_ = element.m_uchars_;
2146 finalizeAddition(t, element);
2148 source = m_utilCanIter_.next();
2151 return element.m_mapCE_;
2153 return finalizeAddition(t, element);
2158 * Adds an expansion ce to the expansion vector
2164 * @return the current position of the new element
2166 private static final int addExpansion(List<Integer> expansions, int value) {
2167 expansions.add(Integer.valueOf(value));
2168 return expansions.size() - 1;
// Records the maximum expansion length for a given ending CE: binary-
// searches the sorted (unsigned) end-CE list, enlarging the stored size
// or inserting a new entry. NOTE(review): listing is decimated -- the
// `start`/`result` declarations, branch bodies and braces are missing;
// surviving lines are kept verbatim.
2172 * Looks for the maximum length of all expansion sequences ending with the
2173 * same collation element. The size required for maxexpansion and maxsize is
2174 * returned if the arrays are too small.
2176 * @param endexpansion
2177 *            the last expansion collation element to be added
2178 * @param expansionsize
2179 *            size of the expansion
2180 * @param maxexpansion
2181 *            data structure to store the maximum expansion data.
2182 * @returns size of the maxexpansion and maxsize used.
2184 private static int setMaxExpansion(int endexpansion, byte expansionsize,
2185 MaxExpansionTable maxexpansion) {
2187 int limit = maxexpansion.m_endExpansionCE_.size();
// CEs are compared as unsigned 32-bit values; widen through a long.
2188 long unsigned = (long) endexpansion;
2189 unsigned &= 0xFFFFFFFFl;
2191 // using binary search to determine if last expansion element is
2192 // already in the array
2195 while (start < limit - 1) {
2196 int mid = (start + limit) >> 1;
2197 long unsignedce = (maxexpansion.m_endExpansionCE_
2198 .get(mid)).intValue();
2199 unsignedce &= 0xFFFFFFFFl;
2200 if (unsigned < unsignedce) {
2207 if ((maxexpansion.m_endExpansionCE_.get(start)).intValue() == endexpansion) {
2212 // found the ce in expansion, we'll just modify the size if it
2214 Object currentsize = maxexpansion.m_expansionCESize_.get(result);
2215 if (((Byte) currentsize).byteValue() < expansionsize) {
2216 maxexpansion.m_expansionCESize_.set(result, Byte.valueOf(
2220 // we'll need to squeeze the value into the array. initial
2221 // implementation. shifting the subarray down by 1
2222 maxexpansion.m_endExpansionCE_.add(start + 1, Integer.valueOf(endexpansion));
2223 maxexpansion.m_expansionCESize_.add(start + 1, Byte.valueOf(expansionsize));
2225 return maxexpansion.m_endExpansionCE_.size();
// Jamo variant of setMaxExpansion: L Jamo (0x1100-0x1112) only update
// m_maxLSize_; V (0x1161-0x1175) and T (0x11A8-0x11C2) update their max
// sizes and fall through to record the ending CE with an isV flag.
// NOTE(review): listing is decimated -- the `isV` declaration/updates,
// loop over existing entries and braces are missing; surviving lines
// are kept verbatim.
2229 * Sets the maximum length of all jamo expansion sequences ending with the
2230 * same collation element. The size required for maxexpansion and maxsize is
2231 * returned if the arrays are too small.
2234 *            the jamo codepoint
2235 * @param endexpansion
2236 *            the last expansion collation element to be added
2237 * @param expansionsize
2238 *            size of the expansion
2239 * @param maxexpansion
2240 *            data structure to store the maximum expansion data.
2241 * @returns size of the maxexpansion and maxsize used.
2243 private static int setMaxJamoExpansion(char ch, int endexpansion,
2244 byte expansionsize, MaxJamoExpansionTable maxexpansion) {
2246 if (ch >= 0x1100 && ch <= 0x1112) {
2247 // determines L for Jamo, doesn't need to store this since it is
2248 // never at the end of a expansion
2249 if (maxexpansion.m_maxLSize_ < expansionsize) {
2250 maxexpansion.m_maxLSize_ = expansionsize;
2252 return maxexpansion.m_endExpansionCE_.size();
2255 if (ch >= 0x1161 && ch <= 0x1175) {
2256 // determines V for Jamo
2257 if (maxexpansion.m_maxVSize_ < expansionsize) {
2258 maxexpansion.m_maxVSize_ = expansionsize;
2262 if (ch >= 0x11A8 && ch <= 0x11C2) {
2264 // determines T for Jamo
2265 if (maxexpansion.m_maxTSize_ < expansionsize) {
2266 maxexpansion.m_maxTSize_ = expansionsize;
2270 int pos = maxexpansion.m_endExpansionCE_.size();
2273 if ((maxexpansion.m_endExpansionCE_.get(pos)).intValue() == endexpansion) {
2274 return maxexpansion.m_endExpansionCE_.size();
2277 maxexpansion.m_endExpansionCE_.add(Integer.valueOf(endexpansion));
2278 maxexpansion.m_isV_.add(isV ? Boolean.TRUE : Boolean.FALSE);
2280 return maxexpansion.m_endExpansionCE_.size();
2284 * Adds a prefix to the table
2287 * build table to update
2289 * collation element to add
2291 * rule element to add
2292 * @return modified ce
2294 private int addPrefix(BuildTable t, int CE, Elements element) {
2295 // currently the longest prefix we're supporting in Japanese is two
2296 // characters long. Although this table could quite easily mimic
2297 // complete contraction stuff there is no good reason to make a general
2298 // solution, as it would require some error prone messing.
2299 ContractionTable contractions = t.m_contractions_;
2300 String oldCP = element.m_cPoints_;
2301 int oldCPOffset = element.m_cPointsOffset_;
2303 contractions.m_currentTag_ = CE_SPEC_PROC_TAG_;
2304 // here, we will normalize & add prefix to the table.
2305 int size = element.m_prefixChars_.length() - element.m_prefix_;
2306 for (int j = 1; j < size; j++) {
2307 // First add NFD prefix chars to unsafe CP hash table
2308 // Unless it is a trail surrogate, which is handled algorithmically
2309 // and shouldn't take up space in the table.
2310 char ch = element.m_prefixChars_.charAt(j + element.m_prefix_);
2311 if (!UTF16.isTrailSurrogate(ch)) {
2312 unsafeCPSet(t.m_unsafeCP_, ch);
2316 // StringBuffer reversed = new StringBuffer();
2317 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
2318 for (int j = 0; j < size; j++) {
2319 // prefixes are going to be looked up backwards
2320 // therefore, we will promptly reverse the prefix buffer...
2321 int offset = element.m_prefixChars_.length() - j - 1;
2322 m_utilStringBuffer_.append(element.m_prefixChars_.charAt(offset));
2324 element.m_prefixChars_ = m_utilStringBuffer_.toString();
2325 element.m_prefix_ = 0;
2327 // the first codepoint is also unsafe, as it forms a 'contraction' with
2329 if (!UTF16.isTrailSurrogate(element.m_cPoints_.charAt(0))) {
2330 unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_.charAt(0));
2333 element.m_cPoints_ = element.m_prefixChars_;
2334 element.m_cPointsOffset_ = element.m_prefix_;
2336 // Add the last char of the contraction to the contraction-end hash
2337 // table. unless it is a trail surrogate, which is handled
2338 // algorithmically and shouldn't be in the table
2339 if (!UTF16.isTrailSurrogate(element.m_cPoints_
2340 .charAt(element.m_cPoints_.length() - 1))) {
2341 ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
2342 .charAt(element.m_cPoints_.length() - 1));
2344 // First we need to check if contractions starts with a surrogate
2345 // int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
2347 // If there are any Jamos in the contraction, we should turn on special
2348 // processing for Jamos
2349 if (isJamo(element.m_prefixChars_.charAt(element.m_prefix_))) {
2350 t.m_collator_.m_isJamoSpecial_ = true;
2352 // then we need to deal with it
2353 // we could aready have something in table - or we might not
2354 if (!isPrefix(CE)) {
2355 // if it wasn't contraction, we wouldn't end up here
2356 int firstContractionOffset = addContraction(contractions,
2357 CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
2358 int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
2359 addContraction(contractions, firstContractionOffset,
2360 element.m_prefixChars_.charAt(element.m_prefix_), newCE);
2361 addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
2363 CE = constructSpecialCE(CE_SPEC_PROC_TAG_, firstContractionOffset);
2365 // we are adding to existing contraction
2366 // there were already some elements in the table, so we need to add
2367 // a new contraction
2368 // Two things can happen here: either the codepoint is already in
2369 // the table, or it is not
2370 char ch = element.m_prefixChars_.charAt(element.m_prefix_);
2371 int position = findCP(contractions, CE, ch);
2373 // if it is we just continue down the chain
2374 int eCE = getCE(contractions, CE, position);
2375 int newCE = processContraction(contractions, element, eCE);
2376 setContraction(contractions, CE, position, ch, newCE);
2378 // if it isn't, we will have to create a new sequence
2379 processContraction(contractions, element, CE_NOT_FOUND_);
2380 insertContraction(contractions, CE, ch, element.m_mapCE_);
2384 element.m_cPoints_ = oldCP;
2385 element.m_cPointsOffset_ = oldCPOffset;
2391 * Checks if the argument ce is a contraction
2395 * @return true if argument ce is a contraction
2397 private static final boolean isContraction(int CE) {
2398 return isSpecial(CE) && (getCETag(CE) == CE_CONTRACTION_TAG_);
2402 * Checks if the argument ce has a prefix
2406 * @return true if argument ce has a prefix
2408 private static final boolean isPrefix(int CE) {
2409 return isSpecial(CE) && (getCETag(CE) == CE_SPEC_PROC_TAG_);
2413 * Checks if the argument ce is special
2417 * @return true if argument ce is special
2419 private static final boolean isSpecial(int CE) {
2420 return (CE & RuleBasedCollator.CE_SPECIAL_FLAG_) == 0xF0000000;
     * Gets the tag field of the argument CE.
     * @return the tag bits of the argument CE
2430 private static final int getCETag(int CE) {
2431 return (CE & RuleBasedCollator.CE_TAG_MASK_) >>> RuleBasedCollator.CE_TAG_SHIFT_;
2435 * Gets the ce at position in contraction table
2440 * offset to the contraction table
2443 private static final int getCE(ContractionTable table, int element,
2445 element &= 0xFFFFFF;
2446 BasicContractionTable tbl = getBasicContractionTable(table, element);
2449 return CE_NOT_FOUND_;
2451 if (position > tbl.m_CEs_.size() || position == -1) {
2452 return CE_NOT_FOUND_;
2454 return tbl.m_CEs_.get(position).intValue();
2459 * Sets the unsafe character
2464 * character to be added
2466 private static final void unsafeCPSet(byte table[], char c) {
2468 if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2469 if (hash >= 0xd800 && hash <= 0xf8ff) {
2470 // Part of a surrogate, or in private use area.
2471 // These don't go in the table
2474 hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2476 table[hash >> 3] |= (1 << (hash & 7));
2480 * Sets the contraction end character
2483 * contraction end table
2485 * character to be added
2487 private static final void ContrEndCPSet(byte table[], char c) {
2489 if (hash >= (UNSAFECP_TABLE_SIZE_ << 3)) {
2490 hash = (hash & UNSAFECP_TABLE_MASK_) + 256;
2492 table[hash >> 3] |= (1 << (hash & 7));
     * Adds more contractions to the table. If the element is non-existent, it
     * creates one. Returns the element handle.
2502 * offset to the contraction table
2506 * @return collation element
2508 private static int addContraction(ContractionTable table, int element,
2509 char codePoint, int value) {
2510 BasicContractionTable tbl = getBasicContractionTable(table, element);
2512 tbl = addAContractionElement(table);
2513 element = table.m_elements_.size() - 1;
2516 tbl.m_CEs_.add(Integer.valueOf(value));
2517 tbl.m_codePoints_.append(codePoint);
2518 return constructSpecialCE(table.m_currentTag_, element);
2522 * Adds a contraction element to the table
2525 * contraction table to update
2526 * @return contraction
2528 private static BasicContractionTable addAContractionElement(
2529 ContractionTable table) {
2530 BasicContractionTable result = new BasicContractionTable();
2531 table.m_elements_.add(result);
2536 * Constructs a special ce
2542 * @return a contraction ce
2544 private static final int constructSpecialCE(int tag, int CE) {
2545 return RuleBasedCollator.CE_SPECIAL_FLAG_
2546 | (tag << RuleBasedCollator.CE_TAG_SHIFT_) | (CE & 0xFFFFFF);
2550 * Sets and inserts the element that has a contraction
2552 * @param contractions
2555 * contracting element
2557 * @return contraction ce
    /**
     * Sets and inserts an element that has a contraction, recursing one code
     * point at a time down the contraction chain.
     * <p>
     * Base case: one code point left — the element's mapped CE replaces (or is
     * merged into) the existing entry. Recursive case: advance the offset and
     * either start a new chain, follow an existing chain, or insert a new
     * branch.
     * <p>
     * NOTE(review): this excerpt is missing interleaved source lines (closing
     * braces, else-branches, trailing arguments). The code below is retained
     * verbatim from the excerpt; do not treat it as compilable as-is.
     *
     * @param contractions contraction table
     * @param element contracting element
     * @param existingCE CE currently stored for this position, or CE_NOT_FOUND_
     * @return contraction CE for the processed chain
     */
    private static int processContraction(ContractionTable contractions,
            Elements element, int existingCE) {
        int firstContractionOffset = 0;
        if (element.m_cPoints_.length() - element.m_cPointsOffset_ == 1) {
            if (isContractionTableElement(existingCE)
                    && getCETag(existingCE) == contractions.m_currentTag_) {
                changeContraction(contractions, existingCE, (char) 0,
                changeContraction(contractions, existingCE, (char) 0xFFFF,
                // can't do just that. existingCe might be a contraction,
                // meaning that we need to do another step
                return element.m_mapCE_;
        // this recursion currently feeds on the only element we have...
        // We will have to copy it in order to accomodate for both backward
        // and forward cycles
        // we encountered either an empty space or a non-contraction element
        // this means we are constructing a new contraction sequence
        element.m_cPointsOffset_++;
        if (!isContractionTableElement(existingCE)) {
            // if it wasn't contraction, we wouldn't end up here
            firstContractionOffset = addContraction(contractions,
                    CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, existingCE);
            int newCE = processContraction(contractions, element, CE_NOT_FOUND_);
            addContraction(contractions, firstContractionOffset,
                    element.m_cPoints_.charAt(element.m_cPointsOffset_), newCE);
            addContraction(contractions, firstContractionOffset, (char) 0xFFFF,
            existingCE = constructSpecialCE(contractions.m_currentTag_,
                    firstContractionOffset);
            // we are adding to existing contraction
            // there were already some elements in the table, so we need to add
            // a new contraction
            // Two things can happen here: either the codepoint is already in
            // the table, or it is not
            int position = findCP(contractions, existingCE, element.m_cPoints_
                    .charAt(element.m_cPointsOffset_));
                // if it is we just continue down the chain
                int eCE = getCE(contractions, existingCE, position);
                int newCE = processContraction(contractions, element, eCE);
                setContraction(contractions, existingCE, position,
                        element.m_cPoints_.charAt(element.m_cPointsOffset_),
                // if it isn't, we will have to create a new sequence
                int newCE = processContraction(contractions, element,
                insertContraction(contractions, existingCE, element.m_cPoints_
                        .charAt(element.m_cPointsOffset_), newCE);
        // undo the offset advance before unwinding the recursion
        element.m_cPointsOffset_--;
2623 * Checks if CE belongs to the contraction table
2626 * collation element to test
2627 * @return true if CE belongs to the contraction table
2629 private static final boolean isContractionTableElement(int CE) {
2630 return isSpecial(CE)
2631 && (getCETag(CE) == CE_CONTRACTION_TAG_ || getCETag(CE) == CE_SPEC_PROC_TAG_);
2635 * Gets the codepoint
2640 * offset to the contraction element in the table
2642 * code point to look for
2643 * @return the offset to the code point
2645 private static int findCP(ContractionTable table, int element,
2647 BasicContractionTable tbl = getBasicContractionTable(table, element);
2653 while (codePoint > tbl.m_codePoints_.charAt(position)) {
2655 if (position > tbl.m_codePoints_.length()) {
2659 if (codePoint == tbl.m_codePoints_.charAt(position)) {
2667 * Gets the contraction element out of the contraction table
2672 * to the element in the contraction table
2673 * @return basic contraction element at offset in the contraction table
2675 private static final BasicContractionTable getBasicContractionTable(
2676 ContractionTable table, int offset) {
2678 if (offset == 0xFFFFFF) {
2681 return table.m_elements_.get(offset);
2685 * Changes the contraction element
2690 * offset to the element in the contraction table
2694 * new collation element
2695 * @return basic contraction element at offset in the contraction table
2697 private static final int changeContraction(ContractionTable table,
2698 int element, char codePoint, int newCE) {
2699 BasicContractionTable tbl = getBasicContractionTable(table, element);
2704 while (codePoint > tbl.m_codePoints_.charAt(position)) {
2706 if (position > tbl.m_codePoints_.length()) {
2707 return CE_NOT_FOUND_;
2710 if (codePoint == tbl.m_codePoints_.charAt(position)) {
2711 tbl.m_CEs_.set(position, Integer.valueOf(newCE));
2712 return element & 0xFFFFFF;
2714 return CE_NOT_FOUND_;
     * Sets a part of a contraction sequence in the table. If the element is
     * non-existent, it creates one. Returns the element handle.
2725 * offset to the contraction table
2728 * contraction character
2731 * @return new contraction ce
2733 private static final int setContraction(ContractionTable table,
2734 int element, int offset, char codePoint, int value) {
2735 element &= 0xFFFFFF;
2736 BasicContractionTable tbl = getBasicContractionTable(table, element);
2738 tbl = addAContractionElement(table);
2739 element = table.m_elements_.size() - 1;
2742 tbl.m_CEs_.set(offset, Integer.valueOf(value));
2743 tbl.m_codePoints_.setCharAt(offset, codePoint);
2744 return constructSpecialCE(table.m_currentTag_, element);
     * Inserts a part of a contraction sequence into the table. Sequences behind
     * the offset are moved back. If the element is non-existent, it creates one.
2754 * offset to the table contraction
2758 * collation element value
2759 * @return contraction collation element
2761 private static final int insertContraction(ContractionTable table,
2762 int element, char codePoint, int value) {
2763 element &= 0xFFFFFF;
2764 BasicContractionTable tbl = getBasicContractionTable(table, element);
2766 tbl = addAContractionElement(table);
2767 element = table.m_elements_.size() - 1;
2771 while (tbl.m_codePoints_.charAt(offset) < codePoint
2772 && offset < tbl.m_codePoints_.length()) {
2776 tbl.m_CEs_.add(offset, Integer.valueOf(value));
2777 tbl.m_codePoints_.insert(offset, codePoint);
2779 return constructSpecialCE(table.m_currentTag_, element);
    /**
     * Commits a fully-parsed element into the build table: contractions go
     * through addContraction(); single code points either merge with an
     * existing contraction entry or are written directly into the trie
     * mapping. Completely-ignorable elements only mark their characters
     * unsafe.
     * <p>
     * NOTE(review): this excerpt is missing interleaved source lines (closing
     * braces, else-branches, trailing arguments). The code below is retained
     * verbatim from the excerpt; do not treat it as compilable as-is.
     *
     * @param t build table to update
     * @param element element to finalize
     * @return the CE stored (or found) for the element's first code point
     */
    private final static int finalizeAddition(BuildTable t, Elements element) {
        int CE = CE_NOT_FOUND_;
        // This should add a completely ignorable element to the
        // unsafe table, so that backward iteration will skip
        // over it when treating contractions.
        if (element.m_mapCE_ == 0) {
            for (int i = 0; i < element.m_cPoints_.length(); i++) {
                char ch = element.m_cPoints_.charAt(i);
                if (!UTF16.isTrailSurrogate(ch)) {
                    unsafeCPSet(t.m_unsafeCP_, ch);
        if (element.m_cPoints_.length() - element.m_cPointsOffset_ > 1) {
            // we're adding a contraction
            int cp = UTF16.charAt(element.m_cPoints_, element.m_cPointsOffset_);
            CE = t.m_mapping_.getValue(cp);
            CE = addContraction(t, CE, element);
            // single code point: look up what is already mapped for it
            CE = t.m_mapping_.getValue(element.m_cPoints_
                    .charAt(element.m_cPointsOffset_));
            if (CE != CE_NOT_FOUND_) {
                if (isContractionTableElement(CE)) {
                    // adding a non contraction element (thai, expansion,
                    // single) to already existing contraction
                    if (!isPrefix(element.m_mapCE_)) {
                        // we cannot reenter prefix elements - as we are going
                        // to create a dead loop
                        // Only expansions and regular CEs can go here...
                        // Contractions will never happen in this place
                        setContraction(t.m_contractions_, CE, 0, (char) 0,
                        // This loop has to change the CE at the end of
                        // contraction REDO!
                        changeLastCE(t.m_contractions_, CE, element.m_mapCE_);
                        .setValue(element.m_cPoints_
                                .charAt(element.m_cPointsOffset_),
                if (element.m_prefixChars_ != null
                        && element.m_prefixChars_.length() > 0
                        && getCETag(CE) != CE_IMPLICIT_TAG_) {
                    // Add CE for standalone precontext char.
                    Elements origElem = new Elements();
                    origElem.m_prefixChars_ = null;
                    origElem.m_uchars_ = element.m_cPoints_;
                    origElem.m_cPoints_ = origElem.m_uchars_;
                    origElem.m_CEs_[0] = CE;
                    origElem.m_mapCE_ = CE;
                    origElem.m_CELength_ = 1;
                    finalizeAddition(t, origElem);
                // nothing mapped yet: write the element's CE into the trie
                t.m_mapping_.setValue(element.m_cPoints_
                        .charAt(element.m_cPointsOffset_), element.m_mapCE_);
2857 * Note regarding surrogate handling: We are interested only in the single
2858 * or leading surrogates in a contraction. If a surrogate is somewhere else
2859 * in the contraction, it is going to be handled as a pair of code units, as
2860 * it doesn't affect the performance AND handling surrogates specially would
2861 * complicate code way too much.
    /**
     * Adds a contraction to the build table under the CONTRACTION tag.
     * <p>
     * Note regarding surrogate handling (from the original): only single or
     * leading surrogates in a contraction are treated specially; surrogates
     * elsewhere in the contraction are handled as a pair of code units.
     * <p>
     * NOTE(review): this excerpt is missing interleaved source lines (closing
     * braces, else-branches, trailing arguments — including the declaration of
     * {@code cpsize}, presumably the UTF-16 length of {@code cp}). The code
     * below is retained verbatim from the excerpt; do not treat it as
     * compilable as-is.
     *
     * @param t build table to update
     * @param CE CE currently mapped for the contraction's first code point
     * @param element element describing the contraction
     * @return the CE (possibly a new contraction special CE) for the mapping
     */
    private static int addContraction(BuildTable t, int CE, Elements element) {
        ContractionTable contractions = t.m_contractions_;
        contractions.m_currentTag_ = CE_CONTRACTION_TAG_;
        // First we need to check if contractions starts with a surrogate
        int cp = UTF16.charAt(element.m_cPoints_, 0);
        if (UCharacter.isSupplementary(cp)) {
        if (cpsize < element.m_cPoints_.length()) {
            // This is a real contraction, if there are other characters after
            int size = element.m_cPoints_.length() - element.m_cPointsOffset_;
            for (int j = 1; j < size; j++) {
                // First add contraction chars to unsafe CP hash table
                // Unless it is a trail surrogate, which is handled
                // algorithmically and shouldn't take up space in the table.
                if (!UTF16.isTrailSurrogate(element.m_cPoints_
                        .charAt(element.m_cPointsOffset_ + j))) {
                    unsafeCPSet(t.m_unsafeCP_, element.m_cPoints_
                            .charAt(element.m_cPointsOffset_ + j));
            // Add the last char of the contraction to the contraction-end
            // hash table. unless it is a trail surrogate, which is handled
            // algorithmically and shouldn't be in the table
            if (!UTF16.isTrailSurrogate(element.m_cPoints_
                    .charAt(element.m_cPoints_.length() - 1))) {
                ContrEndCPSet(t.m_contrEndCP_, element.m_cPoints_
                        .charAt(element.m_cPoints_.length() - 1));
            // If there are any Jamos in the contraction, we should turn on
            // special processing for Jamos
            if (isJamo(element.m_cPoints_.charAt(element.m_cPointsOffset_))) {
                t.m_collator_.m_isJamoSpecial_ = true;
            // then we need to deal with it
            // we could aready have something in table - or we might not
            element.m_cPointsOffset_ += cpsize;
            if (!isContraction(CE)) {
                // if it wasn't contraction, we wouldn't end up here
                int firstContractionOffset = addContraction(contractions,
                        CONTRACTION_TABLE_NEW_ELEMENT_, (char) 0, CE);
                int newCE = processContraction(contractions, element,
                addContraction(contractions, firstContractionOffset,
                        element.m_cPoints_.charAt(element.m_cPointsOffset_),
                addContraction(contractions, firstContractionOffset,
                CE = constructSpecialCE(CE_CONTRACTION_TAG_,
                        firstContractionOffset);
                // we are adding to existing contraction
                // there were already some elements in the table, so we need to
                // add a new contraction
                // Two things can happen here: either the codepoint is already
                // in the table, or it is not
                int position = findCP(contractions, CE, element.m_cPoints_
                        .charAt(element.m_cPointsOffset_));
                    // if it is we just continue down the chain
                    int eCE = getCE(contractions, CE, position);
                    int newCE = processContraction(contractions, element, eCE);
                    element.m_cPoints_.charAt(element.m_cPointsOffset_),
                    // if it isn't, we will have to create a new sequence
                    int newCE = processContraction(contractions, element,
                    insertContraction(contractions, CE, element.m_cPoints_
                            .charAt(element.m_cPointsOffset_), newCE);
            // undo the offset advance and publish the (possibly new) CE
            element.m_cPointsOffset_ -= cpsize;
            t.m_mapping_.setValue(cp, CE);
        } else if (!isContraction(CE)) {
            // this is just a surrogate, and there is no contraction
            t.m_mapping_.setValue(cp, element.m_mapCE_);
            // fill out the first stage of the contraction with the surrogate
            changeContraction(contractions, CE, (char) 0, element.m_mapCE_);
            changeContraction(contractions, CE, (char) 0xFFFF, element.m_mapCE_);
2958 * this is for adding non contractions
2963 * offset to the contraction table
2965 * collation element value
2966 * @return new collation element
2968 private static final int changeLastCE(ContractionTable table, int element,
2970 BasicContractionTable tbl = getBasicContractionTable(table, element);
2975 tbl.m_CEs_.set(tbl.m_CEs_.size() - 1, Integer.valueOf(value));
2976 return constructSpecialCE(table.m_currentTag_, element & 0xFFFFFF);
2980 * Given a set of ranges calculated by allocWeights(), iterate through the
2981 * weights. Sets the next weight in cegenerator.m_current_.
2983 * @param cegenerator
2984 * object that contains ranges weight range array and its
2986 * @return the next weight
2988 private static int nextWeight(CEGenerator cegenerator) {
2989 if (cegenerator.m_rangesLength_ > 0) {
2990 // get maxByte from the .count field
2991 int maxByte = cegenerator.m_ranges_[0].m_count_;
2992 // get the next weight
2993 int weight = cegenerator.m_ranges_[0].m_start_;
2994 if (weight == cegenerator.m_ranges_[0].m_end_) {
2995 // this range is finished, remove it and move the following
2997 cegenerator.m_rangesLength_--;
2998 if (cegenerator.m_rangesLength_ > 0) {
2999 System.arraycopy(cegenerator.m_ranges_, 1,
3000 cegenerator.m_ranges_, 0,
3001 cegenerator.m_rangesLength_);
3002 cegenerator.m_ranges_[0].m_count_ = maxByte;
3003 // keep maxByte in ranges[0]
3006 // increment the weight for the next value
3007 cegenerator.m_ranges_[0].m_start_ = incWeight(weight,
3008 cegenerator.m_ranges_[0].m_length2_, maxByte);
3016 * Increment the collation weight
3022 * @return new incremented weight
3024 private static final int incWeight(int weight, int length, int maxByte) {
3026 int b = getWeightByte(weight, length);
3028 return setWeightByte(weight, length, b + 1);
3030 // roll over, set this byte to BYTE_FIRST_TAILORED_ and
3031 // increment the previous one
3032 weight = setWeightByte(weight, length,
3033 RuleBasedCollator.BYTE_FIRST_TAILORED_);
3040 * Gets the weight byte
3046 private static final int getWeightByte(int weight, int index) {
3047 return (weight >> ((4 - index) << 3)) & 0xff;
3051 * Set the weight byte in table
3058 private static final int setWeightByte(int weight, int index, int b) {
3060 // 0xffffffff except a 00 "hole" for the index-th byte
3063 mask = 0xffffffff >>> index;
3065 // Do not use int>>>32 because that does not shift at all
3066 // while we need it to become 0.
3068 // Java Language Specification (Third Edition) 15.19 Shift Operators:
3069 // "If the promoted type of the left-hand operand is int,
3070 // only the five lowest-order bits of the right-hand operand
3071 // are used as the shift distance.
3072 // It is as if the right-hand operand were subjected to
3073 // a bitwise logical AND operator & (§15.22.1) with the mask value 0x1f.
3074 // The shift distance actually used is therefore
3075 // always in the range 0 to 31, inclusive."
3079 mask |= 0xffffff00 << index;
3080 return (weight & mask) | (b << index);
3084 * Call getWeightRanges and then determine heuristically which ranges to use
3085 * for a given number of weights between (excluding) two limits
    /**
     * Calls getWeightRanges() and then decides heuristically which ranges to
     * use to fit {@code n} weights strictly between two limits, lengthening
     * and/or splitting ranges until they have enough capacity. On success the
     * ranges are sorted by weight and ranges[0].m_count_ is repurposed to
     * carry maxByte for nextWeight().
     * <p>
     * NOTE(review): this excerpt is missing interleaved source lines (closing
     * braces, loop headers, trailing arguments, and some declarations such as
     * {@code maxCount}). The code below is retained verbatim from the excerpt;
     * do not treat it as compilable as-is.
     *
     * @param lowerLimit weight below the range (excluded)
     * @param upperLimit weight above the range (excluded)
     * @param n number of weights to allocate
     * @param maxByte maximum usable byte value
     * @param ranges output array of weight ranges
     * @return presumably the resulting range count — TODO confirm against the
     *         missing return statements
     */
    private int allocateWeights(int lowerLimit, int upperLimit, int n,
            int maxByte, WeightRange ranges[]) {
        // number of usable byte values 3..maxByte
        int countBytes = maxByte - RuleBasedCollator.BYTE_FIRST_TAILORED_ + 1;
        // [0] unused, [5] to make index checks unnecessary, m_utilCountBuffer_
        // countBytes to the power of index, m_utilLongBuffer_ for unsignedness
        // gcc requires explicit initialization
        m_utilLongBuffer_[0] = 1;
        m_utilLongBuffer_[1] = countBytes;
        m_utilLongBuffer_[2] = m_utilLongBuffer_[1] * countBytes;
        m_utilLongBuffer_[3] = m_utilLongBuffer_[2] * countBytes;
        m_utilLongBuffer_[4] = m_utilLongBuffer_[3] * countBytes;
        int rangeCount = getWeightRanges(lowerLimit, upperLimit, maxByte,
                countBytes, ranges);
        if (rangeCount <= 0) {
        // what is the maximum number of weights with these ranges?
        for (int i = 0; i < rangeCount; ++i) {
            maxCount += (long) ranges[i].m_count_
                    * m_utilLongBuffer_[4 - ranges[i].m_length_];
        // set the length2 and count2 fields
        for (int i = 0; i < rangeCount; ++i) {
            ranges[i].m_length2_ = ranges[i].m_length_;
            ranges[i].m_count2_ = ranges[i].m_count_;
        // try until we find suitably large ranges
        // get the smallest number of bytes in a range
        int minLength = ranges[0].m_length2_;
        // sum up the number of elements that fit into ranges of each byte
        Arrays.fill(m_utilCountBuffer_, 0);
        for (int i = 0; i < rangeCount; ++i) {
            m_utilCountBuffer_[ranges[i].m_length2_] += ranges[i].m_count2_;
        // now try to allocate n elements in the available short ranges
        if (n <= m_utilCountBuffer_[minLength]
                + m_utilCountBuffer_[minLength + 1]) {
            // trivial cases, use the first few ranges
            maxCount += ranges[rangeCount].m_count2_;
        } while (n > maxCount);
        } else if (n <= ranges[0].m_count2_ * countBytes) {
            // easy case, just make this one range large enough by
            // lengthening it once more, possibly split it
            // calculate how to split the range between maxLength-1
            // (count1) and maxLength (count2)
            long power_1 = m_utilLongBuffer_[minLength
                    - ranges[0].m_length_];
            long power = power_1 * countBytes;
            int count2 = (int) ((n + power - 1) / power);
            int count1 = ranges[0].m_count_ - count2;
            // lengthen the entire range to maxLength
            lengthenRange(ranges, 0, maxByte, countBytes);
            // really split the range
            // create a new range with the end and initial and current
            // length of the old one
            ranges[1].m_end_ = ranges[0].m_end_;
            ranges[1].m_length_ = ranges[0].m_length_;
            ranges[1].m_length2_ = minLength;
            // set the end of the first range according to count1
            int i = ranges[0].m_length_;
            int b = getWeightByte(ranges[0].m_start_, i) + count1 - 1;
            // ranges[0].count and count1 may be >countBytes from
            // merging adjacent ranges; b > maxByte is possible
            ranges[0].m_end_ = setWeightByte(ranges[0].m_start_, i,
            ranges[0].m_end_ = setWeightByte(incWeight(
                    ranges[0].m_start_, i - 1, maxByte), i, b
            // set the bytes in the end weight at length + 1..length2
            b = (maxByte << 24) | (maxByte << 16) | (maxByte << 8)
                    | maxByte; // this used to be 0xffffffff
            ranges[0].m_end_ = truncateWeight(ranges[0].m_end_, i)
                    | (b >>> (i << 3)) & (b << ((4 - minLength) << 3));
            // set the start of the second range to immediately follow
            // the end of the first one
            ranges[1].m_start_ = incWeight(ranges[0].m_end_, minLength,
            // set the count values (informational)
            ranges[0].m_count_ = count1;
            ranges[1].m_count_ = count2;
            ranges[0].m_count2_ = (int) (count1 * power_1);
            // will be *countBytes when lengthened
            ranges[1].m_count2_ = (int) (count2 * power_1);
            // lengthen the second range to maxLength
            lengthenRange(ranges, 1, maxByte, countBytes);
        // no good match, lengthen all minLength ranges and iterate
        for (int i = 0; ranges[i].m_length2_ == minLength; ++i) {
            lengthenRange(ranges, i, maxByte, countBytes);
        if (rangeCount > 1) {
            // sort the ranges by weight values
            Arrays.sort(ranges, 0, rangeCount);
        // set maxByte in ranges[0] for ucol_nextWeight()
        ranges[0].m_count_ = maxByte;
3223 * Updates the range length
3226 * weight range array
3228 * to weight range array
3231 * @return new length
3233 private static final int lengthenRange(WeightRange range[], int offset,
3234 int maxByte, int countBytes) {
3235 int length = range[offset].m_length2_ + 1;
3236 range[offset].m_start_ = setWeightTrail(range[offset].m_start_, length,
3237 RuleBasedCollator.BYTE_FIRST_TAILORED_);
3238 range[offset].m_end_ = setWeightTrail(range[offset].m_end_, length,
3240 range[offset].m_count2_ *= countBytes;
3241 range[offset].m_length2_ = length;
3251 * @return new weight
3253 private static final int setWeightTrail(int weight, int length, int trail) {
3254 length = (4 - length) << 3;
3255 return (weight & (0xffffff00 << length)) | (trail << length);
3259 * take two CE weights and calculate the possible ranges of weights between
3260 * the two limits, excluding them for weights with up to 4 bytes there are
3261 * up to 2*4-1=7 ranges
3268 * @return weight ranges
    /**
     * Takes two CE weights and calculates the possible ranges of weights
     * strictly between the two limits. For weights of up to 4 bytes there are
     * up to 2*4-1 = 7 ranges: "lower" ranges above the lower limit at each
     * byte length, a single middle range, and "upper" ranges below the upper
     * limit at each byte length; overlapping or adjacent ranges are then
     * merged or eliminated.
     * <p>
     * NOTE(review): this excerpt is missing interleaved source lines (closing
     * braces, early returns, trailing arguments, and the declaration of
     * {@code rangeCount}). The code below is retained verbatim from the
     * excerpt; do not treat it as compilable as-is.
     *
     * @param lowerLimit weight below the ranges (excluded)
     * @param upperLimit weight above the ranges (excluded)
     * @param maxByte maximum usable byte value
     * @param countBytes number of usable byte values per position
     * @param ranges output array of weight ranges
     * @return number of ranges written (per the copy loop at the end)
     */
    private int getWeightRanges(int lowerLimit, int upperLimit, int maxByte,
            int countBytes, WeightRange ranges[]) {
        // assume that both lowerLimit & upperLimit are not 0
        // get the lengths of the limits
        int lowerLength = lengthOfWeight(lowerLimit);
        int upperLength = lengthOfWeight(upperLimit);
        if (Utility.compareUnsigned(lowerLimit, upperLimit) >= 0) {
        // check that neither is a prefix of the other
        if (lowerLength < upperLength) {
            if (lowerLimit == truncateWeight(upperLimit, lowerLength)) {
        // if the upper limit is a prefix of the lower limit then the earlier
        // test lowerLimit >= upperLimit has caught it
        // reset local variables
        // With the limit lengths of 1..4, there are up to 7 ranges for
        // range minimum length
        // We are now going to calculate up to 7 ranges.
        // Some of them will typically overlap, so we will then have to merge
        // and eliminate ranges.
        // We have to clean cruft from previous invocations
        // before doing anything. C++ already does that
        for (int length = 0; length < 5; length++) {
            m_utilLowerWeightRange_[length].clear();
            m_utilUpperWeightRange_[length].clear();
        m_utilWeightRange_.clear();
        // build the "lower" ranges: weights above lowerLimit at each length
        int weight = lowerLimit;
        for (int length = lowerLength; length >= 2; --length) {
            m_utilLowerWeightRange_[length].clear();
            int trail = getWeightByte(weight, length);
            if (trail < maxByte) {
                m_utilLowerWeightRange_[length].m_start_ = incWeightTrail(
                m_utilLowerWeightRange_[length].m_end_ = setWeightTrail(weight,
                m_utilLowerWeightRange_[length].m_length_ = length;
                m_utilLowerWeightRange_[length].m_count_ = maxByte - trail;
            weight = truncateWeight(weight, length - 1);
        m_utilWeightRange_.m_start_ = incWeightTrail(weight, 1);
        // build the "upper" ranges: weights below upperLimit at each length
        weight = upperLimit;
        // [0] and [1] are not used - this simplifies indexing,
        // m_utilUpperWeightRange_
        for (int length = upperLength; length >= 2; length--) {
            int trail = getWeightByte(weight, length);
            if (trail > RuleBasedCollator.BYTE_FIRST_TAILORED_) {
                m_utilUpperWeightRange_[length].m_start_ = setWeightTrail(
                        weight, length, RuleBasedCollator.BYTE_FIRST_TAILORED_);
                m_utilUpperWeightRange_[length].m_end_ = decWeightTrail(weight,
                m_utilUpperWeightRange_[length].m_length_ = length;
                m_utilUpperWeightRange_[length].m_count_ = trail
                        - RuleBasedCollator.BYTE_FIRST_TAILORED_;
            weight = truncateWeight(weight, length - 1);
        m_utilWeightRange_.m_end_ = decWeightTrail(weight, 1);
        // set the middle range
        m_utilWeightRange_.m_length_ = 1;
        if (Utility.compareUnsigned(m_utilWeightRange_.m_end_,
                m_utilWeightRange_.m_start_) >= 0) {
            // if (m_utilWeightRange_.m_end_ >= m_utilWeightRange_.m_start_) {
            m_utilWeightRange_.m_count_ = ((m_utilWeightRange_.m_end_ - m_utilWeightRange_.m_start_) >>> 24) + 1;
            // eliminate overlaps
            // remove the middle range
            m_utilWeightRange_.m_count_ = 0;
            // reduce or remove the lower ranges that go beyond upperLimit
            for (int length = 4; length >= 2; --length) {
                if (m_utilLowerWeightRange_[length].m_count_ > 0
                        && m_utilUpperWeightRange_[length].m_count_ > 0) {
                    int start = m_utilUpperWeightRange_[length].m_start_;
                    int end = m_utilLowerWeightRange_[length].m_end_;
                            || incWeight(end, length, maxByte) == start) {
                        // lower and upper ranges collide or are directly
                        // adjacent: merge these two and remove all shorter
                        start = m_utilLowerWeightRange_[length].m_start_;
                        end = m_utilLowerWeightRange_[length].m_end_ = m_utilUpperWeightRange_[length].m_end_;
                        // merging directly adjacent ranges needs to subtract
                        // the 0/1 gaps in between;
                        // it may result in a range with count>countBytes
                        m_utilLowerWeightRange_[length].m_count_ = getWeightByte(
                                - getWeightByte(start, length)
                                * (getWeightByte(end, length - 1) - getWeightByte(
                                        start, length - 1));
                        m_utilUpperWeightRange_[length].m_count_ = 0;
                        while (--length >= 2) {
                            m_utilLowerWeightRange_[length].m_count_ = m_utilUpperWeightRange_[length].m_count_ = 0;
        // copy the ranges, shortest first, into the result array
        if (m_utilWeightRange_.m_count_ > 0) {
            ranges[0] = new WeightRange(m_utilWeightRange_);
        for (int length = 2; length <= 4; ++length) {
            // copy upper first so that later the middle range is more likely
            // the first one to use
            if (m_utilUpperWeightRange_[length].m_count_ > 0) {
                ranges[rangeCount] = new WeightRange(
                        m_utilUpperWeightRange_[length]);
            if (m_utilLowerWeightRange_[length].m_count_ > 0) {
                ranges[rangeCount] = new WeightRange(
                        m_utilLowerWeightRange_[length]);
3412 * Truncates the weight with length
3416 * @return truncated weight
3418 private static final int truncateWeight(int weight, int length) {
3419 return weight & (0xffffffff << ((4 - length) << 3));
3423 * Length of the weight
3426 * @return length of the weight
3428 private static final int lengthOfWeight(int weight) {
3429 if ((weight & 0xffffff) == 0) {
3431 } else if ((weight & 0xffff) == 0) {
3433 } else if ((weight & 0xff) == 0) {
3440 * Increment the weight trail
3444 * @return new weight
3446 private static final int incWeightTrail(int weight, int length) {
3447 return weight + (1 << ((4 - length) << 3));
3451 * Decrement the weight trail
3455 * @return new weight
3457 private static int decWeightTrail(int weight, int length) {
3458 return weight - (1 << ((4 - length) << 3));
/**
 * Gets the offset of a code point inside a basic contraction table by a
 * linear scan over the table's code-point string.
 *
 * @param tbl basic contraction table to search
 * @param codePoint code point to look for
 * @return the offset to the code point
 */
3470 private static int findCP(BasicContractionTable tbl, char codePoint) {
// NOTE(review): the initialization of `position` is elided in this listing.
// advance while the sought code point sorts after the current entry
3472 while (codePoint > tbl.m_codePoints_.charAt(position)) {
// ran past the end of the table: not found (branch body elided)
3474 if (position > tbl.m_codePoints_.length()) {
// exact match at `position` (branch body elided; presumably returns it)
3478 if (codePoint == tbl.m_codePoints_.charAt(position)) {
/**
 * Finds a contraction CE: looks up the collation element mapped to
 * character {@code ch} in the contraction addressed by {@code element}.
 *
 * @param table contraction table, may be null
 * @param element contraction-table element selecting a basic table
 * @param ch character to look up
 * @return the CE for ch, or CE_NOT_FOUND_ if absent
 */
3493 private static int findCE(ContractionTable table, int element, char ch) {
3494 if (table == null) {
3495 return CE_NOT_FOUND_;
3497 BasicContractionTable tbl = getBasicContractionTable(table, element);
// no basic table registered for this element
3499 return CE_NOT_FOUND_;
3501 int position = findCP(tbl, ch);
// findCP reports "absent" via an out-of-range or negative offset
3502 if (position > tbl.m_CEs_.size() || position < 0) {
3503 return CE_NOT_FOUND_;
3505 return tbl.m_CEs_.get(position).intValue();
/**
 * Checks if the string is tailored in the contraction: walks the
 * contraction table one character at a time, starting at array[offset],
 * until the 0 terminator.
 *
 * @param table contraction table
 * @param element starting contraction element
 * @param array character array to check, 0-terminated
 * @param offset start index into array
 * @return true if it is tailored
 */
3520 private static boolean isTailored(ContractionTable table, int element,
3521 char array[], int offset) {
// follow the contraction chain character by character
3522 while (array[offset] != 0) {
3523 element = findCE(table, element, array[offset]);
3524 if (element == CE_NOT_FOUND_) {
// a non-contraction element terminates the walk (return elided)
3527 if (!isContractionTableElement(element)) {
// tailored when the final element maps char 0 to a real CE
3532 if (getCE(table, element, 0) != CE_NOT_FOUND_) {
/**
 * Assemble RuleBasedCollator: copies the build-time tables (expansions,
 * contractions, serialized trie mapping, max-expansion tables, and the
 * unsafe-character / contraction-ending bit tables) from the BuildTable
 * into the target collator, and applies the parsed options.
 *
 * @param t build table holding intermediate collation data
 * @param collator collator instance to populate
 */
3547 private void assembleTable(BuildTable t, RuleBasedCollator collator) {
3548 IntTrieBuilder mapping = t.m_mapping_;
3549 List<Integer> expansions = t.m_expansions_;
3550 ContractionTable contractions = t.m_contractions_;
3551 MaxExpansionTable maxexpansion = t.m_maxExpansions_;
3553 // contraction offset has to be in since we are building on the
3555 // int beforeContractions = (HEADER_SIZE_
3556 // + paddedsize(expansions.size() << 2)) >>> 1;
3557 collator.m_contractionOffset_ = 0;
// flatten the contraction table in place and record its final size
3558 int contractionsSize = constructTable(contractions);
3560 // the following operation depends on the trie data. Therefore, we have
3561 // to do it before the trie is compacted
3562 // sets jamo expansions
3563 getMaxExpansionJamo(mapping, maxexpansion, t.m_maxJamoExpansions_,
3564 collator.m_isJamoSpecial_);
3566 // TODO: LATIN1 array is now in the utrie - it should be removed from
3568 setAttributes(collator, t.m_options_);
// copy expansion CEs into a plain int array on the collator
3570 int size = expansions.size();
3571 collator.m_expansion_ = new int[size];
3572 for (int i = 0; i < size; i++) {
3573 collator.m_expansion_[i] = expansions.get(i).intValue();
3575 // contractions block
3576 if (contractionsSize != 0) {
3577 // copy contraction index
3578 collator.m_contractionIndex_ = new char[contractionsSize];
3579 contractions.m_codePoints_.getChars(0, contractionsSize,
3580 collator.m_contractionIndex_, 0);
3581 // copy contraction collation elements
3582 collator.m_contractionCE_ = new int[contractionsSize];
3583 for (int i = 0; i < contractionsSize; i++) {
3584 collator.m_contractionCE_[i] = contractions.m_CEs_.get(i).intValue();
3587 // copy mapping table
3588 collator.m_trie_ = mapping.serialize(t,
3589 RuleBasedCollator.DataManipulate.getInstance());
3590 // copy max expansion table
3591 // not copying the first element which is a dummy
3592 // to be in synch with icu4c's builder, we continue to use the
3594 // omitting expansion offset in builder
3595 collator.m_expansionOffset_ = 0;
3596 size = maxexpansion.m_endExpansionCE_.size();
// index starts at 1: element 0 is the dummy mentioned above
3597 collator.m_expansionEndCE_ = new int[size - 1];
3598 for (int i = 1; i < size; i++) {
3599 collator.m_expansionEndCE_[i - 1] = maxexpansion.m_endExpansionCE_
3602 collator.m_expansionEndCEMaxSize_ = new byte[size - 1];
3603 for (int i = 1; i < size; i++) {
3604 collator.m_expansionEndCEMaxSize_[i - 1] = maxexpansion.m_expansionCESize_
3605 .get(i).byteValue();
3607 // Unsafe chars table. Finish it off, then copy it.
3609 // Or in unsafebits from UCA, making a combined table.
3610 for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
3611 t.m_unsafeCP_[i] |= RuleBasedCollator.UCA_.m_unsafe_[i];
3613 collator.m_unsafe_ = t.m_unsafeCP_;
3615 // Finish building Contraction Ending chars hash table and then copy it
3617 // Or in unsafebits from UCA, making a combined table
3618 for (int i = 0; i < UNSAFECP_TABLE_SIZE_; i++) {
3619 t.m_contrEndCP_[i] |= RuleBasedCollator.UCA_.m_contractionEnd_[i];
3621 collator.m_contractionEnd_ = t.m_contrEndCP_;
/**
 * Sets the collator's attributes (case-first, decomposition, alternate
 * handling, case level, French collation, Hiragana-4, strength, variable
 * top, script reordering) from the parsed rule options.
 *
 * @param collator collator to configure
 * @param option parsed option set indicating which attribute is to be set
 */
3632 private static final void setAttributes(RuleBasedCollator collator,
3633 CollationRuleParser.OptionSet option) {
// NOTE(review): latinOneFailed_ is toggled true before the attribute
// assignments and back to false after -- presumably to keep the latin-1
// fast path disabled while options are mutated; confirm.
3634 collator.latinOneFailed_ = true;
3635 collator.m_caseFirst_ = option.m_caseFirst_;
3636 collator.setDecomposition(option.m_decomposition_);
3638 .setAlternateHandlingShifted(option.m_isAlternateHandlingShifted_);
3639 collator.setCaseLevel(option.m_isCaseLevel_);
3640 collator.setFrenchCollation(option.m_isFrenchCollation_);
3641 collator.m_isHiragana4_ = option.m_isHiragana4_;
3642 collator.setStrength(option.m_strength_);
3643 collator.m_variableTopValue_ = option.m_variableTopValue_;
3644 collator.m_reorderCodes_ = option.m_scriptOrder_;
3645 collator.latinOneFailed_ = false;
/**
 * Constructing the contraction table: computes a flat offset for each
 * per-element basic contraction table, flattens all of them into the
 * shared code-point string and CE list of {@code table}, then rewrites
 * every contraction CE (including those in the trie mapping) so its
 * offset field points at the flattened position.
 *
 * @param table contraction table to flatten in place
 * @return the resulting size of the flattened contraction data
 */
3655 private int constructTable(ContractionTable table) {
3656 // See how much memory we need
3657 int tsize = table.m_elements_.size();
3661 table.m_offsets_.clear();
// first pass: record the flat offset each element will occupy
3663 for (int i = 0; i < tsize; i++) {
3664 table.m_offsets_.add(Integer.valueOf(position));
3665 position += table.m_elements_.get(i).m_CEs_
3668 table.m_CEs_.clear();
3669 table.m_codePoints_.delete(0, table.m_codePoints_.length());
3670 // Now stuff the things in
3671 StringBuilder cpPointer = table.m_codePoints_;
3672 List<Integer> CEPointer = table.m_CEs_;
3673 for (int i = 0; i < tsize; i++) {
3674 BasicContractionTable bct = table.m_elements_.get(i);
3675 int size = bct.m_CEs_.size();
3678 int offset = CEPointer.size();
// slot 0 of each basic table is its default CE
3679 CEPointer.add(bct.m_CEs_.get(0));
3680 for (int j = 1; j < size; j++) {
3681 char ch = bct.m_codePoints_.charAt(j);
3682 char cc = (char) (UCharacter.getCombiningClass(ch) & 0xFF);
3689 cpPointer.append(ch);
3690 CEPointer.add(bct.m_CEs_.get(j));
// NOTE(review): due to Java operator precedence the next expression
// evaluates as ((ccMin == ccMax) ? 1 : (0 << 8)) | ccMax, i.e. the flag
// lands in the LOW byte; the intent looks like
// (((ccMin == ccMax) ? 1 : 0) << 8) | ccMax -- confirm against the
// reference (ICU4C) builder before changing. (ccMin/ccMax declarations
// are elided in this listing.)
3692 cpPointer.insert(offset,
3693 (char) (((ccMin == ccMax) ? 1 : 0 << 8) | ccMax));
// second pass over this element: patch contraction CEs to flat offsets
3694 for (int j = 0; j < size; j++) {
3695 if (isContractionTableElement(CEPointer.get(offset + j).intValue())) {
3696 int ce = CEPointer.get(offset + j).intValue();
3697 CEPointer.set(offset + j,
3698 Integer.valueOf(constructSpecialCE(getCETag(ce),
3699 table.m_offsets_.get(getContractionOffset(ce))
// finally rewrite contraction CEs stored directly in the trie mapping
3705 for (int i = 0; i <= 0x10FFFF; i++) {
3706 int CE = table.m_mapping_.getValue(i);
3707 if (isContractionTableElement(CE)) {
3708 CE = constructSpecialCE(getCETag(CE),
3709 table.m_offsets_.get(getContractionOffset(CE)).intValue());
3710 table.m_mapping_.setValue(i, CE);
/**
 * Get contraction offset: extracts the low 24-bit offset field from a
 * contraction collation element.
 *
 * @param ce collation element
 * @return contraction offset
 */
3723 private static final int getContractionOffset(int ce) {
3724 return ce & 0xFFFFFF;
/**
 * Gets the maximum Jamo expansion: registers worst-case expansion sizes
 * for Hangul Jamo vowels (V) and trailing consonants (T) in the
 * max-expansion table, using the trie mapping and the collected
 * per-Jamo expansion data.
 *
 * @param mapping trie builder holding the CE mapping
 * @param maxexpansion maximum expansion table to update
 * @param maxjamoexpansion maximum jamo expansion table collected so far
 * @param jamospecial true if a Jamo is special (parameter use elided in
 *            this listing -- confirm against the full source)
 */
3739 private static void getMaxExpansionJamo(IntTrieBuilder mapping,
3740 MaxExpansionTable maxexpansion,
3741 MaxJamoExpansionTable maxjamoexpansion, boolean jamospecial) {
// start at the last V and last T Jamo and walk backwards
3746 int v = VBASE + VCOUNT - 1;
3747 int t = TBASE + TCOUNT - 1;
3749 while (v >= VBASE) {
3750 int ce = mapping.getValue(v);
// only non-special CEs get the fixed V expansion size of 2
3751 if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3752 setMaxExpansion(ce, (byte) 2, maxexpansion);
3757 while (t >= TBASE) {
3758 int ce = mapping.getValue(t);
3759 if ((ce & RuleBasedCollator.CE_SPECIAL_FLAG_) != RuleBasedCollator.CE_SPECIAL_FLAG_) {
3760 setMaxExpansion(ce, (byte) 3, maxexpansion);
3764 // According to the docs, 99% of the time, the Jamo will not be special
3766 // gets the max expansion in all unicode characters
3767 int count = maxjamoexpansion.m_endExpansionCE_.size();
// worst case for a syllable with trailing consonant: L + V + T sizes
3768 byte maxTSize = (byte) (maxjamoexpansion.m_maxLSize_
3769 + maxjamoexpansion.m_maxVSize_ + maxjamoexpansion.m_maxTSize_);
// worst case for a syllable ending in a vowel: L + V sizes
3770 byte maxVSize = (byte) (maxjamoexpansion.m_maxLSize_ + maxjamoexpansion.m_maxVSize_);
// NOTE(review): the loop header decrementing `count` is elided in this
// listing; the branches below dispatch on whether the recorded CE ends
// a V (vowel) or T (trailing) expansion.
3774 if ((maxjamoexpansion.m_isV_.get(count))
3775 .booleanValue() == true) {
3777 (maxjamoexpansion.m_endExpansionCE_
3778 .get(count)).intValue(), maxVSize,
3782 (maxjamoexpansion.m_endExpansionCE_
3783 .get(count)).intValue(), maxTSize,
/**
 * To the UnsafeCP hash table, add all chars with combining class != 0.
 * While scanning the BMP it optionally builds the combining-mark lookup
 * table (t.cmLookup), and finally marks the first char of the composed
 * form of every prefix-table entry as unsafe.
 *
 * @param t build table whose unsafeCP bit table (and cmLookup) to update
 */
3796 private final void unsafeCPAddCCNZ(BuildTable t) {
// non-short-circuit boolean AND: both operands are cheap booleans
3797 boolean buildCMTable = (buildCMTabFlag & (t.cmLookup == null));
3798 char[] cm = null; // combining mark array
// index[cc] counts marks seen so far for each combining class 0..255
3799 int[] index = new int[256];
3803 cm = new char[0x10000];
// NOTE(review): the bound `c < 0xffff` excludes U+FFFF; likely harmless
// since U+FFFF is a noncharacter, but confirm it is intentional.
3805 for (char c = 0; c < 0xffff; c++) {
3807 if (UTF16.isLeadSurrogate(c)) {
// for a lead surrogate, OR together the FCD of all code points it leads
3809 if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(c)) {
3810 int supp = Character.toCodePoint(c, (char)0xdc00);
3811 int suppLimit = supp + 0x400;
3812 while (supp < suppLimit) {
3813 fcd |= m_nfcImpl_.getFCD16FromNormData(supp++);
3817 fcd = m_nfcImpl_.getFCD16(c);
3819 // TODO: review for handling supplementary characters
3820 if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
3821 (UTF16.isLeadSurrogate(c) && fcd != 0)) {
3822 // c is a leading surrogate with some FCD data
3823 unsafeCPSet(t.m_unsafeCP_, c);
// bucket the mark by its trailing combining class
3825 int cc = (fcd & 0xff);
3826 int pos = (cc << 8) + index[cc];
3834 if (t.m_prefixLookup_ != null) {
3835 Enumeration<Elements> els = Collections.enumeration(t.m_prefixLookup_.values());
3836 while (els.hasMoreElements()) {
3837 Elements e = els.nextElement();
3838 // codepoints here are in the NFD form. We need to add the
3839 // first code point of the NFC form to unsafe, because
3840 // strcoll needs to backup over them.
3841 // weiv: This is wrong! See the comment above.
3842 // String decomp = Normalizer.decompose(e.m_cPoints_, true);
3843 // unsafeCPSet(t.m_unsafeCP_, decomp.charAt(0));
3845 String comp = Normalizer.compose(e.m_cPoints_, false);
3846 unsafeCPSet(t.m_unsafeCP_, comp.charAt(0));
3851 t.cmLookup = new CombinClassTable();
3852 t.cmLookup.generate(cm, count, index);
/**
 * Canonical-closure step for one character-category range: for every
 * code point in [start, limit) whose canonical decomposition the
 * tailored collator does not already sort identically to the composed
 * form, builds an element for the composed form (copying the CEs of the
 * decomposition, or reusing an existing prefix mapping) and adds it to
 * the build table.
 *
 * @param t build table being populated
 * @param collator tailored collator used to compare composed/decomposed
 * @param colEl collation element iterator over the tailored collator
 * @param start first code point of the range, inclusive
 * @param limit end of the range, exclusive
 * @param type character category of the range
 * @return NOTE(review): the return statement is elided in this listing;
 *         confirm the contract against the full source.
 */
3871 private boolean enumCategoryRangeClosureCategory(BuildTable t,
3872 RuleBasedCollator collator, CollationElementIterator colEl,
3873 int start, int limit, int type) {
3874 if (type != UCharacterCategory.UNASSIGNED
3875 && type != UCharacterCategory.PRIVATE_USE) {
3876 // if the range is assigned - we might ommit more categories later
3878 for (int u32 = start; u32 < limit; u32++) {
3879 String decomp = m_nfcImpl_.getDecomposition(u32);
3880 if (decomp != null) {
3881 String comp = UCharacter.toString(u32);
// only close over characters where composed != decomposed ordering
3882 if (!collator.equals(comp, decomp)) {
3883 m_utilElement_.m_cPoints_ = decomp;
3884 m_utilElement_.m_prefix_ = 0;
3885 Elements prefix = t.m_prefixLookup_.get(m_utilElement_);
3886 if (prefix == null) {
// no prefix entry: collect the CEs of the decomposition
3887 m_utilElement_.m_cPoints_ = comp;
3888 m_utilElement_.m_prefix_ = 0;
3889 m_utilElement_.m_prefixChars_ = null;
3890 colEl.setText(decomp);
3891 int ce = colEl.next();
3892 m_utilElement_.m_CELength_ = 0;
3893 while (ce != CollationElementIterator.NULLORDER) {
3894 m_utilElement_.m_CEs_[m_utilElement_.m_CELength_++] = ce;
// prefix entry exists: reuse its single mapped CE
3898 m_utilElement_.m_cPoints_ = comp;
3899 m_utilElement_.m_prefix_ = 0;
3900 m_utilElement_.m_prefixChars_ = null;
3901 m_utilElement_.m_CELength_ = 1;
3902 m_utilElement_.m_CEs_[0] = prefix.m_mapCE_;
3903 // This character uses a prefix. We have to add it
3904 // to the unsafe table, as it decomposed form is
3905 // already in. In Japanese, this happens for \u309e
3907 // Since unsafeCPSet is static in ucol_elm, we are
3908 // going to wrap it up in the unsafeCPAddCCNZ
3911 addAnElement(t, m_utilElement_);
3920 * Determine if a character is a Jamo
3924 * @return true if ch is a Jamo, false otherwise
3926 private static final boolean isJamo(char ch) {
3927 return (ch >= 0x1100 && ch <= 0x1112) || (ch >= 0x1175 && ch <= 0x1161)
3928 || (ch >= 0x11A8 && ch <= 0x11C2);
/**
 * Produces canonical closure: builds a temporary collator from the
 * current build table, enumerates all character categories adding
 * composed forms whose ordering differs from their decompositions, and
 * then re-walks the parsed rule tokens to add tail canonical closures
 * for tokens that contain a base character followed by combining marks.
 *
 * @param t build table to close over canonical equivalences
 */
3934 private void canonicalClosure(BuildTable t) {
// build a throwaway collator reflecting the current tailoring
3935 BuildTable temp = new BuildTable(t);
3936 assembleTable(temp, temp.m_collator_);
3937 // produce canonical closure
3938 CollationElementIterator coleiter = temp.m_collator_
3939 .getCollationElementIterator("");
3940 RangeValueIterator typeiter = UCharacter.getTypeIterator();
3941 RangeValueIterator.Element element = new RangeValueIterator.Element();
3942 while (typeiter.next(element)) {
3943 enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter,
3944 element.start, element.limit, element.value);
// move the combining-mark lookup built in temp over to t
3947 t.cmLookup = temp.cmLookup;
3948 temp.cmLookup = null;
3950 for (int i = 0; i < m_parser_.m_resultLength_; i++) {
3951 char baseChar, firstCM;
3952 // now we need to generate the CEs
3953 // We stuff the initial value in the buffers, and increase the
3954 // appropriate buffer according to strength */
3955 // createElements(t, m_parser_.m_listHeader_[i]);
3956 CollationRuleParser.Token tok = m_parser_.m_listHeader_[i].m_first_;
3957 m_utilElement_.clear();
3958 while (tok != null) {
3959 m_utilElement_.m_prefix_ = 0;// el.m_prefixChars_;
3960 m_utilElement_.m_cPointsOffset_ = 0; // el.m_uchars_;
3961 if (tok.m_prefix_ != 0) {
3962 // we will just copy the prefix here, and adjust accordingly
3964 // the addPrefix function in ucol_elm. The reason is that we
3965 // need to add both composed AND decomposed elements to the
// token prefix/source fields pack length in the top byte and
// offset into the rule string in the low 24 bits
3967 int size = tok.m_prefix_ >> 24;
3968 int offset = tok.m_prefix_ & 0x00FFFFFF;
3969 m_utilElement_.m_prefixChars_ = m_parser_.m_source_
3970 .substring(offset, offset + size);
3971 size = (tok.m_source_ >> 24) - (tok.m_prefix_ >> 24);
3972 offset = (tok.m_source_ & 0x00FFFFFF)
3973 + (tok.m_prefix_ >> 24);
3974 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
3975 offset, offset + size);
3977 m_utilElement_.m_prefixChars_ = null;
3978 int offset = tok.m_source_ & 0x00FFFFFF;
3979 int size = tok.m_source_ >>> 24;
3980 m_utilElement_.m_uchars_ = m_parser_.m_source_.substring(
3981 offset, offset + size);
3983 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
3985 baseChar = firstCM = 0; // reset
// find the base character and the first combining mark in the token
3986 for (int j = 0; j < m_utilElement_.m_cPoints_.length()
3987 - m_utilElement_.m_cPointsOffset_; j++) {
3989 int fcd = m_nfcImpl_.getFCD16(m_utilElement_.m_cPoints_.charAt(j)); // TODO: review for handling supplementary characters
// combining class 0 => a base character
3990 if ((fcd & 0xff) == 0) {
3991 baseChar = m_utilElement_.m_cPoints_.charAt(j);
3993 if ((baseChar != 0) && (firstCM == 0)) {
3994 firstCM = m_utilElement_.m_cPoints_.charAt(j); // first
// base char followed by a combining mark: add tail closures too
4001 if ((baseChar != 0) && (firstCM != 0)) {
4002 addTailCanonicalClosures(t, temp.m_collator_, coleiter,
/**
 * Adds canonical closures for a tailored sequence ending in a combining
 * mark: for every combining mark with class below cMark's, composes
 * baseChar+mark; single-char compositions are saved for the
 * multi-combining-mark pass, otherwise the mark is substituted for cMark
 * in the current element and the resulting composed/decomposed pair is
 * added to the build table.
 *
 * @param t build table being populated
 * @param m_collator tailored collator (naming kept from original)
 * @param colEl collation element iterator over the tailored collator
 * @param baseChar base character of the tailored sequence
 * @param cMark combining mark that follows the base character
 */
4010 private void addTailCanonicalClosures(BuildTable t,
4011 RuleBasedCollator m_collator, CollationElementIterator colEl,
4012 char baseChar, char cMark) {
// nothing to do without a combining-mark lookup table
4013 if (t.cmLookup == null) {
4016 CombinClassTable cmLookup = t.cmLookup;
4017 int[] index = cmLookup.index;
4018 int cClass = m_nfcImpl_.getFCD16(cMark) & 0xff; // TODO: review for handling supplementary characters
4020 char[] precompCh = new char[256];
4021 int[] precompClass = new int[256];
4023 Elements element = new Elements();
// consider only marks with combining class strictly below cMark's
4026 maxIndex = index[cClass - 1];
4028 for (int i = 0; i < maxIndex; i++) {
4029 StringBuilder decompBuf = new StringBuilder();
4030 decompBuf.append(baseChar).append(cmLookup.cPoints[i]);
4031 String comp = Normalizer.compose(decompBuf.toString(), false);
// base+mark composed to a single precomposed char: defer to the
// multi-combining-mark pass below
4032 if (comp.length() == 1) {
4033 precompCh[precompLen] = comp.charAt(0);
4034 precompClass[precompLen] = m_nfcImpl_.getFCD16(cmLookup.cPoints[i]) & 0xff; // TODO: review for handling supplementary characters
// otherwise substitute this mark for cMark in the current element
4036 StringBuilder decomp = new StringBuilder();
4037 for (int j = 0; j < m_utilElement_.m_cPoints_.length(); j++) {
4038 if (m_utilElement_.m_cPoints_.charAt(j) == cMark) {
4039 decomp.append(cmLookup.cPoints[i]);
4041 decomp.append(m_utilElement_.m_cPoints_.charAt(j));
4044 comp = Normalizer.compose(decomp.toString(), false);
4045 StringBuilder buf = new StringBuilder(comp);
4047 decomp.append(cMark);
4048 comp = buf.toString();
4050 element.m_cPoints_ = decomp.toString();
4051 element.m_CELength_ = 0;
4052 element.m_prefix_ = 0;
4053 Elements prefix = t.m_prefixLookup_.get(element);
4054 element.m_cPoints_ = comp;
4055 element.m_uchars_ = comp;
4057 if (prefix == null) {
// no prefix entry: collect CEs by iterating the decomposed form
4058 element.m_prefix_ = 0;
4059 element.m_prefixChars_ = null;
4060 colEl.setText(decomp.toString());
4061 int ce = colEl.next();
4062 element.m_CELength_ = 0;
4063 while (ce != CollationElementIterator.NULLORDER) {
4064 element.m_CEs_[element.m_CELength_++] = ce;
// prefix entry exists: reuse its single mapped CE
4068 element.m_cPoints_ = comp;
4069 element.m_prefix_ = 0;
4070 element.m_prefixChars_ = null;
4071 element.m_CELength_ = 1;
4072 element.m_CEs_[0] = prefix.m_mapCE_;
4074 setMapCE(t, element);
4075 finalizeAddition(t, element);
4077 if (comp.length() > 2) {
4078 // This is a fix for tailoring contractions with accented
4079 // character at the end of contraction string.
4080 addFCD4AccentedContractions(t, colEl, comp, element);
4082 if (precompLen > 1) {
4083 precompLen = addMultiCMontractions(t, colEl, element,
4084 precompCh, precompClass, precompLen, cMark, i,
/**
 * Computes and stores element.m_mapCE_ for an element with multiple CEs:
 * either a compact "long primary" special CE (when the element is a
 * two-CE expansion with a three-byte primary, common secondary and
 * common tertiary) or an expansion special CE that references the shared
 * expansion list in {@code t}.
 *
 * @param t build table providing the expansion list and max-expansion table
 * @param element element whose m_CEs_/m_CELength_ are already filled in
 */
4092 private void setMapCE(BuildTable t, Elements element) {
4093 List<Integer> expansions = t.m_expansions_;
4094 element.m_mapCE_ = 0;
4096 if (element.m_CELength_ == 2 // a two CE expansion
4097 && RuleBasedCollator.isContinuation(element.m_CEs_[1])
4098 && (element.m_CEs_[1] & (~(0xFF << 24 | RuleBasedCollator.CE_CONTINUATION_MARKER_))) == 0 // that
4104 && (((element.m_CEs_[0] >> 8) & 0xFF) == RuleBasedCollator.BYTE_COMMON_)
4105 // a common secondary
4106 && ((element.m_CEs_[0] & 0xFF) == RuleBasedCollator.BYTE_COMMON_)) { // and
// pack the three primary bytes into one long-primary special CE
4111 element.m_mapCE_ = RuleBasedCollator.CE_SPECIAL_FLAG_
4112 // a long primary special
4113 | (CE_LONG_PRIMARY_TAG_ << 24)
4114 // first and second byte of primary
4115 | ((element.m_CEs_[0] >> 8) & 0xFFFF00)
4116 // third byte of primary
4117 | ((element.m_CEs_[1] >> 24) & 0xFF);
4119 // omitting expansion offset in builder
4120 // (HEADER_SIZE_ >> 2)
// general case: emit an expansion CE pointing into the shared list
4121 int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
4122 | (CE_EXPANSION_TAG_ << RuleBasedCollator.CE_TAG_SHIFT_)
4123 | (addExpansion(expansions, element.m_CEs_[0]) << 4)
4126 for (int i = 1; i < element.m_CELength_; i++) {
4127 addExpansion(expansions, element.m_CEs_[i]);
// lengths up to 0xF are stored inline; longer ones are 0-terminated
4129 if (element.m_CELength_ <= 0xF) {
4130 expansion |= element.m_CELength_;
4132 addExpansion(expansions, 0);
4134 element.m_mapCE_ = expansion;
// record the worst-case expansion size for the final CE
4135 setMaxExpansion(element.m_CEs_[element.m_CELength_ - 1],
4136 (byte) element.m_CELength_, t.m_maxExpansions_);
/**
 * Adds closures for sequences of multiple combining marks: for each
 * previously saved precomposed char it tries (1) decomposing it and
 * appending the current mark, then (2) appending the precomposed char to
 * the current decomposition; any result that recomposes to a single char
 * gets an element added to the build table and is itself saved for
 * further rounds.
 *
 * @param t build table being populated
 * @param colEl collation element iterator over the tailored collator
 * @param element scratch element (reused and overwritten)
 * @param precompCh saved precomposed characters (appended to in place)
 * @param precompClass combining classes parallel to precompCh
 * @param maxComp number of saved precomposed characters on entry
 * @param cMark current combining mark
 * @param cmPos index of the current mark in t.cmLookup.cPoints
 * @return updated number of saved precomposed characters
 */
4140 private int addMultiCMontractions(BuildTable t,
4141 CollationElementIterator colEl, Elements element, char[] precompCh,
4142 int[] precompClass, int maxComp, char cMark, int cmPos,
4145 CombinClassTable cmLookup = t.cmLookup;
4146 char[] combiningMarks = { cMark };
4147 int cMarkClass = UCharacter.getCombiningClass(cMark) & 0xFF;
4148 String comMark = new String(combiningMarks);
4149 int noOfPrecomposedChs = maxComp;
4151 for (int j = 0; j < maxComp; j++) {
4156 String newDecomp, comp;
// round 0: decompose the saved precomposed char and append the mark;
// round 1 (elided here) appends the precomposed char to the decomposition
4158 if (count == 0) { // Decompose the saved precomposed char.
4159 newDecomp = Normalizer.decompose(
4160 new String(precompCh, j, 1), false);
4161 temp = new StringBuilder(newDecomp);
4162 temp.append(cmLookup.cPoints[cmPos]);
4163 newDecomp = temp.toString();
4165 temp = new StringBuilder(decomp);
4166 temp.append(precompCh[j]);
4167 newDecomp = temp.toString();
4169 comp = Normalizer.compose(newDecomp, false);
// only sequences that recompose to a single char are of interest
4170 if (comp.length() == 1) {
4172 element.m_cPoints_ = temp.toString();
4173 element.m_CELength_ = 0;
4174 element.m_prefix_ = 0;
4175 Elements prefix = t.m_prefixLookup_.get(element);
4176 element.m_cPoints_ = comp + comMark;
4177 if (prefix == null) {
// no prefix entry: collect CEs from the decomposed sequence
4178 element.m_prefix_ = 0;
4179 element.m_prefixChars_ = null;
4180 colEl.setText(temp.toString());
4181 int ce = colEl.next();
4182 element.m_CELength_ = 0;
4183 while (ce != CollationElementIterator.NULLORDER) {
4184 element.m_CEs_[element.m_CELength_++] = ce;
// prefix entry exists: reuse its single mapped CE
4188 element.m_cPoints_ = comp;
4189 element.m_prefix_ = 0;
4190 element.m_prefixChars_ = null;
4191 element.m_CELength_ = 1;
4192 element.m_CEs_[0] = prefix.m_mapCE_;
4194 setMapCE(t, element);
4195 finalizeAddition(t, element);
// remember the new precomposed char for subsequent rounds
4196 precompCh[noOfPrecomposedChs] = comp.charAt(0);
4197 precompClass[noOfPrecomposedChs] = cMarkClass;
4198 noOfPrecomposedChs++;
// try the second combination only for same-class marks
4200 } while (++count < 2 && (precompClass[j] == cMarkClass));
4202 return noOfPrecomposedChs;
/**
 * Fix for tailored contractions that end in an accented character: adds
 * the NFC form of {@code data} as an element whose CEs are taken from
 * its NFD form (or from an existing prefix mapping), so both normalized
 * forms collate identically.
 *
 * @param t build table being populated
 * @param colEl collation element iterator over the tailored collator
 * @param data contraction string to close over
 * @param element scratch element (reused and overwritten)
 */
4205 private void addFCD4AccentedContractions(BuildTable t,
4206 CollationElementIterator colEl, String data, Elements element) {
4207 String decomp = Normalizer.decompose(data, false);
4208 String comp = Normalizer.compose(data, false);
// look up a prefix entry keyed by the decomposed form
4210 element.m_cPoints_ = decomp;
4211 element.m_CELength_ = 0;
4212 element.m_prefix_ = 0;
4213 Elements prefix = t.m_prefixLookup_.get(element);
4214 if (prefix == null) {
// no prefix entry: collect the CEs of the decomposed form
4215 element.m_cPoints_ = comp;
4216 element.m_prefix_ = 0;
4217 element.m_prefixChars_ = null;
4218 element.m_CELength_ = 0;
4219 colEl.setText(decomp);
4220 int ce = colEl.next();
4221 element.m_CELength_ = 0;
4222 while (ce != CollationElementIterator.NULLORDER) {
4223 element.m_CEs_[element.m_CELength_++] = ce;
4226 addAnElement(t, element);
4230 private void processUCACompleteIgnorables(BuildTable t) {
4231 TrieIterator trieiterator = new TrieIterator(
4232 RuleBasedCollator.UCA_.m_trie_);
4233 RangeValueIterator.Element element = new RangeValueIterator.Element();
4234 while (trieiterator.next(element)) {
4235 int start = element.start;
4236 int limit = element.limit;
4237 if (element.value == 0) {
4238 while (start < limit) {
4239 int CE = t.m_mapping_.getValue(start);
4240 if (CE == CE_NOT_FOUND_) {
4241 m_utilElement_.m_prefix_ = 0;
4242 m_utilElement_.m_uchars_ = UCharacter.toString(start);
4243 m_utilElement_.m_cPoints_ = m_utilElement_.m_uchars_;
4244 m_utilElement_.m_cPointsOffset_ = 0;
4245 m_utilElement_.m_CELength_ = 1;
4246 m_utilElement_.m_CEs_[0] = 0;
4247 addAnElement(t, m_utilElement_);